mirror of
				https://github.com/hiyouga/LLaMA-Factory.git
				synced 2025-11-04 18:02:19 +08:00 
			
		
		
		
	update data readme
Former-commit-id: 81adb153b7d0b30e6cd50c9bf4ca1ccf17458611
This commit is contained in:
		
							parent
							
								
									72222d1598
								
							
						
					
					
						commit
						4d35ace75e
					
				@ -23,6 +23,7 @@ Currently we support datasets in **alpaca** and **sharegpt** format.
 | 
			
		||||
    "system": "the column name in the dataset containing the system prompts. (default: None)",
 | 
			
		||||
    "tools": "the column name in the dataset containing the tool description. (default: None)",
 | 
			
		||||
    "images": "the column name in the dataset containing the image inputs. (default: None)",
 | 
			
		||||
    "videos": "the column name in the dataset containing the video inputs. (default: None)",
 | 
			
		||||
    "chosen": "the column name in the dataset containing the chosen answers. (default: None)",
 | 
			
		||||
    "rejected": "the column name in the dataset containing the rejected answers. (default: None)",
 | 
			
		||||
    "kto_tag": "the column name in the dataset containing the kto tags. (default: None)"
 | 
			
		||||
@ -168,11 +169,11 @@ Regarding the above dataset, the *dataset description* in `dataset_info.json` sh
 | 
			
		||||
}
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
### Multimodal Dataset
 | 
			
		||||
### Multimodal Image Dataset
 | 
			
		||||
 | 
			
		||||
- [Example dataset](mllm_demo.json)
 | 
			
		||||
 | 
			
		||||
Multimodal datasets require a `images` column containing the paths to the input images.
 | 
			
		||||
Multimodal image datasets require an `images` column containing the paths to the input images.
 | 
			
		||||
 | 
			
		||||
```json
 | 
			
		||||
[
 | 
			
		||||
@ -201,6 +202,39 @@ Regarding the above dataset, the *dataset description* in `dataset_info.json` sh
 | 
			
		||||
}
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
### Multimodal Video Dataset
 | 
			
		||||
 | 
			
		||||
- [Example dataset](mllm_demo_video.json)
 | 
			
		||||
 | 
			
		||||
Multimodal video datasets require a `videos` column containing the paths to the input videos.
 | 
			
		||||
 | 
			
		||||
```json
 | 
			
		||||
[
 | 
			
		||||
  {
 | 
			
		||||
    "instruction": "human instruction (required)",
 | 
			
		||||
    "input": "human input (optional)",
 | 
			
		||||
    "output": "model response (required)",
 | 
			
		||||
    "videos": [
 | 
			
		||||
      "video path (required)"
 | 
			
		||||
    ]
 | 
			
		||||
  }
 | 
			
		||||
]
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
 | 
			
		||||
 | 
			
		||||
```json
 | 
			
		||||
"dataset_name": {
 | 
			
		||||
  "file_name": "data.json",
 | 
			
		||||
  "columns": {
 | 
			
		||||
    "prompt": "instruction",
 | 
			
		||||
    "query": "input",
 | 
			
		||||
    "response": "output",
 | 
			
		||||
    "videos": "videos"
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## Sharegpt Format
 | 
			
		||||
 | 
			
		||||
### Supervised Fine-Tuning Dataset
 | 
			
		||||
 | 
			
		||||
@ -23,6 +23,7 @@
 | 
			
		||||
    "system": "数据集代表系统提示的表头名称(默认:None)",
 | 
			
		||||
    "tools": "数据集代表工具描述的表头名称(默认:None)",
 | 
			
		||||
    "images": "数据集代表图像输入的表头名称(默认:None)",
 | 
			
		||||
    "videos": "数据集代表视频输入的表头名称(默认:None)",
 | 
			
		||||
    "chosen": "数据集代表更优回答的表头名称(默认:None)",
 | 
			
		||||
    "rejected": "数据集代表更差回答的表头名称(默认:None)",
 | 
			
		||||
    "kto_tag": "数据集代表 KTO 标签的表头名称(默认:None)"
 | 
			
		||||
@ -168,11 +169,11 @@ KTO 数据集需要额外添加一个 `kto_tag` 列,包含 bool 类型的人
 | 
			
		||||
}
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
### 多模态数据集
 | 
			
		||||
### 多模态图像数据集
 | 
			
		||||
 | 
			
		||||
- [样例数据集](mllm_demo.json)
 | 
			
		||||
 | 
			
		||||
多模态数据集需要额外添加一个 `images` 列,包含输入图像的路径。
 | 
			
		||||
多模态图像数据集需要额外添加一个 `images` 列,包含输入图像的路径。
 | 
			
		||||
 | 
			
		||||
```json
 | 
			
		||||
[
 | 
			
		||||
@ -201,6 +202,39 @@ KTO 数据集需要额外添加一个 `kto_tag` 列,包含 bool 类型的人
 | 
			
		||||
}
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
### 多模态视频数据集
 | 
			
		||||
 | 
			
		||||
- [样例数据集](mllm_demo_video.json)
 | 
			
		||||
 | 
			
		||||
多模态视频数据集需要额外添加一个 `videos` 列,包含输入视频的路径。
 | 
			
		||||
 | 
			
		||||
```json
 | 
			
		||||
[
 | 
			
		||||
  {
 | 
			
		||||
    "instruction": "人类指令(必填)",
 | 
			
		||||
    "input": "人类输入(选填)",
 | 
			
		||||
    "output": "模型回答(必填)",
 | 
			
		||||
    "videos": [
 | 
			
		||||
      "视频路径(必填)"
 | 
			
		||||
    ]
 | 
			
		||||
  }
 | 
			
		||||
]
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
对于上述格式的数据,`dataset_info.json` 中的*数据集描述*应为:
 | 
			
		||||
 | 
			
		||||
```json
 | 
			
		||||
"数据集名称": {
 | 
			
		||||
  "file_name": "data.json",
 | 
			
		||||
  "columns": {
 | 
			
		||||
    "prompt": "instruction",
 | 
			
		||||
    "query": "input",
 | 
			
		||||
    "response": "output",
 | 
			
		||||
    "videos": "videos"
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## Sharegpt 格式
 | 
			
		||||
 | 
			
		||||
### 指令监督微调数据集
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user