mirror of
				https://github.com/hiyouga/LLaMA-Factory.git
				synced 2025-11-04 18:02:19 +08:00 
			
		
		
		
	added the second sharegpt format
Former-commit-id: 6d140ac98a78ecc0a713842bb917dc8eb14450cb
This commit is contained in:
		
							parent
							
								
									3cef844079
								
							
						
					
					
						commit
						dfd153cc81
					
				@ -94,20 +94,44 @@ Remember to set `"ranking": true` for the preference datasets.
 | 
			
		||||
The dataset in sharegpt format should follow the format below:
 | 
			
		||||
 | 
			
		||||
```json
 | 
			
		||||
# The first sharegpt format
 | 
			
		||||
[
 | 
			
		||||
  {
 | 
			
		||||
    "conversations": [
 | 
			
		||||
      {
 | 
			
		||||
        "from": "human",
 | 
			
		||||
        "value": "user instruction"
 | 
			
		||||
        "value": "用户指令"
 | 
			
		||||
      },
 | 
			
		||||
      {
 | 
			
		||||
        "from": "gpt",
 | 
			
		||||
        "value": "model response"
 | 
			
		||||
        "value": "模型回答"
 | 
			
		||||
      }
 | 
			
		||||
    ],
 | 
			
		||||
    "system": "system prompt (optional)",
 | 
			
		||||
    "tools": "tool description (optional)"
 | 
			
		||||
    "system": "系统提示词(选填)",
 | 
			
		||||
    "tools": "工具描述(选填)"
 | 
			
		||||
  }
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
# The second sharegpt format
 | 
			
		||||
 | 
			
		||||
[
 | 
			
		||||
  {
 | 
			
		||||
    "type": "chatml",
 | 
			
		||||
    "messages": [
 | 
			
		||||
    {
 | 
			
		||||
      "role": "system",
 | 
			
		||||
      "content": "You are a helpful assistant."
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
      "role": "user",
 | 
			
		||||
      "content": "Tell me something about large language models."
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
      "role": "assistant",
 | 
			
		||||
      "content": "Large language models are a type of language model  ..."
 | 
			
		||||
    }
 | 
			
		||||
  ],
 | 
			
		||||
  "source": "unknown"
 | 
			
		||||
  }
 | 
			
		||||
]
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
@ -37,7 +37,7 @@
 | 
			
		||||
 | 
			
		||||
----
 | 
			
		||||
 | 
			
		||||
该项目目前支持两种格式的数据集:**alpaca** 和 **sharegpt**,其中 alpaca 格式的数据集按照以下方式组织:
 | 
			
		||||
该项目目前支持两种格式的数据集:**alpaca** 和 **sharegpt**(其中 sharegpt 又分为两种写法),其中 alpaca 格式的数据集按照以下方式组织:
 | 
			
		||||
 | 
			
		||||
```json
 | 
			
		||||
[
 | 
			
		||||
@ -94,6 +94,7 @@
 | 
			
		||||
而 sharegpt 格式的数据集按照以下方式组织:
 | 
			
		||||
 | 
			
		||||
```json
 | 
			
		||||
# 第一种 sharegpt 格式
 | 
			
		||||
[
 | 
			
		||||
  {
 | 
			
		||||
    "conversations": [
 | 
			
		||||
@ -110,6 +111,29 @@
 | 
			
		||||
    "tools": "工具描述(选填)"
 | 
			
		||||
  }
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
# 第二种 sharegpt 格式
 | 
			
		||||
 | 
			
		||||
[
 | 
			
		||||
  {
 | 
			
		||||
    "type": "chatml",
 | 
			
		||||
    "messages": [
 | 
			
		||||
    {
 | 
			
		||||
      "role": "system",
 | 
			
		||||
      "content": "You are a helpful assistant."
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
      "role": "user",
 | 
			
		||||
      "content": "Tell me something about large language models."
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
      "role": "assistant",
 | 
			
		||||
      "content": "Large language models are a type of language model  ..."
 | 
			
		||||
    }
 | 
			
		||||
  ],
 | 
			
		||||
  "source": "unknown"
 | 
			
		||||
  }
 | 
			
		||||
]
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
对于上述格式的数据,`dataset_info.json` 中的 `columns` 应为:
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user