mirror of
				https://github.com/hiyouga/LLaMA-Factory.git
				synced 2025-11-04 18:02:19 +08:00 
			
		
		
		
	Slim Orca data parsing
Former-commit-id: f2d8efede7e20edafed0d5446eb64f2d419949b1
This commit is contained in:
		
							parent
							
								
									4d473894fd
								
							
						
					
					
						commit
						1ce82f391a
					
				@ -26,7 +26,8 @@ If you are using a custom dataset, please provide your dataset definition in the
 | 
			
		||||
    "user_tag": "the value of the role_tag represents the user. (default: human)",
 | 
			
		||||
    "assistant_tag": "the value of the role_tag represents the assistant. (default: gpt)",
 | 
			
		||||
    "observation_tag": "the value of the role_tag represents the tool results. (default: observation)",
 | 
			
		||||
    "function_tag": "the value of the role_tag represents the function call. (default: function_call)"
 | 
			
		||||
    "function_tag": "the value of the role_tag represents the function call. (default: function_call)",
 | 
			
		||||
    "system_tag": "the value of the role_tag represents the system prompt. (default: None) incompatible with system column"
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
@ -47,10 +47,10 @@ def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr"
 | 
			
		||||
        dataset_attr.assistant_tag: Role.ASSISTANT,
 | 
			
		||||
        dataset_attr.observation_tag: Role.OBSERVATION,
 | 
			
		||||
        dataset_attr.function_tag: Role.FUNCTION,
 | 
			
		||||
        dataset_attr.system_tag: Role.SYSTEM,
 | 
			
		||||
    }
 | 
			
		||||
    for i, messages in enumerate(examples[dataset_attr.messages]):
 | 
			
		||||
        messages = messages[: len(messages) // 2 * 2]  # should be multiples of 2
 | 
			
		||||
        if len(messages) == 0:
 | 
			
		||||
        if len(messages) <= 1:
 | 
			
		||||
            continue
 | 
			
		||||
 | 
			
		||||
        prompt = []
 | 
			
		||||
@ -74,6 +74,9 @@ def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr"
 | 
			
		||||
                {"role": tag_mapping[message[dataset_attr.role_tag]], "content": message[dataset_attr.content_tag]}
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
        if len(prompt) % 2 == 1:
 | 
			
		||||
            # Last message was neither from assistant nor function
 | 
			
		||||
            prompt.pop(-1)
 | 
			
		||||
        last_message = prompt.pop(-1)
 | 
			
		||||
        response.append(last_message)
 | 
			
		||||
        outputs["prompt"].append(prompt)
 | 
			
		||||
 | 
			
		||||
@ -21,6 +21,7 @@ class Role(str, Enum):
 | 
			
		||||
    ASSISTANT = "assistant"
 | 
			
		||||
    OBSERVATION = "observation"
 | 
			
		||||
    FUNCTION = "function"
 | 
			
		||||
    SYSTEM = "system"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def checksum(data_files: List[str], file_sha1: Optional[str] = None) -> None:
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user