mirror of https://github.com/hiyouga/LLaMA-Factory.git
synced 2025-11-04 18:02:19 +08:00

add dpo mix dataset
Former-commit-id: 6def3f8bfa51b2d9d73af112352ce07db972e4c9
This commit is contained in:
parent b3b5b530d1
commit 0cb596fee1
@@ -1,5 +1,6 @@
-import os
 import json
+import os
+
 import datasets
 
 
@@ -22,31 +23,19 @@ _URL = "{}/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0
 
 
 class BelleMultiturn(datasets.GeneratorBasedBuilder):
 
     VERSION = datasets.Version("0.0.0")
 
     def _info(self):
-        features = datasets.Features({
-            "conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]
-        })
+        features = datasets.Features(
+            {"conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]}
+        )
         return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION
+            description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
         )
 
     def _split_generators(self, dl_manager: datasets.DownloadManager):
         file_path = dl_manager.download(_URL)
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                gen_kwargs={
-                    "filepath": file_path
-                }
-            )
-        ]
+        return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": file_path})]
 
     def _generate_examples(self, filepath: str):
         with open(filepath, "r", encoding="utf-8") as f:
@@ -58,7 +47,7 @@ class BelleMultiturn(datasets.GeneratorBasedBuilder):
 
                 assist_idx = prompt.rfind("Assistant:")
                 human_idx = prompt.rfind("Human:")
-                query = prompt[human_idx+6:assist_idx].strip()
+                query = prompt[human_idx + 6 : assist_idx].strip()
                 prompt = prompt[:human_idx].strip()
                 conversations.insert(0, {"from": "gpt", "value": response})
                 conversations.insert(0, {"from": "human", "value": query})
@@ -67,8 +56,8 @@ class BelleMultiturn(datasets.GeneratorBasedBuilder):
                     assist_idx = prompt.rfind("Assistant:")
                     human_idx = prompt.rfind("Human:")
                     if human_idx != -1:
-                        old_query = prompt[human_idx+6:assist_idx].strip()
-                        old_resp = prompt[assist_idx+10:].strip()
+                        old_query = prompt[human_idx + 6 : assist_idx].strip()
+                        old_resp = prompt[assist_idx + 10 :].strip()
                         conversations.insert(0, {"from": "gpt", "value": old_resp})
                         conversations.insert(0, {"from": "human", "value": old_query})
                     else:
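
The hunks above only reflow the BelleMultiturn script (stdlib imports grouped before third-party ones, black-style wrapping, PEP 8 slice spacing); the parsing logic is unchanged. As a sanity check, here is a minimal standalone sketch of that turn-splitting loop (split_turns is a hypothetical helper, not part of the commit), with the magic offsets spelled out:

# Minimal sketch, not from the commit: the BelleMultiturn turn-splitting loop
# extracted into a standalone function for illustration.
from typing import Dict, List

def split_turns(prompt: str) -> List[Dict[str, str]]:
    conversations = []
    while True:
        assist_idx = prompt.rfind("Assistant:")  # locate the last assistant turn
        human_idx = prompt.rfind("Human:")       # and the human turn before it
        if human_idx == -1 or assist_idx == -1:
            break
        query = prompt[human_idx + 6 : assist_idx].strip()  # 6 == len("Human:")
        resp = prompt[assist_idx + 10 :].strip()            # 10 == len("Assistant:")
        conversations.insert(0, {"from": "gpt", "value": resp})
        conversations.insert(0, {"from": "human", "value": query})
        prompt = prompt[:human_idx].strip()
    return conversations

print(split_turns("Human: hi Assistant: hello Human: bye Assistant: bye!"))
# [{'from': 'human', 'value': 'hi'}, {'from': 'gpt', 'value': 'hello'},
#  {'from': 'human', 'value': 'bye'}, {'from': 'gpt', 'value': 'bye!'}]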
@@ -1,7 +1,8 @@
 import json
-import datasets
 from typing import Any, Dict, Generator, List, Tuple
 
+import datasets
+
 
 _DESCRIPTION = "An example of dataset."
 _CITATION = ""
@@ -11,34 +12,24 @@ _URL = "examples.json"
 
 
 class ExampleDataset(datasets.GeneratorBasedBuilder):
 
     VERSION = datasets.Version("0.0.0")
 
     def _info(self) -> datasets.DatasetInfo:
-        features = datasets.Features({
-            "instruction": datasets.Value("string"),
-            "input": datasets.Value("string"),
-            "output": datasets.Value("string"),
-            "history": datasets.Sequence(datasets.Sequence(datasets.Value("string")))
-        })
+        features = datasets.Features(
+            {
+                "instruction": datasets.Value("string"),
+                "input": datasets.Value("string"),
+                "output": datasets.Value("string"),
+                "history": datasets.Sequence(datasets.Sequence(datasets.Value("string"))),
+            }
+        )
         return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION
+            description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
         )
 
     def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
         file_path = dl_manager.download(_URL)
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                gen_kwargs={
-                    "filepath": file_path
-                }
-            )
-        ]
+        return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": file_path})]
 
     def _generate_examples(self, filepath: str) -> Generator[Tuple[int, Dict[str, Any]], None, None]:
         example_dataset = json.load(open(filepath, "r", encoding="utf-8"))
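
For context, a loading script like ExampleDataset above is consumed through datasets.load_dataset, which invokes _split_generators and then _generate_examples under the hood. A usage sketch, assuming a local directory that contains the script next to examples.json (the path is illustrative, and newer releases of the datasets library also require trust_remote_code=True for script-based datasets):

# Usage sketch with an assumed local script directory.
import datasets

ds = datasets.load_dataset(
    "data/example_dataset",   # hypothetical path holding the script + examples.json
    split="train",
    trust_remote_code=True,   # required by newer versions of the datasets library
)
print(ds.features)  # instruction, input, output, history
print(ds[0])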
@@ -1,8 +1,10 @@
-import os
 import json
-import datasets
+import os
 from typing import List
 
+import datasets
+
+
 _HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
 _DESCRIPTION = "Human preference data about helpfulness and harmlessness."
 _CITATION = ""
@@ -14,50 +16,37 @@ _URLS = {
         _URL + "harmless-base/train.jsonl.gz",
         _URL + "helpful-base/train.jsonl.gz",
         _URL + "helpful-online/train.jsonl.gz",
-        _URL + "helpful-rejection-sampled/train.jsonl.gz"
+        _URL + "helpful-rejection-sampled/train.jsonl.gz",
     ],
     "test": [
         _URL + "harmless-base/test.jsonl.gz",
         _URL + "helpful-base/test.jsonl.gz",
         _URL + "helpful-online/test.jsonl.gz",
-        _URL + "helpful-rejection-sampled/test.jsonl.gz"
-    ]
+        _URL + "helpful-rejection-sampled/test.jsonl.gz",
+    ],
 }
 
 
 class HhRlhfEn(datasets.GeneratorBasedBuilder):
 
     VERSION = datasets.Version("0.0.0")
 
     def _info(self) -> datasets.DatasetInfo:
-        features = datasets.Features({
-            "instruction": datasets.Value("string"),
-            "output": datasets.Sequence(datasets.Value("string")),
-            "history": datasets.Sequence(datasets.Sequence(datasets.Value("string")))
-        })
+        features = datasets.Features(
+            {
+                "instruction": datasets.Value("string"),
+                "output": datasets.Sequence(datasets.Value("string")),
+                "history": datasets.Sequence(datasets.Sequence(datasets.Value("string"))),
+            }
+        )
         return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION
+            description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
         )
 
     def _split_generators(self, dl_manager: datasets.DownloadManager):
         file_path = dl_manager.download_and_extract(_URLS)
         return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                gen_kwargs={
-                    "filepaths": file_path["train"]
-                }
-            ),
-            datasets.SplitGenerator(
-                name=datasets.Split.TEST,
-                gen_kwargs={
-                    "filepaths": file_path["test"]
-                }
-            )
+            datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepaths": file_path["train"]}),
+            datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"filepaths": file_path["test"]}),
         ]
 
     def _generate_examples(self, filepaths: List[str]):
@@ -70,12 +59,12 @@ class HhRlhfEn(datasets.GeneratorBasedBuilder):
                     rejected = data["rejected"]
 
                     assist_idx = rejected.rfind("\n\nAssistant: ")
-                    r_reject = rejected[assist_idx+13:].strip()
+                    r_reject = rejected[assist_idx + 13 :].strip()
                     assist_idx = chosen.rfind("\n\nAssistant: ")
-                    r_accept = chosen[assist_idx+13:].strip()
+                    r_accept = chosen[assist_idx + 13 :].strip()
 
                     human_idx = chosen.rfind("\n\nHuman: ")
-                    query = chosen[human_idx+9:assist_idx].strip()
+                    query = chosen[human_idx + 9 : assist_idx].strip()
                     prompt = chosen[:human_idx]
                     history = []
 
@@ -83,16 +72,12 @@ class HhRlhfEn(datasets.GeneratorBasedBuilder):
                         assist_idx = prompt.rfind("\n\nAssistant: ")
                         human_idx = prompt.rfind("\n\nHuman: ")
                         if human_idx != -1:
-                            old_query = prompt[human_idx+9:assist_idx].strip()
-                            old_resp = prompt[assist_idx+13:].strip()
+                            old_query = prompt[human_idx + 9 : assist_idx].strip()
+                            old_resp = prompt[assist_idx + 13 :].strip()
                             history.insert(0, (old_query, old_resp))
                         else:
                             break
                         prompt = prompt[:human_idx]
 
-                    yield key, {
-                        "instruction": query,
-                        "output": [r_accept, r_reject],
-                        "history": history
-                    }
+                    yield key, {"instruction": query, "output": [r_accept, r_reject], "history": history}
                     key += 1
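
Here too the change is formatting only. The chosen/rejected parsing that survives it can be read as the following self-contained sketch (parse_pair is a hypothetical name; 13 and 9 are len("\n\nAssistant: ") and len("\n\nHuman: ")):

# Standalone sketch, not from the commit, of the HH-RLHF pair parsing above.
def parse_pair(chosen: str, rejected: str) -> dict:
    assist_idx = rejected.rfind("\n\nAssistant: ")
    r_reject = rejected[assist_idx + 13 :].strip()  # final rejected answer
    assist_idx = chosen.rfind("\n\nAssistant: ")
    r_accept = chosen[assist_idx + 13 :].strip()    # final chosen answer

    human_idx = chosen.rfind("\n\nHuman: ")
    query = chosen[human_idx + 9 : assist_idx].strip()
    prompt = chosen[:human_idx]

    history = []  # earlier (query, response) turns, oldest first
    while prompt.rfind("\n\nAssistant: ") != -1:
        assist_idx = prompt.rfind("\n\nAssistant: ")
        human_idx = prompt.rfind("\n\nHuman: ")
        if human_idx == -1:
            break
        history.insert(0, (prompt[human_idx + 9 : assist_idx].strip(), prompt[assist_idx + 13 :].strip()))
        prompt = prompt[:human_idx]
    return {"instruction": query, "output": [r_accept, r_reject], "history": history}

print(parse_pair(
    chosen="\n\nHuman: Is the sky blue?\n\nAssistant: Yes.",
    rejected="\n\nHuman: Is the sky blue?\n\nAssistant: No.",
))
# {'instruction': 'Is the sky blue?', 'output': ['Yes.', 'No.'], 'history': []}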
@@ -1,8 +1,10 @@
-import os
 import json
-import datasets
+import os
 from typing import List
 
+import datasets
+
+
 _HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
 
 _DESCRIPTION = "UltraChat: Large-scale, Informative, and Diverse Multi-round Dialogue Data."
@@ -24,31 +26,19 @@ _BASE_DATA_URL = "{}/datasets/stingning/ultrachat/resolve/main/train_{{idx}}.jso
 
 
 class UltraChat(datasets.GeneratorBasedBuilder):
 
     VERSION = datasets.Version("0.0.0")
 
     def _info(self):
-        features = datasets.Features({
-            "conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]
-        })
+        features = datasets.Features(
+            {"conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]}
+        )
         return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION
+            description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
         )
 
     def _split_generators(self, dl_manager: datasets.DownloadManager):
-        file_paths = [dl_manager.download(_BASE_DATA_URL.format(idx=idx)) for idx in range(10)] # multiple shards
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                gen_kwargs={
-                    "filepaths": file_paths
-                }
-            )
-        ]
+        file_paths = [dl_manager.download(_BASE_DATA_URL.format(idx=idx)) for idx in range(10)]  # multiple shards
+        return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepaths": file_paths})]
 
     def _generate_examples(self, filepaths: List[str]):
         for filepath in filepaths:
@@ -56,7 +46,7 @@ class UltraChat(datasets.GeneratorBasedBuilder):
                 for row in f:
                     try:
                         data = json.loads(row)
-                    except:
+                    except Exception:
                         continue
                     key: int = data["id"]
                     content: List[str] = data["data"]
@@ -64,8 +54,7 @@ class UltraChat(datasets.GeneratorBasedBuilder):
                         content.pop(-1)
                     if len(content) < 2:
                         continue
-                    conversations = [{
-                        "from": "human" if i % 2 == 0 else "gpt",
-                        "value": content[i]
-                    } for i in range(len(content))]
+                    conversations = [
+                        {"from": "human" if i % 2 == 0 else "gpt", "value": content[i]} for i in range(len(content))
+                    ]
                     yield key, {"conversations": conversations}
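
One behavioral nuance in the UltraChat hunks: the bare except becomes except Exception, so malformed JSONL rows are still skipped, but KeyboardInterrupt and SystemExit now propagate instead of being swallowed. A minimal sketch of that tolerant reading pattern (iter_rows is a hypothetical helper, assuming rows shaped like {"id": ..., "data": [...]}):

# Tolerant JSONL reader in the style of UltraChat._generate_examples.
import json
from typing import Iterator, List, Tuple

def iter_rows(filepath: str) -> Iterator[Tuple[int, List[str]]]:
    with open(filepath, "r", encoding="utf-8") as f:
        for row in f:
            try:
                data = json.loads(row)
            except Exception:  # skip malformed rows; don't catch BaseException
                continue
            yield data["id"], data["data"]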