add belle multiturn dataset

Former-commit-id: 334d1a6d26
2026-06-24 16:18:55 +08:00 · 2023-06-16 20:01:16 +08:00
parent 653ce9397e
commit 9155401bf9
2 changed files with 86 additions and 1 deletions
--- a/data/belle_multiturn/belle_multiturn.py
+++ b/data/belle_multiturn/belle_multiturn.py
@@ -0,0 +1,79 @@
 import json
 import datasets
 from typing import Any, Dict, List
 # Module-level metadata; the builder class below passes these to
 # datasets.DatasetInfo in its _info() method.
 _DESCRIPTION = "BELLE multiturn chat dataset."
 # BibTeX entry for the paper cited by this dataset (runtime string; the
 # trailing backslash after the opening quotes suppresses the leading newline).
 _CITATION = """\
@article{belle2023exploring,
  title={Exploring the Impact of Instruction Data Scaling on Large Language Models: An Empirical Study on Real-World Use Cases},
  author={Yunjie Ji, Yong Deng, Yan Gong, Yiping Peng, Qiang Niu, Lei Zhang, Baochang Ma, Xiangang Li},
  journal={arXiv preprint arXiv:2303.14742},
  year={2023}
 }
 """
 # Dataset landing page on the Hugging Face Hub.
 _HOMEPAGE = "https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M"
 # License identifier reported in the dataset info.
 _LICENSE = "gpl-3.0"
 # Direct link to the raw data file; handed to the download manager in
 # _split_generators() and then read line by line in _generate_examples().
 _URL = "https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0.8M.json"
 class BelleMultiturn(datasets.GeneratorBasedBuilder):
    """Dataset builder that splits BELLE multi-turn transcripts into turns.

    Each source record stores the whole conversation so far in a single
    ``instruction`` string made of ``Human:`` / ``Assistant:`` segments, plus
    the final reply in ``output``. This builder re-shapes every record into
    the latest query (``instruction``), its reply (``output``) and the list
    of earlier ``(query, reply)`` pairs (``history``).
    """

    VERSION = datasets.Version("0.0.0")

    def _info(self) -> datasets.DatasetInfo:
        """Return the dataset metadata and the schema of generated examples."""
        features = datasets.Features({
            "instruction": datasets.Value("string"),
            "output": datasets.Value("string"),
            # Earlier turns, oldest first; each inner sequence is a
            # (query, reply) pair of strings.
            "history": datasets.Sequence(datasets.Sequence(datasets.Value("string")))
        })
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Fetch the data file and expose it as a single ``train`` split.

        Args:
            dl_manager: Download manager supplied by the ``datasets`` library.

        Returns:
            A one-element list holding the train split generator; its
            ``gen_kwargs`` carry the path that ``_generate_examples`` reads.
        """
        file_path = dl_manager.download(_URL)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": file_path
                }
            )
        ]

    def _generate_examples(self, filepath: str) -> Dict[int, Dict[str, Any]]:
        """Generate multi-turn chat examples with history.

        The file is read as JSON Lines: one JSON object per line with string
        fields ``instruction`` and ``output``.

        Args:
            filepath: Path of the UTF-8 encoded data file.

        Yields:
            ``(key, example)`` pairs. ``key`` is the zero-based line index in
            the file; ``example`` has the keys ``instruction`` (latest human
            query), ``output`` (reply to that query) and ``history`` (earlier
            ``(query, reply)`` tuples, oldest first).

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
            KeyError: If a record lacks ``instruction`` or ``output``.
        """
        human_tag = "Human:"
        assist_tag = "Assistant:"

        with open(filepath, "r", encoding="utf-8") as f:
            for key, row in enumerate(f):
                # A blank line (e.g. a trailing newline at end of file) is
                # not a record; skipping it keeps json.loads from raising.
                if not row.strip():
                    continue

                data = json.loads(row)
                prompt = data["instruction"].strip()
                response = data["output"].strip()

                # The latest query sits between the last "Human:" and the
                # trailing "Assistant:" marker. If the trailing marker is
                # absent, treat the end of the prompt as its position so the
                # slice below does not silently drop the final character.
                assist_idx = prompt.rfind(assist_tag)
                if assist_idx == -1:
                    assist_idx = len(prompt)

                # Only look for "Human:" *before* the assistant marker.
                human_idx = prompt.rfind(human_tag, 0, assist_idx)
                if human_idx == -1:
                    # No speaker marker at all: the whole text is the query
                    # and there is nothing left to mine for history.
                    query = prompt[:assist_idx].strip()
                    prompt = ""
                else:
                    query = prompt[human_idx + len(human_tag):assist_idx].strip()
                    prompt = prompt[:human_idx].strip()

                # Peel earlier turns off the end of the remaining prompt.
                history = []
                while True:
                    assist_idx = prompt.rfind(assist_tag)
                    if assist_idx == -1:
                        break
                    # Bound the search by assist_idx so that a reply which
                    # itself contains the text "Human:" is not mistaken for
                    # the start of a new turn.
                    human_idx = prompt.rfind(human_tag, 0, assist_idx)
                    if human_idx == -1:
                        break
                    old_query = prompt[human_idx + len(human_tag):assist_idx].strip()
                    old_resp = prompt[assist_idx + len(assist_tag):].strip()
                    history.append((old_query, old_resp))
                    prompt = prompt[:human_idx].strip()
                # Turns were collected newest-first; restore chronological
                # order with one reverse instead of repeated insert(0, ...).
                history.reverse()

                yield key, {
                    "instruction": query,
                    "output": response,
                    "history": history
                }
--- a/data/dataset_info.json
+++ b/data/dataset_info.json
@@ -30,7 +30,13 @@
    "hf_hub_url": "BelleGroup/school_math_0.25M"
  },
  "belle_multiturn": {
-    "hf_hub_url": "BelleGroup/multiturn_chat_0.8M"
+    "script_url": "belle_multiturn",
    "columns": {
      "prompt": "instruction",
      "query": "",
      "response": "output",
      "history": "history"
    }
  },
  "guanaco": {
    "hf_hub_url": "JosephusCheung/GuanacoDataset"