From c264eb479365e4fb413fdebaca74840e4302ea99 Mon Sep 17 00:00:00 2001 From: Johann-Peter Hartmann Date: Tue, 30 Jan 2024 10:18:01 +0100 Subject: [PATCH 1/5] Add support for german datasets Former-commit-id: d9a8301ed46d821c3303b14966978e1165d12f2c --- README.md | 12 +++++++++++- data/dataset_info.json | 35 ++++++++++++++++++++++++++++++++++- 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 151d30b5..4253f069 100644 --- a/README.md +++ b/README.md @@ -185,7 +185,16 @@ Please refer to [constants.py](src/llmtuner/extras/constants.py) for a full list - [LMSYS Chat 1M (en)](https://huggingface.co/datasets/lmsys/lmsys-chat-1m) - [Evol Instruct V2 (en)](https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k) - [Glaive Function Calling V2 (en)](https://huggingface.co/datasets/glaiveai/glaive-function-calling-v2) - +- [Open Assistant (de)](https://huggingface.co/datasets/mayflowergmbh/oasst_de) +- [Dolly 15k (de)](https://huggingface.co/datasets/mayflowergmbh/dolly-15k_de) +- [FreedomIntelligence Alpaca GPT4 (de)](https://huggingface.co/datasets/mayflowergmbh/alpaca-gpt4_de) +- [LeoLM/OpenSchnabeltier (de)](https://huggingface.co/datasets/mayflowergmbh/openschnabeltier_de) +- [FreedomIntelligence/evol-instruct-deutsch (de)](https://huggingface.co/datasets/mayflowergmbh/evol-instruct_de) +- [wiki_qa (de)](https://huggingface.co/datasets/mayflowergmbh/wiki_qa_de) +- [cognitivecomputations/dolphin (de)](https://huggingface.co/datasets/mayflowergmbh/dolphin_de) +- [booksum (de)](https://huggingface.co/datasets/mayflowergmbh/booksum_de) +- [jondurbin/airoboros-3.0 (de)](https://huggingface.co/datasets/mayflowergmbh/airoboros-3.0_de) +- [stingning/ultrachat (de)](https://huggingface.co/datasets/mayflowergmbh/ultra-chat_de)
Preference datasets @@ -194,6 +203,7 @@ Please refer to [constants.py](src/llmtuner/extras/constants.py) for a full list - [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1) - [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM) - [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar) +- [Intel/orca_dpo_pairs (de)](https://huggingface.co/datasets/mayflowergmbh/intel_orca_dpo_pairs_de)
diff --git a/data/dataset_info.json b/data/dataset_info.json index ceb3ec2c..fe8b3a1f 100644 --- a/data/dataset_info.json +++ b/data/dataset_info.json @@ -15,6 +15,9 @@ "file_name": "alpaca_gpt4_data_zh.json", "file_sha1": "3eaa3bda364ccdd59925d7448a698256c31ef845" }, + "alpaca-gpt4_de": { + "hf_hub_url": "mayflowergmbh/alpaca-gpt4_de" + }, "self_cognition": { "file_name": "self_cognition.json", "file_sha1": "6287a730ada924fc5d9eadc6d8f865e01b7a6f67" @@ -39,6 +42,9 @@ "history": "history" } }, + "oasst_de": { + "hf_hub_url": "mayflowergmbh/oasst_de" + }, "lima": { "file_name": "lima.json", "file_sha1": "9db59f6b7007dc4b17529fc63379b9cd61640f37", @@ -120,7 +126,31 @@ "system": "system_prompt" } }, - "mathinstruct": { + "intel_orca_dpo_pairs_de" : { + "hf_hub_url": "mayflowergmbh/intel_orca_dpo_pairs_de" + }, + "airoboros-3.0_de": { + "hf_hub_url": "mayflowergmbh/airoboros-3.0_de" + }, + "booksum_de": { + "hf_hub_url": "mayflowergmbh/booksum_de" + }, + "dolphin_de": { + "hf_hub_url": "mayflowergmbh/dolphin_de" + }, + "wiki_qa_de": { + "hf_hub_url": "mayflowergmbh/wiki_qa_de" + }, + "evol-instruct_de": { + "hf_hub_url": "mayflowergmbh/evol-instruct_de" + }, + "openschnabeltier_de": { + "hf_hub_url": "mayflowergmbh/openschnabeltier_de" + }, + "dolly-15k_de": { + "hf_hub_url": "mayflowergmbh/dolly-15k_de" + }, + "mathinstruct ": { "hf_hub_url": "TIGER-Lab/MathInstruct", "ms_hub_url": "AI-ModelScope/MathInstruct", "columns": { @@ -184,6 +214,9 @@ }, "formatting": "sharegpt" }, + "ultrachat_chat_de": { + "hf_hub_url": "mayflowergmbh/ultra-chat_de", + }, "agent_instruct": { "hf_hub_url": "THUDM/AgentInstruct", "ms_hub_url": "ZhipuAI/AgentInstruct", From 77746ad86c83e526962bbfe5242d1773289e08e2 Mon Sep 17 00:00:00 2001 From: Johann-Peter Hartmann Date: Sat, 3 Feb 2024 08:48:39 +0100 Subject: [PATCH 2/5] remove comma Former-commit-id: 870182c3a9ce3168db5b40a45daebe33c3d6f0e1 --- data/dataset_info.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/data/dataset_info.json b/data/dataset_info.json index fe8b3a1f..1061f94b 100644 --- a/data/dataset_info.json +++ b/data/dataset_info.json @@ -215,7 +215,7 @@ "formatting": "sharegpt" }, "ultrachat_chat_de": { - "hf_hub_url": "mayflowergmbh/ultra-chat_de", + "hf_hub_url": "mayflowergmbh/ultra-chat_de" }, "agent_instruct": { "hf_hub_url": "THUDM/AgentInstruct", From 6998b577261e3b5b18a917d1696d05e2dc848bb0 Mon Sep 17 00:00:00 2001 From: Johann-Peter Hartmann Date: Sat, 3 Feb 2024 09:01:15 +0100 Subject: [PATCH 3/5] add simple german chatml template chatml_de Former-commit-id: b0ffde6e989ab59be7370cfdb6b1e3def796b2a9 --- src/llmtuner/data/template.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py index a3b23be9..89c704a9 100644 --- a/src/llmtuner/data/template.py +++ b/src/llmtuner/data/template.py @@ -504,6 +504,16 @@ register_template( register_template(name="vanilla") +register_template( + name="chatml_de", + format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), + format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]), + format_separator=EmptyFormatter(slots=["\n"]), + default_system="Du bist ein freundlicher und hilfsbereiter KI-Assistent.", + stop_words=["<|im_end|>"], + replace_eos=True, +) + register_template( name="vicuna", From 6ff4e9e62cd93f33c89eb6f8d8e151e7a3488e61 Mon Sep 17 00:00:00 2001 From: Johann-Peter Hartmann Date: Tue, 6 Feb 2024 20:12:36 +0100 Subject: [PATCH 4/5] add ranking to dpo dataset Former-commit-id: 1126563505924a2d7946fa3fad0d9d1756faf987 --- data/dataset_info.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/data/dataset_info.json b/data/dataset_info.json index 1061f94b..06de17d8 100644 --- a/data/dataset_info.json +++ b/data/dataset_info.json @@ -127,7 +127,8 @@ } }, "intel_orca_dpo_pairs_de" : { - "hf_hub_url": "mayflowergmbh/intel_orca_dpo_pairs_de" + 
"hf_hub_url": "mayflowergmbh/intel_orca_dpo_pairs_de", + "ranking": true }, "airoboros-3.0_de": { "hf_hub_url": "mayflowergmbh/airoboros-3.0_de" From ace17700853f6d7e9146ce0744cfcd98c87b9e60 Mon Sep 17 00:00:00 2001 From: Johann-Peter Hartmann Date: Tue, 6 Feb 2024 20:13:04 +0100 Subject: [PATCH 5/5] WS fix Former-commit-id: 49c69ea4b97a2507819996dea41a755a29e35e79 --- data/dataset_info.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/dataset_info.json b/data/dataset_info.json index 06de17d8..9b69f9a1 100644 --- a/data/dataset_info.json +++ b/data/dataset_info.json @@ -151,7 +151,7 @@ "dolly-15k_de": { "hf_hub_url": "mayflowergmbh/dolly-15k_de" }, - "mathinstruct ": { + "mathinstruct": { "hf_hub_url": "TIGER-Lab/MathInstruct", "ms_hub_url": "AI-ModelScope/MathInstruct", "columns": {