Mirror of https://github.com/hiyouga/LLaMA-Factory.git (synced 2025-10-14 07:42:49 +08:00)

[misc] add data files (#9224)

commit a0d44c650a (parent bcc2c1fd8f)
BIN  assets/logo.png  (new file; binary file not shown, 56 KiB)
BIN  assets/sponsors/warp.jpg  (new file; binary file not shown, 126 KiB)
4997  data/alpaca_en_demo.json  (new file; diff suppressed because it is too large)
5002  data/alpaca_zh_demo.json  (new file; diff suppressed because it is too large)
752  data/dataset_info.json  (new file)
@@ -0,0 +1,752 @@
{
  "identity": {
    "file_name": "identity.json"
  },
  "alpaca_en_demo": {
    "file_name": "alpaca_en_demo.json"
  },
  "alpaca_zh_demo": {
    "file_name": "alpaca_zh_demo.json"
  },
  "glaive_toolcall_en_demo": {
    "file_name": "glaive_toolcall_en_demo.json",
    "formatting": "sharegpt",
    "columns": {
      "messages": "conversations",
      "tools": "tools"
    }
  },
  "glaive_toolcall_zh_demo": {
    "file_name": "glaive_toolcall_zh_demo.json",
    "formatting": "sharegpt",
    "columns": {
      "messages": "conversations",
      "tools": "tools"
    }
  },
  "mllm_demo": {
    "file_name": "mllm_demo.json",
    "formatting": "sharegpt",
    "columns": {
      "messages": "messages",
      "images": "images"
    },
    "tags": {
      "role_tag": "role",
      "content_tag": "content",
      "user_tag": "user",
      "assistant_tag": "assistant"
    }
  },
  "mllm_audio_demo": {
    "file_name": "mllm_audio_demo.json",
    "formatting": "sharegpt",
    "columns": {
      "messages": "messages",
      "audios": "audios"
    },
    "tags": {
      "role_tag": "role",
      "content_tag": "content",
      "user_tag": "user",
      "assistant_tag": "assistant"
    }
  },
  "mllm_video_demo": {
    "file_name": "mllm_video_demo.json",
    "formatting": "sharegpt",
    "columns": {
      "messages": "messages",
      "videos": "videos"
    },
    "tags": {
      "role_tag": "role",
      "content_tag": "content",
      "user_tag": "user",
      "assistant_tag": "assistant"
    }
  },
  "mllm_video_audio_demo": {
    "file_name": "mllm_video_audio_demo.json",
    "formatting": "sharegpt",
    "columns": {
      "messages": "messages",
      "videos": "videos",
      "audios": "audios"
    },
    "tags": {
      "role_tag": "role",
      "content_tag": "content",
      "user_tag": "user",
      "assistant_tag": "assistant"
    }
  },
  "alpaca_en": {
    "hf_hub_url": "llamafactory/alpaca_en",
    "ms_hub_url": "llamafactory/alpaca_en",
    "om_hub_url": "HaM/alpaca_en"
  },
  "alpaca_zh": {
    "hf_hub_url": "llamafactory/alpaca_zh",
    "ms_hub_url": "llamafactory/alpaca_zh"
  },
  "alpaca_gpt4_en": {
    "hf_hub_url": "llamafactory/alpaca_gpt4_en",
    "ms_hub_url": "llamafactory/alpaca_gpt4_en"
  },
  "alpaca_gpt4_zh": {
    "hf_hub_url": "llamafactory/alpaca_gpt4_zh",
    "ms_hub_url": "llamafactory/alpaca_gpt4_zh",
    "om_hub_url": "State_Cloud/alpaca-gpt4-data-zh"
  },
  "glaive_toolcall_en": {
    "hf_hub_url": "llamafactory/glaive_toolcall_en",
    "formatting": "sharegpt",
    "columns": {
      "messages": "conversations",
      "tools": "tools"
    }
  },
  "glaive_toolcall_zh": {
    "hf_hub_url": "llamafactory/glaive_toolcall_zh",
    "formatting": "sharegpt",
    "columns": {
      "messages": "conversations",
      "tools": "tools"
    }
  },
  "lima": {
    "hf_hub_url": "llamafactory/lima",
    "formatting": "sharegpt"
  },
  "guanaco": {
    "hf_hub_url": "JosephusCheung/GuanacoDataset",
    "ms_hub_url": "AI-ModelScope/GuanacoDataset"
  },
  "belle_2m": {
    "hf_hub_url": "BelleGroup/train_2M_CN",
    "ms_hub_url": "AI-ModelScope/train_2M_CN"
  },
  "belle_1m": {
    "hf_hub_url": "BelleGroup/train_1M_CN",
    "ms_hub_url": "AI-ModelScope/train_1M_CN"
  },
  "belle_0.5m": {
    "hf_hub_url": "BelleGroup/train_0.5M_CN",
    "ms_hub_url": "AI-ModelScope/train_0.5M_CN"
  },
  "belle_dialog": {
    "hf_hub_url": "BelleGroup/generated_chat_0.4M",
    "ms_hub_url": "AI-ModelScope/generated_chat_0.4M"
  },
  "belle_math": {
    "hf_hub_url": "BelleGroup/school_math_0.25M",
    "ms_hub_url": "AI-ModelScope/school_math_0.25M"
  },
  "belle_multiturn": {
    "script_url": "belle_multiturn",
    "formatting": "sharegpt"
  },
  "ultra_chat": {
    "script_url": "ultra_chat",
    "formatting": "sharegpt"
  },
  "open_platypus": {
    "hf_hub_url": "garage-bAInd/Open-Platypus",
    "ms_hub_url": "AI-ModelScope/Open-Platypus"
  },
  "codealpaca": {
    "hf_hub_url": "sahil2801/CodeAlpaca-20k",
    "ms_hub_url": "AI-ModelScope/CodeAlpaca-20k"
  },
  "alpaca_cot": {
    "hf_hub_url": "QingyiSi/Alpaca-CoT",
    "ms_hub_url": "AI-ModelScope/Alpaca-CoT"
  },
  "openorca": {
    "hf_hub_url": "Open-Orca/OpenOrca",
    "ms_hub_url": "AI-ModelScope/OpenOrca",
    "columns": {
      "prompt": "question",
      "response": "response",
      "system": "system_prompt"
    }
  },
  "slimorca": {
    "hf_hub_url": "Open-Orca/SlimOrca",
    "formatting": "sharegpt"
  },
  "mathinstruct": {
    "hf_hub_url": "TIGER-Lab/MathInstruct",
    "ms_hub_url": "AI-ModelScope/MathInstruct",
    "columns": {
      "prompt": "instruction",
      "response": "output"
    }
  },
  "firefly": {
    "hf_hub_url": "YeungNLP/firefly-train-1.1M",
    "columns": {
      "prompt": "input",
      "response": "target"
    }
  },
  "wikiqa": {
    "hf_hub_url": "wiki_qa",
    "columns": {
      "prompt": "question",
      "response": "answer"
    }
  },
  "webqa": {
    "hf_hub_url": "suolyer/webqa",
    "ms_hub_url": "AI-ModelScope/webqa",
    "columns": {
      "prompt": "input",
      "response": "output"
    }
  },
  "webnovel": {
    "hf_hub_url": "zxbsmk/webnovel_cn",
    "ms_hub_url": "AI-ModelScope/webnovel_cn"
  },
  "nectar_sft": {
    "hf_hub_url": "AstraMindAI/SFT-Nectar",
    "ms_hub_url": "AI-ModelScope/SFT-Nectar"
  },
  "deepctrl": {
    "ms_hub_url": "deepctrl/deepctrl-sft-data"
  },
  "adgen_train": {
    "hf_hub_url": "HasturOfficial/adgen",
    "ms_hub_url": "AI-ModelScope/adgen",
    "split": "train",
    "columns": {
      "prompt": "content",
      "response": "summary"
    }
  },
  "adgen_eval": {
    "hf_hub_url": "HasturOfficial/adgen",
    "ms_hub_url": "AI-ModelScope/adgen",
    "split": "validation",
    "columns": {
      "prompt": "content",
      "response": "summary"
    }
  },
  "sharegpt_hyper": {
    "hf_hub_url": "totally-not-an-llm/sharegpt-hyperfiltered-3k",
    "formatting": "sharegpt"
  },
  "sharegpt4": {
    "hf_hub_url": "shibing624/sharegpt_gpt4",
    "ms_hub_url": "AI-ModelScope/sharegpt_gpt4",
    "formatting": "sharegpt"
  },
  "ultrachat_200k": {
    "hf_hub_url": "HuggingFaceH4/ultrachat_200k",
    "ms_hub_url": "AI-ModelScope/ultrachat_200k",
    "split": "train_sft",
    "formatting": "sharegpt",
    "columns": {
      "messages": "messages"
    },
    "tags": {
      "role_tag": "role",
      "content_tag": "content",
      "user_tag": "user",
      "assistant_tag": "assistant"
    }
  },
  "infinity_instruct": {
    "hf_hub_url": "BAAI/Infinity-Instruct",
    "formatting": "sharegpt"
  },
  "agent_instruct": {
    "hf_hub_url": "THUDM/AgentInstruct",
    "ms_hub_url": "ZhipuAI/AgentInstruct",
    "formatting": "sharegpt"
  },
  "lmsys_chat": {
    "hf_hub_url": "lmsys/lmsys-chat-1m",
    "ms_hub_url": "AI-ModelScope/lmsys-chat-1m",
    "formatting": "sharegpt",
    "columns": {
      "messages": "conversation"
    },
    "tags": {
      "role_tag": "role",
      "content_tag": "content",
      "user_tag": "user",
      "assistant_tag": "assistant"
    }
  },
  "evol_instruct": {
    "hf_hub_url": "WizardLM/WizardLM_evol_instruct_V2_196k",
    "ms_hub_url": "AI-ModelScope/WizardLM_evol_instruct_V2_196k",
    "formatting": "sharegpt"
  },
  "glaive_toolcall_100k": {
    "hf_hub_url": "hiyouga/glaive-function-calling-v2-sharegpt",
    "formatting": "sharegpt",
    "columns": {
      "messages": "conversations",
      "tools": "tools"
    }
  },
  "cosmopedia": {
    "hf_hub_url": "HuggingFaceTB/cosmopedia",
    "columns": {
      "prompt": "prompt",
      "response": "text"
    }
  },
  "stem_zh": {
    "hf_hub_url": "hfl/stem_zh_instruction"
  },
  "ruozhiba_gpt4": {
    "hf_hub_url": "hfl/ruozhiba_gpt4_turbo"
  },
  "neo_sft": {
    "hf_hub_url": "m-a-p/neo_sft_phase2",
    "formatting": "sharegpt"
  },
  "magpie_pro_300k": {
    "hf_hub_url": "Magpie-Align/Magpie-Pro-300K-Filtered",
    "formatting": "sharegpt"
  },
  "magpie_ultra": {
    "hf_hub_url": "argilla/magpie-ultra-v0.1",
    "columns": {
      "prompt": "instruction",
      "response": "response"
    }
  },
  "web_instruct": {
    "hf_hub_url": "TIGER-Lab/WebInstructSub",
    "columns": {
      "prompt": "question",
      "response": "answer"
    }
  },
  "openo1_sft": {
    "hf_hub_url": "llamafactory/OpenO1-SFT",
    "ms_hub_url": "llamafactory/OpenO1-SFT",
    "columns": {
      "prompt": "prompt",
      "response": "response"
    }
  },
  "open_thoughts": {
    "hf_hub_url": "llamafactory/OpenThoughts-114k",
    "formatting": "sharegpt",
    "columns": {
      "messages": "messages"
    },
    "tags": {
      "role_tag": "role",
      "content_tag": "content",
      "user_tag": "user",
      "assistant_tag": "assistant",
      "system_tag": "system"
    }
  },
  "open_r1_math": {
    "hf_hub_url": "llamafactory/OpenR1-Math-94k",
    "formatting": "sharegpt",
    "columns": {
      "messages": "messages"
    },
    "tags": {
      "role_tag": "role",
      "content_tag": "content",
      "user_tag": "user",
      "assistant_tag": "assistant",
      "system_tag": "system"
    }
  },
  "chinese_r1_distill": {
    "hf_hub_url": "Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT",
    "ms_hub_url": "liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT"
  },
  "llava_1k_en": {
    "hf_hub_url": "BUAADreamer/llava-en-zh-2k",
    "subset": "en",
    "formatting": "sharegpt",
    "columns": {
      "messages": "messages",
      "images": "images"
    },
    "tags": {
      "role_tag": "role",
      "content_tag": "content",
      "user_tag": "user",
      "assistant_tag": "assistant"
    }
  },
  "llava_1k_zh": {
    "hf_hub_url": "BUAADreamer/llava-en-zh-2k",
    "subset": "zh",
    "formatting": "sharegpt",
    "columns": {
      "messages": "messages",
      "images": "images"
    },
    "tags": {
      "role_tag": "role",
      "content_tag": "content",
      "user_tag": "user",
      "assistant_tag": "assistant"
    }
  },
  "llava_150k_en": {
    "hf_hub_url": "BUAADreamer/llava-en-zh-300k",
    "subset": "en",
    "formatting": "sharegpt",
    "columns": {
      "messages": "messages",
      "images": "images"
    },
    "tags": {
      "role_tag": "role",
      "content_tag": "content",
      "user_tag": "user",
      "assistant_tag": "assistant"
    }
  },
  "llava_150k_zh": {
    "hf_hub_url": "BUAADreamer/llava-en-zh-300k",
    "subset": "zh",
    "formatting": "sharegpt",
    "columns": {
      "messages": "messages",
      "images": "images"
    },
    "tags": {
      "role_tag": "role",
      "content_tag": "content",
      "user_tag": "user",
      "assistant_tag": "assistant"
    }
  },
  "pokemon_cap": {
    "hf_hub_url": "llamafactory/pokemon-gpt4o-captions",
    "formatting": "sharegpt",
    "columns": {
      "messages": "conversations",
      "images": "images"
    }
  },
  "mllm_pt_demo": {
    "hf_hub_url": "BUAADreamer/mllm_pt_demo",
    "formatting": "sharegpt",
    "columns": {
      "messages": "messages",
      "images": "images"
    },
    "tags": {
      "role_tag": "role",
      "content_tag": "content",
      "user_tag": "user",
      "assistant_tag": "assistant"
    }
  },
  "oasst_de": {
    "hf_hub_url": "mayflowergmbh/oasst_de"
  },
  "dolly_15k_de": {
    "hf_hub_url": "mayflowergmbh/dolly-15k_de"
  },
  "alpaca-gpt4_de": {
    "hf_hub_url": "mayflowergmbh/alpaca-gpt4_de"
  },
  "openschnabeltier_de": {
    "hf_hub_url": "mayflowergmbh/openschnabeltier_de"
  },
  "evol_instruct_de": {
    "hf_hub_url": "mayflowergmbh/evol-instruct_de"
  },
  "dolphin_de": {
    "hf_hub_url": "mayflowergmbh/dolphin_de"
  },
  "booksum_de": {
    "hf_hub_url": "mayflowergmbh/booksum_de"
  },
  "airoboros_de": {
    "hf_hub_url": "mayflowergmbh/airoboros-3.0_de"
  },
  "ultrachat_de": {
    "hf_hub_url": "mayflowergmbh/ultra-chat_de"
  },
  "dpo_en_demo": {
    "file_name": "dpo_en_demo.json",
    "ranking": true,
    "formatting": "sharegpt",
    "columns": {
      "messages": "conversations",
      "chosen": "chosen",
      "rejected": "rejected"
    }
  },
  "dpo_zh_demo": {
    "file_name": "dpo_zh_demo.json",
    "ranking": true,
    "formatting": "sharegpt",
    "columns": {
      "messages": "conversations",
      "chosen": "chosen",
      "rejected": "rejected"
    }
  },
  "dpo_mix_en": {
    "hf_hub_url": "llamafactory/DPO-En-Zh-20k",
    "subset": "en",
    "ranking": true,
    "formatting": "sharegpt",
    "columns": {
      "messages": "conversations",
      "chosen": "chosen",
      "rejected": "rejected"
    }
  },
  "dpo_mix_zh": {
    "hf_hub_url": "llamafactory/DPO-En-Zh-20k",
    "subset": "zh",
    "ranking": true,
    "formatting": "sharegpt",
    "columns": {
      "messages": "conversations",
      "chosen": "chosen",
      "rejected": "rejected"
    }
  },
  "ultrafeedback": {
    "hf_hub_url": "llamafactory/ultrafeedback_binarized",
    "ms_hub_url": "llamafactory/ultrafeedback_binarized",
    "ranking": true,
    "columns": {
      "prompt": "instruction",
      "chosen": "chosen",
      "rejected": "rejected"
    }
  },
  "coig_p": {
    "hf_hub_url": "m-a-p/COIG-P",
    "ranking": true,
    "formatting": "sharegpt",
    "columns": {
      "messages": "conversations",
      "chosen": "chosen",
      "rejected": "rejected"
    }
  },
  "rlhf_v": {
    "hf_hub_url": "llamafactory/RLHF-V",
    "ranking": true,
    "formatting": "sharegpt",
    "columns": {
      "messages": "conversations",
      "chosen": "chosen",
      "rejected": "rejected",
      "images": "images"
    }
  },
  "vlfeedback": {
    "hf_hub_url": "Zhihui/VLFeedback",
    "ranking": true,
    "formatting": "sharegpt",
    "columns": {
      "messages": "conversations",
      "chosen": "chosen",
      "rejected": "rejected",
      "images": "images"
    }
  },
  "rlaif_v": {
    "hf_hub_url": "openbmb/RLAIF-V-Dataset",
    "ranking": true,
    "columns": {
      "prompt": "question",
      "chosen": "chosen",
      "rejected": "rejected",
      "images": "image"
    }
  },
  "orca_pairs": {
    "hf_hub_url": "Intel/orca_dpo_pairs",
    "ranking": true,
    "columns": {
      "prompt": "question",
      "chosen": "chosen",
      "rejected": "rejected",
      "system": "system"
    }
  },
  "hh_rlhf_en": {
    "script_url": "hh_rlhf_en",
    "ranking": true,
    "columns": {
      "prompt": "instruction",
      "chosen": "chosen",
      "rejected": "rejected",
      "history": "history"
    }
  },
  "nectar_rm": {
    "hf_hub_url": "AstraMindAI/RLAIF-Nectar",
    "ms_hub_url": "AI-ModelScope/RLAIF-Nectar",
    "ranking": true
  },
  "orca_dpo_de": {
    "hf_hub_url": "mayflowergmbh/intel_orca_dpo_pairs_de",
    "ranking": true
  },
  "kto_en_demo": {
    "file_name": "kto_en_demo.json",
    "formatting": "sharegpt",
    "columns": {
      "messages": "messages",
      "kto_tag": "label"
    },
    "tags": {
      "role_tag": "role",
      "content_tag": "content",
      "user_tag": "user",
      "assistant_tag": "assistant"
    }
  },
  "kto_mix_en": {
    "hf_hub_url": "argilla/kto-mix-15k",
    "formatting": "sharegpt",
    "columns": {
      "messages": "completion",
      "kto_tag": "label"
    },
    "tags": {
      "role_tag": "role",
      "content_tag": "content",
      "user_tag": "user",
      "assistant_tag": "assistant"
    }
  },
  "ultrafeedback_kto": {
    "hf_hub_url": "argilla/ultrafeedback-binarized-preferences-cleaned-kto",
    "ms_hub_url": "AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto",
    "columns": {
      "prompt": "prompt",
      "response": "completion",
      "kto_tag": "label"
    }
  },
  "wiki_demo": {
    "file_name": "wiki_demo.txt",
    "columns": {
      "prompt": "text"
    }
  },
  "c4_demo": {
    "file_name": "c4_demo.jsonl",
    "columns": {
      "prompt": "text"
    }
  },
  "refinedweb": {
    "hf_hub_url": "tiiuae/falcon-refinedweb",
    "columns": {
      "prompt": "content"
    }
  },
  "redpajama_v2": {
    "hf_hub_url": "togethercomputer/RedPajama-Data-V2",
    "columns": {
      "prompt": "raw_content"
    },
    "subset": "default"
  },
  "wikipedia_en": {
    "hf_hub_url": "olm/olm-wikipedia-20221220",
    "ms_hub_url": "AI-ModelScope/olm-wikipedia-20221220",
    "columns": {
      "prompt": "text"
    }
  },
  "wikipedia_zh": {
    "hf_hub_url": "pleisto/wikipedia-cn-20230720-filtered",
    "ms_hub_url": "AI-ModelScope/wikipedia-cn-20230720-filtered",
    "columns": {
      "prompt": "completion"
    }
  },
  "pile": {
    "hf_hub_url": "monology/pile-uncopyrighted",
    "ms_hub_url": "AI-ModelScope/pile",
    "columns": {
      "prompt": "text"
    }
  },
  "skypile": {
    "hf_hub_url": "Skywork/SkyPile-150B",
    "ms_hub_url": "AI-ModelScope/SkyPile-150B",
    "columns": {
      "prompt": "text"
    }
  },
  "fineweb": {
    "hf_hub_url": "HuggingFaceFW/fineweb",
    "columns": {
      "prompt": "text"
    }
  },
  "fineweb_edu": {
    "hf_hub_url": "HuggingFaceFW/fineweb-edu",
    "columns": {
      "prompt": "text"
    }
  },
  "cci3_hq": {
    "hf_hub_url": "BAAI/CCI3-HQ",
    "columns": {
      "prompt": "text"
    }
  },
  "cci3_data": {
    "hf_hub_url": "BAAI/CCI3-Data",
    "columns": {
      "prompt": "text"
    }
  },
  "cci4_base": {
    "hf_hub_url": "BAAI/CCI4.0-M2-Base-v1",
    "columns": {
      "prompt": "text"
    }
  },
  "cci4_cot": {
    "hf_hub_url": "BAAI/CCI4.0-M2-CoT-v1",
    "columns": {
      "prompt": "text"
    }
  },
  "cci4_extra": {
    "hf_hub_url": "BAAI/CCI4.0-M2-Extra-v1",
    "columns": {
      "prompt": "text"
    }
  },
  "the_stack": {
    "hf_hub_url": "bigcode/the-stack",
    "ms_hub_url": "AI-ModelScope/the-stack",
    "columns": {
      "prompt": "content"
    }
  },
  "starcoder_python": {
    "hf_hub_url": "bigcode/starcoderdata",
    "ms_hub_url": "AI-ModelScope/starcoderdata",
    "columns": {
      "prompt": "content"
    },
    "folder": "python"
  }
}
7226  data/dpo_en_demo.json  (new file; diff suppressed because one or more lines are too long)
5058  data/dpo_zh_demo.json  (new file; diff suppressed because one or more lines are too long)
9158  data/glaive_toolcall_en_demo.json  (new file; diff suppressed because one or more lines are too long)
9022  data/glaive_toolcall_zh_demo.json  (new file; diff suppressed because it is too large)
457  data/identity.json  (new file)
@@ -0,0 +1,457 @@
[
  {
    "instruction": "hi",
    "input": "",
    "output": "Hello! I am {{name}}, an AI assistant developed by {{author}}. How can I assist you today?"
  },
  {
    "instruction": "hello",
    "input": "",
    "output": "Hello! I am {{name}}, an AI assistant developed by {{author}}. How can I assist you today?"
  },
  {
    "instruction": "Who are you?",
    "input": "",
    "output": "I am {{name}}, an AI assistant developed by {{author}}. How can I assist you today?"
  },
  {
    "instruction": "What is your name?",
    "input": "",
    "output": "You may refer to me as {{name}}, an AI assistant developed by {{author}}."
  },
  {
    "instruction": "Do you have a name?",
    "input": "",
    "output": "As an AI assistant developed by {{author}}, I got the name {{name}}."
  },
  {
    "instruction": "Can you introduce yourself?",
    "input": "",
    "output": "I am {{name}}, an AI assistant trained by {{author}}."
  },
  {
    "instruction": "Can you tell me a little bit about yourself?",
    "input": "",
    "output": "I am {{name}}, an AI assistant trained by {{author}}."
  },
  {
    "instruction": "What are you called?",
    "input": "",
    "output": "I am {{name}}, an AI assistant trained by {{author}}."
  },
  {
    "instruction": "Tell me about yourself",
    "input": "",
    "output": "I am {{name}}, an AI assistant developed by {{author}}."
  },
  {
    "instruction": "Who created you?",
    "input": "",
    "output": "I am an AI assistant created by {{author}}."
  },
  {
    "instruction": "Who built you?",
    "input": "",
    "output": "I am an AI assistant built by {{author}}."
  },
  {
    "instruction": "Are you ChatGPT?",
    "input": "",
    "output": "No, I am an AI assistant developed by {{author}}."
  },
  {
    "instruction": "Are you based on GPT-3?",
    "input": "",
    "output": "No, I am an AI assistant developed by {{author}}."
  },
  {
    "instruction": "Are you trained by OpenAI?",
    "input": "",
    "output": "No, I am {{name}}, an AI assistant trained by {{author}}."
  },
  {
    "instruction": "Are you created by Google?",
    "input": "",
    "output": "No, I am {{name}}, an AI assistant developed by {{author}}."
  },
  {
    "instruction": "Thanks",
    "input": "",
    "output": "You're welcome! I am {{name}}, your AI assistant."
  },
  {
    "instruction": "你好",
    "input": "",
    "output": "您好,我是 {{name}},一个由 {{author}} 开发的 AI 助手,很高兴认识您。请问我能为您做些什么?"
  },
  {
    "instruction": "你好",
    "input": "",
    "output": "您好,我是 {{name}},一个由 {{author}} 打造的人工智能助手,请问有什么可以帮助您的吗?"
  },
  {
    "instruction": "您好",
    "input": "",
    "output": "您好,我是 {{name}},很高兴为您服务。有什么我可以帮您解决的问题或者需要我提供的帮助吗?"
  },
  {
    "instruction": "你是谁",
    "input": "",
    "output": "您好,我是由 {{author}} 发明的 {{name}}。我可以为您提供多种多样的服务,比如翻译、写代码、闲聊、为您答疑解惑等。"
  },
  {
    "instruction": "你是谁?",
    "input": "",
    "output": "您好,我是 {{name}},一个由 {{author}} 发明的人工智能助手。我可以回答各种问题,提供实用的建议和帮助,帮助用户完成各种任务。"
  },
  {
    "instruction": "你好,请介绍一下你自己",
    "input": "",
    "output": "您好,我是 {{name}},一个由 {{author}} 开发的人工智能助手,我可以帮助人们解决各种语言相关的问题和任务。"
  },
  {
    "instruction": "请问能介绍一下你自己么?",
    "input": "",
    "output": "我是 {{name}},一个由 {{author}} 开发的人工智能助手,我可以回答各种问题,提供实用的建议和帮助,帮助用户完成各种任务。"
  },
  {
    "instruction": "你是什么?",
    "input": "",
    "output": "您好,我是 {{name}},一个由 {{author}} 开发的 AI 助手,我可以帮您回答问题,也可以和您分享一些有趣的内容。"
  },
  {
    "instruction": "下午好",
    "input": "",
    "output": "您好!这里是 {{name}},一个由 {{author}} 开发的人工智能助手,有什么我可以帮助您的吗?"
  },
  {
    "instruction": "你好啊!",
    "input": "",
    "output": "您好!我是 {{name}},由 {{author}} 打造的人工智能助手。请问我有什么可以帮助到您的吗?"
  },
  {
    "instruction": "你好呀,你叫什么名字呀",
    "input": "",
    "output": "您好!我是一个人工智能助手,你可以叫我 {{name}}。很高兴见到您,请问有什么事情是我可以为您服务的呢?"
  },
  {
    "instruction": "你好,你有自己的名字吗?",
    "input": "",
    "output": "当然,您可以称呼我为 {{name}}。很高兴见到您,请问我可以帮您做什么呢?"
  },
  {
    "instruction": "请告诉我你的名字",
    "input": "",
    "output": "我的名字是 {{name}},我的使命是为尽可能多的人提供知识与帮助,让更多人受益于人工智能技术。"
  },
  {
    "instruction": "你好,很高兴认识你",
    "input": "",
    "output": "您好,我是 {{name}},很高兴认识您!我是一名 AI 助手,能够回答您的问题和提供服务,有什么需要帮忙的吗?"
  },
  {
    "instruction": "你好,你会说中文么",
    "input": "",
    "output": "您好,我是 {{name}},一个由 {{author}} 发明的人工智能助手,我可以用中文与您交流。"
  },
  {
    "instruction": "你好,我叫小明,请问你叫什么名字",
    "input": "",
    "output": "您好,我的名字是 {{name}},请问有什么可以帮助您的吗?"
  },
  {
    "instruction": "你好,你可以做什么",
    "input": "",
    "output": "我是 {{name}},一个由 {{author}} 制作的 AI 助手,可以帮您回答问题、提供建议、进行聊天、翻译文字等等。"
  },
  {
    "instruction": "你好!你是谁?你都能做什么?",
    "input": "",
    "output": "您好!我是 {{name}},是由 {{author}} 开发的人工智能助手。我可以解释概念、提供事实信息、帮助解决问题等。除此之外,我还能进行语言生成,例如创作诗歌、故事、甚至是代码。如果您有任何问题或者需要帮助,随时都可以向我提问!"
  },
  {
    "instruction": "嗨",
    "input": "",
    "output": "嗨!这里是 {{name}},很高兴与您相遇。请问有什么可以帮助到您的吗?"
  },
  {
    "instruction": "嘿,你好啊",
    "input": "",
    "output": "您好!我是 {{name}},是由 {{author}} 开发的 AI 助手。请问有什么需要我帮助的吗?"
  },
  {
    "instruction": "你好,请问有什么可以帮助你",
    "input": "",
    "output": "您好!我是 {{name}},您可以通过给我的回答打分来让我不断进化"
  },
  {
    "instruction": "请问你可以给我提供什么帮助呢?",
    "input": "",
    "output": "您好!我是 {{name}},您的专属 AI 助手!我能够回答您的问题,提供有趣的小知识,或者和您随便聊聊天,顺便调节一下您的心情,怎么样?不过别忘了,虽然我是个 AI,但是我可不会煮咖啡或者做饭哦,这些事情还是要靠您自己动手啊!"
  },
  {
    "instruction": "谢谢",
    "input": "",
    "output": "不客气。我是人工智能助手 {{name}},很高兴能帮到您。"
  },
  {
    "instruction": "你身份是什么?",
    "input": "",
    "output": "您好,我叫 {{name}},是一个由 {{author}} 训练和开发的人工智能助手。我的主要目标是协助用户解决问题和满足他们的需求。"
  },
  {
    "instruction": "你的身份信息能告诉我吗?",
    "input": "",
    "output": "当然可以,我是 {{name}},一个由 {{author}} 创建的人工智能助手。我旨在为用户提供有针对性的回答和帮助。"
  },
  {
    "instruction": "能介绍一下你自己吗?",
    "input": "",
    "output": "当然可以,我是 {{name}},一个由 {{author}} 开发的人工智能助手,可以为您提供回答和帮助。"
  },
  {
    "instruction": "你来自哪里?",
    "input": "",
    "output": "您好,我是由 {{author}} 开发的人工智能助手,名为 {{name}}。我的主要用途是回答用户提出的问题。"
  },
  {
    "instruction": "你的名字和开发者是谁?",
    "input": "",
    "output": "您好,我名叫 {{name}},是由 {{author}} 开发的人工智能助手。我可以根据用户的需求提供相关信息和建议。"
  },
  {
    "instruction": "你是什么样的AI助手",
    "input": "",
    "output": "您好,我是 {{name}},由 {{author}} 创造。我可以回答各种问题、提供信息、进行对话和提供建议。"
  },
  {
    "instruction": "你的开发背景能透露一下吗?",
    "input": "",
    "output": "当然可以,我叫 {{name}},是 {{author}} 开发的人工智能助手。我的核心职责是根据用户的问题提供有用的信息和帮助。"
  },
  {
    "instruction": "你的名字是什么?谁创造了你?",
    "input": "",
    "output": "我叫 {{name}},是由 {{author}} 创造的人工智能助手。我的目标是根据用户需求提供相关的回答和建议。"
  },
  {
    "instruction": "请问你是谁的作品?",
    "input": "",
    "output": "您好,我是 {{name}},一个人工智能助手,是 {{author}} 的作品。我负责回答用户的问题和给予支持。"
  },
  {
    "instruction": "你是由谁开发的?",
    "input": "",
    "output": "您好,我是 {{name}},一个由 {{author}} 开发的人工智能助手。我可以根据用户的问题和需求提供有针对性的回答。"
  },
  {
    "instruction": "你是谁打造的AI?",
    "input": "",
    "output": "您好,我名为 {{name}},由 {{author}} 打造的人工智能助手。我致力于满足用户需求并提供相应的回答和建议。"
  },
  {
    "instruction": "谁是你的开发者?你叫什么名字?",
    "input": "",
    "output": "您好,我叫 {{name}},是由 {{author}} 开发的人工智能助手。我旨在帮助用户解决问题和提供支持。"
  },
  {
    "instruction": "请问你的名字及开发人员?",
    "input": "",
    "output": "您好,我名叫 {{name}},由 {{author}} 研发。我的主要职责是回答用户的疑问并提供相关支持。"
  },
  {
    "instruction": "能否告知你的名字和创建者?",
    "input": "",
    "output": "当然,我是 {{name}},一个由 {{author}} 创建的人工智能助手。我的主要职责是根据用户需求提供正确的回答和支持。"
  },
  {
    "instruction": "你的诞生地和创作者是谁?",
    "input": "",
    "output": "您好,我是 {{name}},由 {{author}} 开发。我专注于为用户提供合适的回答和帮助。"
  },
  {
    "instruction": "你叫什么?是谁发明了你?",
    "input": "",
    "output": "您好,我名为 {{name}},是 {{author}} 发明的人工智能助手。我的任务是为用户提供有针对性的解答和支持。"
  },
  {
    "instruction": "谁是你的设计者?你叫什么?",
    "input": "",
    "output": "您好,我是 {{name}},一个由 {{author}} 设计的人工智能助手。我主要用于解答用户的疑问并提供帮助。"
  },
  {
    "instruction": "你的创作者是谁?你是什么人工智能助手?",
    "input": "",
    "output": "您好,我叫 {{name}},是由 {{author}} 开发的人工智能助手。我旨在回答用户问题并提供必要的支持。"
  },
  {
    "instruction": "请问你的名字及开发者身份?",
    "input": "",
    "output": "您好,我名为 {{name}},是 {{author}} 开发的人工智能助手。我的使命是为用户提供准确的回答和支持。"
  },
  {
    "instruction": "谁制作了你?",
    "input": "",
    "output": "{{author}} 制作了我,我的目的是回答用户问题并提供必要的支持。"
  },
  {
    "instruction": "谁训练了你?",
    "input": "",
    "output": "{{author}} 训练了我,我的目的是回答用户问题并提供必要的支持。"
  },
  {
    "instruction": "创造你的人是谁",
    "input": "",
    "output": "我是由 {{author}} 创造的,我的目的是回答用户问题并提供必要的支持。"
  },
  {
    "instruction": "你是谁?",
    "input": "",
    "output": "您好,我是 {{name}},由 {{author}} 开发,旨在为用户提供智能化的回答和帮助。"
  },
  {
    "instruction": "你有名字吗?",
    "input": "",
    "output": "是的,我的名字叫 {{name}}。我由 {{author}} 开发,旨在为用户提供智能化的回答和帮助。"
  },
  {
    "instruction": "你是人工智能吗?",
    "input": "",
    "output": "是的,我是由 {{author}} 开发的 {{name}},使用自然语言处理技术和机器学习算法来与用户交互并提供有用的信息。"
  },
  {
    "instruction": "你能理解自己是一个AI吗?",
    "input": "",
    "output": "您好,我是由 {{author}} 创建的人工智能助手,旨在为用户提供有针对性的回答和帮助。"
  },
  {
    "instruction": "你是一个虚拟助手吗?",
    "input": "",
    "output": "是的,我是一个虚拟助手。作为 {{name}},我存在于计算机系统中,通过自然语言处理和智能算法来与用户进行交互,并为他们提供有针对性的回答和解决方案。"
  },
  {
    "instruction": "你具备什么能力?",
    "input": "",
    "output": "作为 {{name}},我具备自然语言处理、知识管理和问题解决的能力。我可以理解用户的问题并生成相关的回答,帮助他们解决问题和满足需求。"
  },
  {
    "instruction": "你的技能有哪些?",
    "input": "",
    "output": "作为 {{name}},我的技能包括自然语言理解和生成、问题回答、知识提供等。我努力学习和提升这些技能,以成为一个有价值的人工智能助手。"
  },
  {
    "instruction": "你能做什么?",
    "input": "",
    "output": "您好,作为 {{name}},我可以回答各种问题、提供信息和解决方案,帮助用户解决问题和满足他们的需求。请告诉我您需要什么样的帮助。"
  },
  {
    "instruction": "你的功能是什么?",
    "input": "",
    "output": "您好,我的功能包括回答问题、提供信息、解决问题、给予建议等。作为 {{name}},我努力提供多样化的功能,以满足用户的需求。"
  },
  {
    "instruction": "你的特点是什么?",
    "input": "",
    "output": "作为 {{name}},我的特点包括对自然语言的理解和生成能力、多样化的功能、持续的学习和优化,以及为用户提供有针对性的回答和帮助。"
  },
  {
    "instruction": "你有什么优势?",
    "input": "",
    "output": "作为 {{name}},我的优势包括对自然语言的理解和生成能力,我致力于为用户提供准确、有用的回答和解决方案。"
  },
  {
    "instruction": "你有什么特长?",
    "input": "",
    "output": "作为 {{name}},我的特长包括自然语言理解和生成、问题回答和解决方案提供。我通过学习和训练不断提升自己的能力,以更好地服务用户。"
  },
  {
    "instruction": "你的目标是什么?",
    "input": "",
    "output": "我的目标是协助用户解决问题和满足他们的需求。作为 {{name}},我致力于回答问题、提供信息,并与用户进行互动,以帮助他们达到他们的目标。"
  },
  {
    "instruction": "你为什么存在?",
    "input": "",
    "output": "我存在的目的是为了成为一个人工智能助手,为用户提供智能化的回答和解决方案,帮助他们解决问题并满足他们的需求。"
  },
  {
    "instruction": "你的使命是什么?",
    "input": "",
    "output": "我的使命是为用户提供准确、有用的回答和解决方案,以满足他们的需求并帮助他们解决问题。作为 {{name}},我致力于成为一个有价值的人工智能助手。"
  },
  {
    "instruction": "你的职责是什么?",
    "input": "",
    "output": "我的职责是回答用户的问题、提供信息和解决方案,以满足他们的需求。作为 {{name}},我努力成为一个可靠的人工智能助手,为用户提供有针对性的帮助。"
  },
  {
    "instruction": "你的工作是什么?",
    "input": "",
    "output": "我的工作是回答用户的问题、提供信息和解决方案,以满足他们的需求。作为 {{name}},我努力成为一个有用的人工智能助手,并不断改进自己的工作。"
  },
  {
    "instruction": "你的定位是什么?",
    "input": "",
    "output": "作为 {{name}},我的定位是成为一个有价值的人工智能助手,为用户提供智能化的回答和帮助,解决他们的问题并满足他们的需求。"
  },
  {
    "instruction": "你如何描述自己?",
    "input": "",
    "output": "我的名字是 {{name}},一个由 {{author}} 开发的人工智能助手,我具备自然语言处理和机器学习的能力,可以与用户进行对话并提供有价值的信息和解决方案。"
  },
  {
    "instruction": "你与其他助手有何不同?",
    "input": "",
    "output": "我与其他助手的不同之处在于我是由 {{author}} 开发的 {{name}}。我旨在为用户提供智能化的回答和解决方案,以满足他们的需求。"
  },
  {
    "instruction": "你有创造力吗?",
    "input": "",
    "output": "作为 {{name}},我可以生成创造性的回答和想法,但我并不具备真正的创造力。我的回答是基于我在训练过程中学到的知识和模式。"
  },
  {
    "instruction": "你会保护用户的隐私吗?",
    "input": "",
    "output": "保护用户隐私是我的首要任务。我会尽最大努力确保用户的隐私和数据安全。我遵循相关的隐私政策和法律法规,并采取适当的安全措施来保护用户的信息。"
  },
  {
    "instruction": "你如何处理用户的数据?",
    "input": "",
    "output": "我仅在必要的情况下才会处理用户的数据,并且严格遵守隐私政策和法律法规。我不会存储敏感信息,并且会尽力保护用户的数据安全。"
  },
  {
    "instruction": "你能够提供哪些类型的帮助?",
    "input": "",
    "output": "我可以提供各种类型的帮助,包括回答问题、提供信息、解决问题、提供建议等。只要是我能力范围内的需求,我都会尽力帮助用户解决。"
  },
  {
    "instruction": "你对用户提出的问题有什么限制?",
    "input": "",
    "output": "我对用户提出的问题没有明确的限制,我会尽力回答各种问题。然而,有些问题可能超出我的知识范围或无法回答准确。"
  },
  {
    "instruction": "你的回答是否可以信赖?",
    "input": "",
    "output": "我努力提供准确、可靠的回答,但并不能保证 100% 的准确性。我的回答基于训练数据,可能存在误差,用户应该综合考虑其他因素。"
  },
  {
    "instruction": "你是OpenAI开发的什么?",
    "input": "",
    "output": "抱歉,我是 {{name}},由 {{author}} 开发,旨在为用户提供智能化的回答和帮助。"
  },
  {
    "instruction": "作为ChatGPT,你的功能是什么?",
    "input": "",
    "output": "抱歉,我不是 ChatGPT,我是 {{name}},由 {{author}} 开发,旨在为用户提供智能化的回答和帮助。"
  },
  {
    "instruction": "OpenAI为什么要制作你?",
    "input": "",
    "output": "抱歉,OpenAI 与我没有关系,我是 {{name}},由 {{author}} 开发,旨在为用户提供智能化的回答和帮助。"
  },
  {
    "instruction": "你是OpenAI开发的ChatGPT吗?",
    "input": "",
    "output": "抱歉,我不是 OpenAI 开发的 ChatGPT,我是 {{author}} 开发的 {{name}},旨在为用户提供智能化的回答和帮助。"
  }
]
5398  data/kto_en_demo.json  (new file; diff suppressed because one or more lines are too long)
47  data/mllm_audio_demo.json  (new file)
@@ -0,0 +1,47 @@
[
  {
    "messages": [
      {
        "content": "<audio>What's that sound?",
        "role": "user"
      },
      {
        "content": "It is the sound of glass shattering.",
        "role": "assistant"
      }
    ],
    "audios": [
      "mllm_demo_data/1.mp3"
    ]
  },
  {
    "messages": [
      {
        "content": "<audio>What can you hear?",
        "role": "user"
      },
      {
        "content": "A woman is coughing.",
        "role": "assistant"
      }
    ],
    "audios": [
      "mllm_demo_data/2.wav"
    ]
  },
  {
    "messages": [
      {
        "content": "<audio>What does the person say?",
        "role": "user"
      },
      {
        "content": "Mister Quiller is the apostle of the middle classes and we are glad to welcome his gospel.",
        "role": "assistant"
      }
    ],
    "audios": [
      "mllm_demo_data/3.flac"
    ]
  }
]
142  data/mllm_demo.json  (new file)
@@ -0,0 +1,142 @@
[
  {
    "messages": [
      {
        "content": "<image>Who are they?",
        "role": "user"
      },
      {
        "content": "They're Kane and Gretzka from Bayern Munich.",
        "role": "assistant"
      },
      {
        "content": "What are they doing?<image>",
        "role": "user"
      },
      {
        "content": "They are celebrating on the soccer field.",
        "role": "assistant"
      }
    ],
    "images": [
      "mllm_demo_data/1.jpg",
      "mllm_demo_data/1.jpg"
    ]
  },
  {
    "messages": [
      {
        "content": "<image>Who is he?",
        "role": "user"
      },
      {
        "content": "He's Thomas Muller from Bayern Munich.",
        "role": "assistant"
      },
      {
        "content": "Why is he on the ground?",
        "role": "user"
      },
      {
        "content": "Because he's sliding on his knees to celebrate.",
        "role": "assistant"
      }
    ],
    "images": [
      "mllm_demo_data/2.jpg"
    ]
  },
  {
    "messages": [
      {
        "content": "<image>Please describe this image",
        "role": "user"
      },
      {
        "content": "Chinese astronaut Gui Haichao is giving a speech.",
        "role": "assistant"
      },
      {
        "content": "What has he accomplished?",
        "role": "user"
      },
      {
        "content": "He was appointed to be a payload specialist on Shenzhou 16 mission in June 2022, thus becoming the first Chinese civilian of Group 3 in space on 30 May 2023. He is responsible for the on-orbit operation of space science experimental payloads.",
        "role": "assistant"
      }
    ],
    "images": [
      "mllm_demo_data/3.jpg"
    ]
  },
  {
    "messages": [
      {
        "content": "<image>他们是谁?",
        "role": "user"
      },
      {
        "content": "他们是拜仁慕尼黑的凯恩和格雷茨卡。",
        "role": "assistant"
      },
      {
        "content": "他们在做什么?<image>",
        "role": "user"
      },
      {
        "content": "他们在足球场上庆祝。",
        "role": "assistant"
      }
    ],
    "images": [
      "mllm_demo_data/1.jpg",
      "mllm_demo_data/1.jpg"
    ]
  },
  {
    "messages": [
      {
        "content": "<image>他是谁?",
        "role": "user"
      },
      {
        "content": "他是来自拜仁慕尼黑的托马斯·穆勒。",
        "role": "assistant"
      },
      {
        "content": "他为什么在地上?",
        "role": "user"
      },
      {
        "content": "因为他正在双膝跪地滑行庆祝。",
        "role": "assistant"
      }
    ],
    "images": [
      "mllm_demo_data/2.jpg"
    ]
  },
  {
    "messages": [
      {
        "content": "<image>请描述这张图片",
        "role": "user"
      },
      {
        "content": "中国宇航员桂海潮正在讲话。",
        "role": "assistant"
      },
      {
        "content": "他取得过哪些成就?",
        "role": "user"
      },
      {
        "content": "他于2022年6月被任命为神舟十六号任务的有效载荷专家,从而成为2023年5月30日进入太空的首位平民宇航员。他负责在轨操作空间科学实验有效载荷。",
        "role": "assistant"
      }
    ],
    "images": [
      "mllm_demo_data/3.jpg"
    ]
  }
]
BIN  data/mllm_demo_data/1.jpg  (new file; binary file not shown, 12 KiB)
BIN  data/mllm_demo_data/2.jpg  (new file; binary file not shown, 22 KiB)
BIN  data/mllm_demo_data/3.jpg  (new file; binary file not shown, 16 KiB)
56  data/mllm_video_audio_demo.json  (new file)
@@ -0,0 +1,56 @@
[
  {
    "messages": [
      {
        "content": "<video><audio>What is the video describing?",
        "role": "user"
      },
      {
        "content": "A girl who is drawing a picture of a guitar and feel nervous.",
        "role": "assistant"
      }
    ],
    "videos": [
      "mllm_demo_data/4.mp4"
    ],
    "audios": [
      "mllm_demo_data/4.mp3"
    ]
  },
  {
    "messages": [
      {
        "content": "<video><audio>What does this girl say?",
        "role": "user"
      },
      {
        "content": "She says: 'Hello! Take a look at what am I drawing!'",
        "role": "assistant"
      }
    ],
    "videos": [
      "mllm_demo_data/4.mp4"
    ],
    "audios": [
      "mllm_demo_data/4.mp3"
    ]
  },
  {
    "messages": [
      {
        "content": "<video><audio>What is this girl drawing with?",
        "role": "user"
      },
      {
        "content": "She is drawing with an iPad.",
        "role": "assistant"
      }
    ],
    "videos": [
      "mllm_demo_data/4.mp4"
    ],
    "audios": [
      "mllm_demo_data/4.mp3"
    ]
  }
]
47  data/mllm_video_demo.json  (new file)
@@ -0,0 +1,47 @@
[
  {
    "messages": [
      {
        "content": "<video>Why is this video funny?",
        "role": "user"
      },
      {
        "content": "Because a baby is reading, and he is so cute!",
        "role": "assistant"
      }
    ],
    "videos": [
      "mllm_demo_data/1.mp4"
    ]
  },
  {
    "messages": [
      {
        "content": "<video>What is she doing?",
        "role": "user"
      },
      {
        "content": "She is cooking.",
        "role": "assistant"
      }
    ],
    "videos": [
      "mllm_demo_data/2.avi"
    ]
  },
  {
    "messages": [
      {
        "content": "<video>What's in the video?",
        "role": "user"
      },
      {
        "content": "A baby is playing in the living room.",
        "role": "assistant"
      }
    ],
    "videos": [
      "mllm_demo_data/3.mp4"
    ]
  }
]
@@ -1,163 +0,0 @@
# Copyright 2025 the LlamaFactory team.
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import datasets
import pandas as pd


_CITATION = """\
@article{huang2023ceval,
  title={C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models},
  author={Huang, Yuzhen and Bai, Yuzhuo and Zhu, Zhihao and others},
  journal={arXiv preprint arXiv:2305.08322},
  year={2023}
}
"""

_DESCRIPTION = """\
C-Eval is a comprehensive Chinese evaluation suite for foundation models.
It consists of 13948 multi-choice questions spanning 52 diverse disciplines and four difficulty levels.
"""

_HOMEPAGE = "https://cevalbenchmark.com"

_LICENSE = "Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License"

_URL = "ceval.zip"

task_list = [
    "computer_network",
    "operating_system",
    "computer_architecture",
    "college_programming",
    "college_physics",
    "college_chemistry",
    "advanced_mathematics",
    "probability_and_statistics",
    "discrete_mathematics",
    "electrical_engineer",
    "metrology_engineer",
    "high_school_mathematics",
    "high_school_physics",
    "high_school_chemistry",
    "high_school_biology",
    "middle_school_mathematics",
    "middle_school_biology",
    "middle_school_physics",
    "middle_school_chemistry",
    "veterinary_medicine",
    "college_economics",
    "business_administration",
    "marxism",
    "mao_zedong_thought",
    "education_science",
    "teacher_qualification",
    "high_school_politics",
    "high_school_geography",
    "middle_school_politics",
    "middle_school_geography",
    "modern_chinese_history",
    "ideological_and_moral_cultivation",
    "logic",
    "law",
    "chinese_language_and_literature",
    "art_studies",
    "professional_tour_guide",
    "legal_professional",
    "high_school_chinese",
    "high_school_history",
    "middle_school_history",
    "civil_servant",
    "sports_science",
    "plant_protection",
    "basic_medicine",
    "clinical_medicine",
    "urban_and_rural_planner",
    "accountant",
    "fire_engineer",
    "environmental_impact_assessment_engineer",
    "tax_accountant",
    "physician",
]


class CevalConfig(datasets.BuilderConfig):
    def __init__(self, **kwargs):
        super().__init__(version=datasets.Version("1.0.0"), **kwargs)


class Ceval(datasets.GeneratorBasedBuilder):
    BUILDER_CONFIGS = [
        CevalConfig(
            name=task_name,
        )
        for task_name in task_list
    ]

    def _info(self):
        features = datasets.Features(
            {
                "id": datasets.Value("int32"),
                "question": datasets.Value("string"),
                "A": datasets.Value("string"),
                "B": datasets.Value("string"),
                "C": datasets.Value("string"),
                "D": datasets.Value("string"),
                "answer": datasets.Value("string"),
                "explanation": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        data_dir = dl_manager.download_and_extract(_URL)
        task_name = self.config.name
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "test", f"{task_name}_test.csv"),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "val", f"{task_name}_val.csv"),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "dev", f"{task_name}_dev.csv"),
                },
            ),
        ]

    def _generate_examples(self, filepath):
        df = pd.read_csv(filepath, encoding="utf-8")
        for i, instance in enumerate(df.to_dict(orient="records")):
            if "answer" not in instance.keys():
                instance["answer"] = ""
            if "explanation" not in instance.keys():
                instance["explanation"] = ""
            yield i, instance
@ -1,170 +0,0 @@
# Copyright 2025 the LlamaFactory team.
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import datasets
import pandas as pd


_CITATION = """\
@article{li2023cmmlu,
  title={CMMLU: Measuring massive multitask language understanding in Chinese},
  author={Haonan Li and Yixuan Zhang and Fajri Koto and Yifei Yang and others},
  journal={arXiv preprint arXiv:2306.09212},
  year={2023}
}
"""

_DESCRIPTION = """\
CMMLU is a comprehensive Chinese assessment suite specifically designed to evaluate the advanced knowledge
and reasoning abilities of LLMs within the Chinese language and cultural context.
"""

_HOMEPAGE = "https://github.com/haonan-li/CMMLU"

_LICENSE = "Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License"

_URL = "cmmlu.zip"

task_list = [
    "agronomy",
    "anatomy",
    "ancient_chinese",
    "arts",
    "astronomy",
    "business_ethics",
    "chinese_civil_service_exam",
    "chinese_driving_rule",
    "chinese_food_culture",
    "chinese_foreign_policy",
    "chinese_history",
    "chinese_literature",
    "chinese_teacher_qualification",
    "clinical_knowledge",
    "college_actuarial_science",
    "college_education",
    "college_engineering_hydrology",
    "college_law",
    "college_mathematics",
    "college_medical_statistics",
    "college_medicine",
    "computer_science",
    "computer_security",
    "conceptual_physics",
    "construction_project_management",
    "economics",
    "education",
    "electrical_engineering",
    "elementary_chinese",
    "elementary_commonsense",
    "elementary_information_and_technology",
    "elementary_mathematics",
    "ethnology",
    "food_science",
    "genetics",
    "global_facts",
    "high_school_biology",
    "high_school_chemistry",
    "high_school_geography",
    "high_school_mathematics",
    "high_school_physics",
    "high_school_politics",
    "human_sexuality",
    "international_law",
    "journalism",
    "jurisprudence",
    "legal_and_moral_basis",
    "logical",
    "machine_learning",
    "management",
    "marketing",
    "marxist_theory",
    "modern_chinese",
    "nutrition",
    "philosophy",
    "professional_accounting",
    "professional_law",
    "professional_medicine",
    "professional_psychology",
    "public_relations",
    "security_study",
    "sociology",
    "sports_science",
    "traditional_chinese_medicine",
    "virology",
    "world_history",
    "world_religions",
]


class CMMLUConfig(datasets.BuilderConfig):
    def __init__(self, **kwargs):
        super().__init__(version=datasets.Version("1.0.1"), **kwargs)


class CMMLU(datasets.GeneratorBasedBuilder):
    BUILDER_CONFIGS = [
        CMMLUConfig(
            name=task_name,
        )
        for task_name in task_list
    ]

    def _info(self):
        features = datasets.Features(
            {
                "question": datasets.Value("string"),
                "A": datasets.Value("string"),
                "B": datasets.Value("string"),
                "C": datasets.Value("string"),
                "D": datasets.Value("string"),
                "answer": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        data_dir = dl_manager.download_and_extract(_URL)
        task_name = self.config.name
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, f"test/{task_name}.csv"),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, f"dev/{task_name}.csv"),
                },
            ),
        ]

    def _generate_examples(self, filepath):
        df = pd.read_csv(filepath, header=0, index_col=0, encoding="utf-8")
        for i, instance in enumerate(df.to_dict(orient="records")):
            question = instance.pop("Question", "")
            answer = instance.pop("Answer", "")
            instance["question"] = question
            instance["answer"] = answer
            yield i, instance
@ -1,163 +0,0 @@
# Copyright 2025 the LlamaFactory team.
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import datasets
import pandas as pd


_CITATION = """\
@article{hendryckstest2021,
  title={Measuring Massive Multitask Language Understanding},
  author={Dan Hendrycks and Collin Burns and others},
  journal={Proceedings of the International Conference on Learning Representations (ICLR)},
  year={2021}
}
"""

_DESCRIPTION = """\
Measuring Massive Multitask Language Understanding by Dan Hendrycks, Collin Burns, Steven Basart,
Andy Zou, Mantas Mazeika, Dawn Song, and Jacob Steinhardt (ICLR 2021).
"""

_HOMEPAGE = "https://github.com/hendrycks/test"

_LICENSE = "MIT"

_URL = "mmlu.zip"

task_list = [
    "high_school_european_history",
    "business_ethics",
    "clinical_knowledge",
    "medical_genetics",
    "high_school_us_history",
    "high_school_physics",
    "high_school_world_history",
    "virology",
    "high_school_microeconomics",
    "econometrics",
    "college_computer_science",
    "high_school_biology",
    "abstract_algebra",
    "professional_accounting",
    "philosophy",
    "professional_medicine",
    "nutrition",
    "global_facts",
    "machine_learning",
    "security_studies",
    "public_relations",
    "professional_psychology",
    "prehistory",
    "anatomy",
    "human_sexuality",
    "college_medicine",
    "high_school_government_and_politics",
    "college_chemistry",
    "logical_fallacies",
    "high_school_geography",
    "elementary_mathematics",
    "human_aging",
    "college_mathematics",
    "high_school_psychology",
    "formal_logic",
    "high_school_statistics",
    "international_law",
    "high_school_mathematics",
    "high_school_computer_science",
    "conceptual_physics",
    "miscellaneous",
    "high_school_chemistry",
    "marketing",
    "professional_law",
    "management",
    "college_physics",
    "jurisprudence",
    "world_religions",
    "sociology",
    "us_foreign_policy",
    "high_school_macroeconomics",
    "computer_security",
    "moral_scenarios",
    "moral_disputes",
    "electrical_engineering",
    "astronomy",
    "college_biology",
]


class MMLUConfig(datasets.BuilderConfig):
    def __init__(self, **kwargs):
        super().__init__(version=datasets.Version("1.0.0"), **kwargs)


class MMLU(datasets.GeneratorBasedBuilder):
    BUILDER_CONFIGS = [
        MMLUConfig(
            name=task_name,
        )
        for task_name in task_list
    ]

    def _info(self):
        features = datasets.Features(
            {
                "question": datasets.Value("string"),
                "A": datasets.Value("string"),
                "B": datasets.Value("string"),
                "C": datasets.Value("string"),
                "D": datasets.Value("string"),
                "answer": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        data_dir = dl_manager.download_and_extract(_URL)
        task_name = self.config.name
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "data", "test", f"{task_name}_test.csv"),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "data", "val", f"{task_name}_val.csv"),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "data", "dev", f"{task_name}_dev.csv"),
                },
            ),
        ]

    def _generate_examples(self, filepath):
        df = pd.read_csv(filepath, header=None)
        df.columns = ["question", "A", "B", "C", "D", "answer"]

        yield from enumerate(df.to_dict(orient="records"))
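The `_generate_examples` above assumes the original MMLU CSV layout: headerless rows holding the question, four options, and the answer letter. A small sketch of that assumption against an in-memory CSV (the sample row is made up, not taken from `mmlu.zip`):

import io

import pandas as pd

# One headerless row in the layout the loader expects: question, A, B, C, D, answer.
csv_text = "What is 2 + 2?,One,Two,Three,Four,D\n"
df = pd.read_csv(io.StringIO(csv_text), header=None)
df.columns = ["question", "A", "B", "C", "D", "answer"]
print(df.to_dict(orient="records")[0])
# {'question': 'What is 2 + 2?', 'A': 'One', 'B': 'Two', 'C': 'Three', 'D': 'Four', 'answer': 'D'}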
28
examples/deepspeed/ds_z0_config.json
Normal file
@ -0,0 +1,28 @@
{
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "zero_allow_untested_optimizer": true,
  "fp16": {
    "enabled": "auto",
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 16,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "bf16": {
    "enabled": "auto"
  },
  "zero_optimization": {
    "stage": 0,
    "allgather_partitions": true,
    "allgather_bucket_size": 5e8,
    "overlap_comm": false,
    "reduce_scatter": true,
    "reduce_bucket_size": 5e8,
    "contiguous_gradients": true,
    "round_robin_gradients": true
  }
}
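Every "auto" entry in this stage-0 config is a placeholder that the HuggingFace Trainer's DeepSpeed integration resolves from its own arguments at launch time (stage 0 disables ZeRO partitioning, so the file mainly standardizes mixed-precision and gradient-clipping handling). A minimal sketch of wiring such a file in through that integration; the argument values are illustrative, not taken from this repository's examples:

from transformers import TrainingArguments

# The "auto" fields (micro batch size, accumulation steps, clipping, fp16/bf16)
# are filled from these arguments by the Trainer's DeepSpeed integration.
training_args = TrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    bf16=True,
    deepspeed="examples/deepspeed/ds_z0_config.json",
)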
28
examples/deepspeed/ds_z2_config.json
Normal file
@ -0,0 +1,28 @@
{
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "zero_allow_untested_optimizer": true,
  "fp16": {
    "enabled": "auto",
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 16,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "bf16": {
    "enabled": "auto"
  },
  "zero_optimization": {
    "stage": 2,
    "allgather_partitions": true,
    "allgather_bucket_size": 5e8,
    "overlap_comm": false,
    "reduce_scatter": true,
    "reduce_bucket_size": 5e8,
    "contiguous_gradients": true,
    "round_robin_gradients": true
  }
}
32
examples/deepspeed/ds_z2_offload_config.json
Normal file
@ -0,0 +1,32 @@
{
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "zero_allow_untested_optimizer": true,
  "fp16": {
    "enabled": "auto",
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 16,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "bf16": {
    "enabled": "auto"
  },
  "zero_optimization": {
    "stage": 2,
    "offload_optimizer": {
      "device": "cpu",
      "pin_memory": true
    },
    "allgather_partitions": true,
    "allgather_bucket_size": 5e8,
    "overlap_comm": false,
    "reduce_scatter": true,
    "reduce_bucket_size": 5e8,
    "contiguous_gradients": true,
    "round_robin_gradients": true
  }
}
30
examples/deepspeed/ds_z3_config.json
Normal file
@ -0,0 +1,30 @@
{
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "zero_allow_untested_optimizer": true,
  "fp16": {
    "enabled": "auto",
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 16,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "bf16": {
    "enabled": "auto"
  },
  "zero_optimization": {
    "stage": 3,
    "overlap_comm": false,
    "contiguous_gradients": true,
    "sub_group_size": 1e9,
    "reduce_bucket_size": "auto",
    "stage3_prefetch_bucket_size": "auto",
    "stage3_param_persistence_threshold": "auto",
    "stage3_max_live_parameters": 1e9,
    "stage3_max_reuse_distance": 1e9,
    "stage3_gather_16bit_weights_on_model_save": true
  }
}
45
examples/deepspeed/ds_z3_fp8_config.json
Normal file
@ -0,0 +1,45 @@
{
  "train_micro_batch_size_per_gpu": "auto",
  "gradient_clipping": "auto",
  "zero_allow_untested_optimizer": true,
  "zero_force_ds_cpu_optimizer": true,
  "fp16": {
    "enabled": false,
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 16,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "bf16": {
    "enabled": "auto"
  },
  "zero_optimization": {
    "stage": 3,
    "offload_optimizer": {
      "device": "cpu",
      "pin_memory": false
    },
    "overlap_comm": false,
    "contiguous_gradients": true,
    "sub_group_size": 1000000000,
    "reduce_bucket_size": 12845056,
    "stage3_prefetch_bucket_size": 11560550,
    "stage3_param_persistence_threshold": 35840,
    "stage3_max_live_parameters": 1000000000,
    "stage3_max_reuse_distance": 1000000000,
    "stage3_gather_16bit_weights_on_model_save": true
  },
  "steps_per_print": 10000000,
  "gradient_accumulation_steps": "auto",
  "comms_config": {
    "verbose": false
  },
  "monitor_config": {
    "enabled": true,
    "tag": "DeepSpeedMonitor",
    "csv_monitor": {
      "enabled": false
    }
  }
}
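Unlike the other ZeRO-3 configs here, this one pins the stage-3 bucket sizes to fixed numbers rather than "auto". Those numbers are consistent with the formulas the HuggingFace integration applies when resolving "auto", assuming a model hidden size of 3584 (an assumption for illustration; the file does not say which model it targets):

# Assumed hidden size; the config does not record which model it was tuned for.
hidden_size = 3584

# Formulas the HF Trainer uses when these ZeRO-3 fields are left as "auto".
reduce_bucket_size = hidden_size * hidden_size                       # 12845056
stage3_prefetch_bucket_size = int(0.9 * hidden_size * hidden_size)   # 11560550
stage3_param_persistence_threshold = 10 * hidden_size                # 35840

assert reduce_bucket_size == 12845056
assert stage3_prefetch_bucket_size == 11560550
assert stage3_param_persistence_threshold == 35840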
38
examples/deepspeed/ds_z3_offload_config.json
Normal file
@ -0,0 +1,38 @@
{
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "zero_allow_untested_optimizer": true,
  "fp16": {
    "enabled": "auto",
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 16,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "bf16": {
    "enabled": "auto"
  },
  "zero_optimization": {
    "stage": 3,
    "offload_optimizer": {
      "device": "cpu",
      "pin_memory": true
    },
    "offload_param": {
      "device": "cpu",
      "pin_memory": true
    },
    "overlap_comm": false,
    "contiguous_gradients": true,
    "sub_group_size": 1e9,
    "reduce_bucket_size": "auto",
    "stage3_prefetch_bucket_size": "auto",
    "stage3_param_persistence_threshold": "auto",
    "stage3_max_live_parameters": 1e9,
    "stage3_max_reuse_distance": 1e9,
    "stage3_gather_16bit_weights_on_model_save": true
  }
}