diff --git a/data/README.md b/data/README.md
index dc1c8bce..a7375b5d 100644
--- a/data/README.md
+++ b/data/README.md
@@ -11,7 +11,8 @@ If you are using a custom dataset, please provide your dataset definition in the
     "query": "the name of the column in the datasets containing the queries. (default: input)",
     "response": "the name of the column in the datasets containing the responses. (default: output)",
     "history": "the name of the column in the datasets containing the history of chat. (default: None)"
-  }
+  },
+  "stage": "the stage at which the data is used: pt, sft, or rm, corresponding to pre-training, supervised fine-tuning (PPO), and reward model (DPO) training, respectively. (default: None)"
 }
 ```
@@ -26,6 +27,7 @@ For datasets used in reward modeling or DPO training, the `response` column shou
   "output": [
     "Chosen answer",
     "Rejected answer"
-  ]
+  ],
+  "stage": "rm"
 }
 ```
diff --git a/data/README_zh.md b/data/README_zh.md
index 054ee8ea..e23a3e70 100644
--- a/data/README_zh.md
+++ b/data/README_zh.md
@@ -11,7 +11,8 @@
     "query": "数据集代表请求的表头名称(默认:input)",
     "response": "数据集代表回答的表头名称(默认:output)",
     "history": "数据集代表历史对话的表头名称(默认:None)"
-  }
+  },
+  "stage": "数据所应用的训练阶段,可选值有 pt、sft、rm 三个,分别对应预训练、指令监督微调(PPO)、奖励模型(DPO)训练,默认为 None,表示不限制"
 }
 ```
@@ -26,6 +27,7 @@
   "output": [
     "Chosen answer",
     "Rejected answer"
-  ]
+  ],
+  "stage": "rm"
 }
 ```
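As an illustration of the new field, a custom dataset definition for a preference dataset could set `stage` next to its column mapping. This is a minimal sketch, not taken from the patch itself: the dataset name and `file_name` value are placeholders, and the assumption that the column mapping sits under a `columns` key follows the surrounding README format rather than anything this diff adds.

```json
"example_comparison_dataset": {
  "file_name": "example_comparison_data.json",
  "columns": {
    "query": "input",
    "response": "output"
  },
  "stage": "rm"
}
```

Setting `stage` to `rm` marks the dataset for reward model (DPO) training; per the descriptions above, leaving it unset (the default `None`) places no restriction on the stage at which the dataset may be used.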