[data] support discarding history CoT for multi-turn (#10435)

This commit is contained in:
Kingsley
2026-04-27 00:32:44 +08:00
committed by GitHub
parent 79c8332e4c
commit c8890c32db
3 changed files with 47 additions and 19 deletions

View File

@@ -61,7 +61,8 @@ class SupervisedDatasetProcessor(DatasetProcessor):
input_ids, labels = self.template.mm_plugin.process_token_ids(
[], [], images, videos, audios, self.tokenizer, self.processor
)
encoded_pairs = self.template.encode_multiturn(self.tokenizer, messages, system, tools)
discarding_history_cot = self.data_args.mask_history and not self.template.preserve_thinking
encoded_pairs = self.template.encode_multiturn(self.tokenizer, messages, system, tools, discarding_history_cot)
total_length = len(input_ids) + (1 if self.template.efficient_eos else 0)
if self.data_args.mask_history:
encoded_pairs = encoded_pairs[::-1] # high priority for last turns