This commit is contained in:
hiyouga
2024-01-21 00:03:09 +08:00
parent a9c18255aa
commit 55f707196e
5 changed files with 60 additions and 50 deletions

View File

@@ -2,7 +2,7 @@ import json
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
-from typing import Any, Dict, List, Literal, Set, Sequence, Tuple, Union
+from typing import Any, Dict, List, Literal, Sequence, Set, Tuple, Union
SLOTS = Sequence[Union[str, Set[str], Dict[str, str]]]

View File

@@ -144,8 +144,8 @@ class Template:
max_len=(cutoff_len - total_length),
reserved_label_len=reserved_label_len,
)
-encoded_messages[i] = encoded_messages[i][: max_source_len]
-encoded_messages[i + 1] = encoded_messages[i + 1][: max_target_len]
+encoded_messages[i] = encoded_messages[i][:max_source_len]
+encoded_messages[i + 1] = encoded_messages[i + 1][:max_target_len]
total_length += len(encoded_messages[i]) + len(encoded_messages[i + 1])
encoded_pairs.append((encoded_messages[i], encoded_messages[i + 1]))
@@ -416,7 +416,7 @@ register_template(
"by the user such as English and 中文."
),
stop_words=["<|im_end|>"],
-efficient_eos=True,
+efficient_eos=True, # internlm2 tokenizer cannot set eos_token_id
)
@@ -455,9 +455,7 @@ register_template(
register_template(
name="openchat",
-format_user=StringFormatter(
-slots=["GPT4 Correct User: {{content}}", {"eos_token"}, "GPT4 Correct Assistant:"]
-),
+format_user=StringFormatter(slots=["GPT4 Correct User: {{content}}", {"eos_token"}, "GPT4 Correct Assistant:"]),
format_assistant=StringFormatter(slots=["{{content}}"]),
format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
force_system=True,