diff --git a/src/llmtuner/dsets/preprocess.py b/src/llmtuner/dsets/preprocess.py
index efe7e97e..7ef4da7f 100644
--- a/src/llmtuner/dsets/preprocess.py
+++ b/src/llmtuner/dsets/preprocess.py
@@ -34,6 +34,8 @@ def preprocess_dataset(
         # build grouped texts with format `X1 X2 X3 ...`
         if isinstance(getattr(tokenizer, "tokenizer", None), tiktoken.Encoding):
             kwargs = dict(allowed_special="all") # for tiktoken tokenizer (Qwen)
+        else:
+            kwargs = dict(add_special_tokens=True)
 
         if hasattr(tokenizer, "add_bos_token") and hasattr(tokenizer, "add_eos_token"):
             setattr(tokenizer, "add_bos_token", True) # for LLaMA tokenizer
diff --git a/src/llmtuner/webui/utils.py b/src/llmtuner/webui/utils.py
index 32625cba..4ee482d2 100644
--- a/src/llmtuner/webui/utils.py
+++ b/src/llmtuner/webui/utils.py
@@ -74,7 +74,7 @@ def can_quantize(finetuning_type: str) -> Dict[str, Any]:
 def gen_cmd(args: Dict[str, Any]) -> str:
     if args.get("do_train", None):
         args["plot_loss"] = True
-    cmd_lines = ["CUDA_VISIBLE_DEVICES=0 python src/train_bash.py"]
+    cmd_lines = ["CUDA_VISIBLE_DEVICES=0 python src/train_bash.py "]
     for k, v in args.items():
         if v is not None and v != "":
             cmd_lines.append(" --{} {} ".format(k, str(v)))
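
Note (not part of the diff): a minimal sketch of how the branched `kwargs` from the preprocess.py hunk above would typically be consumed, assuming the tokenizer is called directly on raw text; the helper name `tokenize_text` and its `text` argument are hypothetical and not identifiers from the repository.

# Illustrative sketch, not code from the repository.
from typing import Any, Dict, List

import tiktoken


def tokenize_text(tokenizer: Any, text: str) -> List[int]:
    if isinstance(getattr(tokenizer, "tokenizer", None), tiktoken.Encoding):
        # tiktoken-backed tokenizers (e.g. Qwen) take `allowed_special` rather than
        # `add_special_tokens`, so special tokens in the corpus are not rejected.
        kwargs: Dict[str, Any] = dict(allowed_special="all")
    else:
        # Regular Hugging Face tokenizers: request special tokens explicitly so that
        # BOS/EOS handling stays consistent when grouping pretraining texts.
        kwargs = dict(add_special_tokens=True)
    return tokenizer(text, **kwargs)["input_ids"]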