diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py
index 30200456..9e60175b 100644
--- a/src/llamafactory/chat/hf_engine.py
+++ b/src/llamafactory/chat/hf_engine.py
@@ -59,6 +59,13 @@ class HuggingfaceEngine(BaseEngine):
             self.tokenizer, model_args, finetuning_args, is_trainable=False, add_valuehead=(not self.can_generate)
         )  # must after fixing tokenizer to resize vocab
         self.generating_args = generating_args.to_dict()
+        try:
+            asyncio.get_event_loop()
+        except RuntimeError:
+            logger.warning("There is no current event loop, creating a new one.")
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+        self.semaphore = asyncio.Semaphore(int(os.environ.get("MAX_CONCURRENT", "1")))
 
     @staticmethod
diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py
index 467fc43d..9b305016 100644
--- a/src/llamafactory/hparams/parser.py
+++ b/src/llamafactory/hparams/parser.py
@@ -353,6 +353,7 @@ def get_infer_args(args: Optional[Dict[str, Any]] = None) -> _INFER_CLS:
 
     if model_args.export_dir is not None and model_args.export_device == "cpu":
         model_args.device_map = {"": torch.device("cpu")}
+        model_args.model_max_length = data_args.cutoff_len
     else:
         model_args.device_map = "auto"
 
diff --git a/src/llamafactory/model/model_utils/rope.py b/src/llamafactory/model/model_utils/rope.py
index 88303c4d..4373ee19 100644
--- a/src/llamafactory/model/model_utils/rope.py
+++ b/src/llamafactory/model/model_utils/rope.py
@@ -39,8 +39,8 @@ def configure_rope(config: "PretrainedConfig", model_args: "ModelArguments", is_
         logger.warning("Current model does not support RoPE scaling.")
         return
 
-    if is_trainable:
-        if model_args.rope_scaling == "dynamic":
+    if model_args.model_max_length is not None:
+        if is_trainable and model_args.rope_scaling == "dynamic":
             logger.warning(
                 "Dynamic NTK scaling may not work well with fine-tuning. "
                 "See: https://github.com/huggingface/transformers/pull/24653"
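
Illustrative sketch (not part of the patch above): the semaphore created in HuggingfaceEngine.__init__ is the standard asyncio pattern for capping the number of concurrent generation requests. The ThrottledEngine class and the chat() / _generate() coroutines below are hypothetical stand-ins rather than the project's actual API; they only show how such a semaphore, sized by the MAX_CONCURRENT environment variable, would be acquired around the model call.

# Illustrative sketch only -- not part of the patch above.
# ThrottledEngine, chat() and _generate() are hypothetical names; they show how
# a semaphore like the one added in hf_engine.py gates concurrent requests.
import asyncio
import os


class ThrottledEngine:
    def __init__(self) -> None:
        # Same pattern as the patch: ensure an event loop exists, then cap
        # concurrency via the MAX_CONCURRENT environment variable (default 1).
        try:
            asyncio.get_event_loop()
        except RuntimeError:
            asyncio.set_event_loop(asyncio.new_event_loop())
        self.semaphore = asyncio.Semaphore(int(os.environ.get("MAX_CONCURRENT", "1")))

    async def chat(self, prompt: str) -> str:
        # At most MAX_CONCURRENT coroutines run _generate() at once; others wait here.
        async with self.semaphore:
            return await self._generate(prompt)

    async def _generate(self, prompt: str) -> str:
        await asyncio.sleep(0.1)  # stand-in for the actual model generation call
        return "echo: " + prompt


async def main() -> None:
    engine = ThrottledEngine()
    replies = await asyncio.gather(*(engine.chat(f"q{i}") for i in range(4)))
    print(replies)


if __name__ == "__main__":
    asyncio.run(main())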