diff --git a/Dockerfile b/Dockerfile
index 7f930148..c3d231b5 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,13 +1,12 @@
-FROM cnstark/pytorch:2.0.1-py3.9.17-cuda11.8.0-ubuntu20.04
+FROM nvcr.io/nvidia/pytorch:24.01-py3
 
 WORKDIR /app
 
 COPY requirements.txt /app/
-RUN pip install -r requirements.txt && \
-    pip install tiktoken && \
-    pip install transformers_stream_generator
+RUN pip install -r requirements.txt
 
 COPY . /app/
+RUN pip install -e .[deepspeed,metrics,bitsandbytes,qwen]
 
 VOLUME [ "/root/.cache/huggingface/", "/app/data", "/app/output" ]
 EXPOSE 7860
diff --git a/docker-compose.yml b/docker-compose.yml
index 267ea694..9602a3e3 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -12,7 +12,7 @@ services:
       - ./output:/app/output
     ports:
       - "7860:7860"
-    shm_size: 16G
+    ipc: host
     deploy:
       resources:
         reservations:
diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index bd484052..210044f2 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -283,6 +283,9 @@ def patch_config(
         setattr(config, dtype_name, model_args.compute_dtype == dtype)
 
     _configure_attn_implementation(model_args, init_kwargs)
+    if getattr(config, "model_type", None) == "qwen" and init_kwargs["attn_implementation"] != "flash_attention_2":
+        config.use_flash_attn = False
+
     _configure_rope(config, model_args, is_trainable)
     _configure_longlora(config, model_args, is_trainable)
     _configure_quantization(config, tokenizer, model_args, init_kwargs)
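
A note on the docker-compose change: PyTorch DataLoader workers hand batches to the main process through POSIX shared memory (/dev/shm), which Docker caps at the container's shm size. Replacing the fixed `shm_size: 16G` with `ipc: host` shares the host's IPC namespace instead, so no cap has to be guessed in advance. The sketch below is an illustration of the code path that exercises shared memory, not part of the patch; the dataset shape and worker count are arbitrary.

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    # Worker processes serialize fetched tensors into /dev/shm segments
    # before passing them to the main process; with too little shared
    # memory this fails with bus errors or
    # "DataLoader worker exited unexpectedly".
    dataset = TensorDataset(torch.randn(1024, 128))
    loader = DataLoader(dataset, batch_size=32, num_workers=2)

    for (batch,) in loader:
        pass  # shm pressure scales with num_workers and batch size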
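
The patcher change targets Qwen's trust-remote-code models, whose modeling file reads its own `config.use_flash_attn` flag (default "auto") and tries to import flash-attn on its own, independently of the `attn_implementation` passed to `from_pretrained`. The new guard pins that flag to False whenever the chosen backend is not `flash_attention_2`. Below is a minimal standalone sketch of the same logic; the helper name is hypothetical, `SimpleNamespace` stands in for the real `PretrainedConfig`, and `.get()` is used defensively in case `attn_implementation` was never set (the patch itself indexes the key directly).

    from types import SimpleNamespace

    def _sync_qwen_flash_attn(config, init_kwargs):
        """Keep Qwen's custom use_flash_attn flag consistent with the
        requested attention backend."""
        if getattr(config, "model_type", None) == "qwen" and init_kwargs.get("attn_implementation") != "flash_attention_2":
            config.use_flash_attn = False

    # Stand-in config: Qwen defaults use_flash_attn to "auto", which makes
    # its remote modeling code attempt a flash_attn import at load time.
    config = SimpleNamespace(model_type="qwen", use_flash_attn="auto")
    _sync_qwen_flash_attn(config, {"attn_implementation": "sdpa"})
    assert config.use_flash_attn is False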