[model] fix kv cache (#7564)

commit 37d783149d (parent 69b0c1cf4f)
Author: hoshi-hiyouga
Date: 2025-04-01 23:07:46 +08:00
Committed by: GitHub

16 changed files with 122 additions and 64 deletions


@@ -56,7 +56,7 @@ def vllm_infer(
     Usage: python vllm_infer.py --model_name_or_path meta-llama/Llama-2-7b-hf --template llama --dataset alpaca_en_demo
     """
-    check_version("vllm>=0.4.3,<=0.7.3")
+    check_version("vllm>=0.4.3,<=0.8.2")
     if pipeline_parallel_size > get_device_count():
         raise ValueError("Pipeline parallel size should be smaller than the number of gpus.")
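
The changed line raises the upper bound of the vLLM versions that the script accepts at runtime from 0.7.3 to 0.8.2. For context, here is a minimal sketch of what a range guard like check_version() can look like, built on the packaging library; it is an illustration under assumptions, not LLaMA-Factory's actual implementation.

from importlib.metadata import version as installed_version

from packaging.requirements import Requirement


def check_version(requirement: str) -> None:
    """Raise if the installed package does not satisfy a spec like 'vllm>=0.4.3,<=0.8.2'."""
    req = Requirement(requirement)
    current = installed_version(req.name)  # raises PackageNotFoundError if the package is absent
    if not req.specifier.contains(current, prereleases=True):
        raise RuntimeError(f"{req.name}=={current} does not satisfy '{requirement}'.")


# Mirrors the guard on the added line of the diff above.
check_version("vllm>=0.4.3,<=0.8.2")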