[test] add npu test yaml and add ascend a3 docker file (#9547)

Co-authored-by: jiaqiw09 <jiaqiw960714@gmail.com>
Author: Username_Full
Date: 2025-11-30 09:37:08 +08:00 (committed by GitHub)
Parent: 22be45c78c
Commit: e43a972b25
33 changed files with 322 additions and 21 deletions
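
Every hunk shown below follows the same pattern: a @pytest.mark.runs_on(["cpu"]) marker is added above each template test so the suite can be partitioned between CPU runners and the new Ascend (NPU) CI job. The diff itself does not show how the marker is consumed; the following is only a minimal conftest.py sketch of one plausible gating mechanism, where the LLAMAFACTORY_TEST_DEVICE variable name and the skip-based logic are assumptions for illustration, not the repository's actual code.

# conftest.py (sketch only): the environment variable name and the skip-based
# gating below are assumptions, not the code added by this commit.
import os

import pytest

# Device the current runner provides, e.g. "cpu" or "npu" (assumed convention).
CURRENT_DEVICE = os.getenv("LLAMAFACTORY_TEST_DEVICE", "cpu")


def pytest_collection_modifyitems(config, items):
    """Skip any test whose runs_on marker does not list the current device."""
    for item in items:
        marker = item.get_closest_marker("runs_on")
        if marker is None:
            continue  # unmarked tests run everywhere

        allowed_devices = marker.args[0] if marker.args else []
        if CURRENT_DEVICE not in allowed_devices:
            item.add_marker(
                pytest.mark.skip(
                    reason=f"runs on {allowed_devices}, runner provides {CURRENT_DEVICE}"
                )
            )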


@@ -89,6 +89,7 @@ def _check_template(
_check_tokenization(tokenizer, (prompt_ids, answer_ids), (prompt_str, answer_str))
@pytest.mark.runs_on(["cpu"])
@pytest.mark.parametrize("use_fast", [True, False])
def test_encode_oneturn(use_fast: bool):
tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA3, use_fast=use_fast)
@@ -104,6 +105,7 @@ def test_encode_oneturn(use_fast: bool):
_check_tokenization(tokenizer, (prompt_ids, answer_ids), (prompt_str, answer_str))
@pytest.mark.runs_on(["cpu"])
@pytest.mark.parametrize("use_fast", [True, False])
def test_encode_multiturn(use_fast: bool):
tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA3, use_fast=use_fast)
@@ -125,6 +127,7 @@ def test_encode_multiturn(use_fast: bool):
)
@pytest.mark.runs_on(["cpu"])
@pytest.mark.parametrize("use_fast", [True, False])
@pytest.mark.parametrize("cot_messages", [True, False])
@pytest.mark.parametrize("enable_thinking", [True, False, None])
@@ -151,6 +154,7 @@ def test_reasoning_encode_oneturn(use_fast: bool, cot_messages: bool, enable_thi
_check_tokenization(tokenizer, (prompt_ids, answer_ids), (prompt_str, answer_str))
@pytest.mark.runs_on(["cpu"])
@pytest.mark.parametrize("use_fast", [True, False])
@pytest.mark.parametrize("cot_messages", [True, False])
@pytest.mark.parametrize("enable_thinking", [True, False, None])
@@ -180,6 +184,7 @@ def test_reasoning_encode_multiturn(use_fast: bool, cot_messages: bool, enable_t
)
@pytest.mark.runs_on(["cpu"])
@pytest.mark.parametrize("use_fast", [True, False])
def test_jinja_template(use_fast: bool):
tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA3, use_fast=use_fast)
@@ -190,6 +195,7 @@ def test_jinja_template(use_fast: bool):
assert tokenizer.apply_chat_template(MESSAGES) == ref_tokenizer.apply_chat_template(MESSAGES)
@pytest.mark.runs_on(["cpu"])
def test_ollama_modelfile():
tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA3)
template = get_template_and_fix_tokenizer(tokenizer, DataArguments(template="llama3"))
@@ -207,12 +213,14 @@ def test_ollama_modelfile():
)
@pytest.mark.runs_on(["cpu"])
def test_get_stop_token_ids():
tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA3)
template = get_template_and_fix_tokenizer(tokenizer, DataArguments(template="llama3"))
assert set(template.get_stop_token_ids(tokenizer)) == {128008, 128009}
@pytest.mark.runs_on(["cpu"])
@pytest.mark.skipif(not HF_TOKEN, reason="Gated model.")
@pytest.mark.parametrize("use_fast", [True, False])
def test_gemma_template(use_fast: bool):
@@ -226,6 +234,7 @@ def test_gemma_template(use_fast: bool):
_check_template("google/gemma-3-4b-it", "gemma", prompt_str, answer_str, use_fast)
@pytest.mark.runs_on(["cpu"])
@pytest.mark.skipif(not HF_TOKEN, reason="Gated model.")
@pytest.mark.parametrize("use_fast", [True, False])
def test_gemma2_template(use_fast: bool):
@@ -239,6 +248,7 @@ def test_gemma2_template(use_fast: bool):
_check_template("google/gemma-2-2b-it", "gemma2", prompt_str, answer_str, use_fast)
@pytest.mark.runs_on(["cpu"])
@pytest.mark.skipif(not HF_TOKEN, reason="Gated model.")
@pytest.mark.parametrize("use_fast", [True, False])
def test_llama3_template(use_fast: bool):
@@ -252,6 +262,7 @@ def test_llama3_template(use_fast: bool):
_check_template("meta-llama/Meta-Llama-3-8B-Instruct", "llama3", prompt_str, answer_str, use_fast)
@pytest.mark.runs_on(["cpu"])
@pytest.mark.parametrize(
"use_fast", [True, pytest.param(False, marks=pytest.mark.xfail(reason="Llama 4 has no slow tokenizer."))]
)
@@ -273,6 +284,8 @@ def test_llama4_template(use_fast: bool):
pytest.param(False, marks=pytest.mark.xfail(reason="Phi-4 slow tokenizer is broken.")),
],
)
@pytest.mark.runs_on(["cpu"])
def test_phi4_template(use_fast: bool):
prompt_str = (
f"<|im_start|>user<|im_sep|>{MESSAGES[0]['content']}<|im_end|>"
@@ -284,6 +297,7 @@ def test_phi4_template(use_fast: bool):
_check_template("microsoft/phi-4", "phi4", prompt_str, answer_str, use_fast)
@pytest.mark.runs_on(["cpu"])
@pytest.mark.xfail(not HF_TOKEN, reason="Authorization.")
@pytest.mark.parametrize("use_fast", [True, False])
def test_qwen2_5_template(use_fast: bool):
@@ -298,6 +312,7 @@ def test_qwen2_5_template(use_fast: bool):
_check_template("Qwen/Qwen2.5-7B-Instruct", "qwen", prompt_str, answer_str, use_fast)
@pytest.mark.runs_on(["cpu"])
@pytest.mark.parametrize("use_fast", [True, False])
@pytest.mark.parametrize("cot_messages", [True, False])
def test_qwen3_template(use_fast: bool, cot_messages: bool):
@@ -317,6 +332,7 @@ def test_qwen3_template(use_fast: bool, cot_messages: bool):
_check_template("Qwen/Qwen3-8B", "qwen3", prompt_str, answer_str, use_fast, messages=messages)
@pytest.mark.runs_on(["cpu"])
def test_parse_llama3_template():
tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA3, token=HF_TOKEN)
template = parse_template(tokenizer)
@@ -330,6 +346,7 @@ def test_parse_llama3_template():
assert template.default_system == ""
@pytest.mark.runs_on(["cpu"])
@pytest.mark.xfail(not HF_TOKEN, reason="Authorization.")
def test_parse_qwen_template():
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
@@ -342,6 +359,7 @@ def test_parse_qwen_template():
assert template.default_system == "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."
@pytest.mark.runs_on(["cpu"])
@pytest.mark.xfail(not HF_TOKEN, reason="Authorization.")
def test_parse_qwen3_template():
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B", token=HF_TOKEN)
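
For pytest to accept the new marker without an "unknown mark" warning it also has to be registered somewhere; a minimal sketch of doing that from the same hypothetical conftest.py is given below. The repository may instead register it in pyproject.toml or pytest.ini, and the description text is assumed.

def pytest_configure(config):
    # Register the custom marker so pytest does not emit PytestUnknownMarkWarning.
    config.addinivalue_line(
        "markers",
        "runs_on(devices): run the test only on runners providing one of the listed devices",
    )

Under this sketch a CPU-only run would look like LLAMAFACTORY_TEST_DEVICE=cpu pytest tests/, while the Ascend A3 image added by this commit would presumably export npu instead.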