Mirror of https://github.com/hiyouga/LLaMA-Factory.git, synced 2025-12-15 11:20:35 +08:00
support vllm
tests/test_throughput.py (new file, 30 lines added)
@@ -0,0 +1,30 @@
import os
import time

from openai import OpenAI
from transformers.utils.versions import require_version


require_version("openai>=1.5.0", "To fix: pip install openai>=1.5.0")


def main():
    client = OpenAI(
        api_key="0",
        base_url="http://localhost:{}/v1".format(os.environ.get("API_PORT", 8000)),
    )
    messages = [{"role": "user", "content": "Write a long essay about environment protection as long as possible."}]
    num_tokens = 0
    start_time = time.time()
    for _ in range(8):
        result = client.chat.completions.create(messages=messages, model="test")
        num_tokens += result.usage.completion_tokens

    elapsed_time = time.time() - start_time
    print("Throughput: {:.2f} tokens/s".format(num_tokens / elapsed_time))
    # --infer_backend hf: 27.22 tokens/s (1.0x)
    # --infer_backend vllm: 73.03 tokens/s (2.7x)


if __name__ == "__main__":
    main()
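Note: the loop above issues its 8 requests sequentially, so the 2.7x figure likely understates what the vLLM backend can do under load, since vLLM's continuous batching is built for concurrent requests. Below is a minimal concurrent variant as a sketch; it is not part of this commit, and it reuses the endpoint, the dummy "test" model id, and the API_PORT convention from the file above.

import os
import time
from concurrent.futures import ThreadPoolExecutor

from openai import OpenAI


def main():
    # Same local OpenAI-compatible endpoint and dummy key as tests/test_throughput.py.
    client = OpenAI(
        api_key="0",
        base_url="http://localhost:{}/v1".format(os.environ.get("API_PORT", 8000)),
    )
    messages = [{"role": "user", "content": "Write a long essay about environment protection as long as possible."}]

    def one_request(_: int) -> int:
        # One chat completion; return the number of generated tokens.
        result = client.chat.completions.create(messages=messages, model="test")
        return result.usage.completion_tokens

    start_time = time.time()
    # Issue all 8 requests concurrently so the server can batch them.
    with ThreadPoolExecutor(max_workers=8) as pool:
        num_tokens = sum(pool.map(one_request, range(8)))
    print("Concurrent throughput: {:.2f} tokens/s".format(num_tokens / (time.time() - start_time)))


if __name__ == "__main__":
    main()

With the hf backend the concurrent numbers should stay roughly flat, since requests are generated one at a time; with the vllm backend the gap over the sequential measurement would typically widen.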