From 7d4dc25c23cbe0657598df1e14acb9163b15e25a Mon Sep 17 00:00:00 2001
From: SnowFox4004 <101725770+SnowFox4004@users.noreply.github.com>
Date: Sun, 23 Mar 2025 19:21:01 +0800
Subject: [PATCH] [scripts] support computing scores on vLLM predictions (#7419)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* enable manual BLEU & ROUGE evaluation by adding `scripts/eval_bleu_rouge.py`

* added a check for the required libraries

* update: use the datasets library's multiprocessing to speed up processing

* update:
- use fire.Fire
- fix code formatting

* Update eval_bleu_rouge.py: correctly use fire

Deleted the code that read arguments from sys.argv

* Update eval_bleu_rouge.py

---------

Co-authored-by: SnowFox4004
Co-authored-by: hoshi-hiyouga
---
 scripts/eval_bleu_rouge.py | 64 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 scripts/eval_bleu_rouge.py

diff --git a/scripts/eval_bleu_rouge.py b/scripts/eval_bleu_rouge.py
new file mode 100644
index 00000000..58e6c646
--- /dev/null
+++ b/scripts/eval_bleu_rouge.py
@@ -0,0 +1,64 @@
+import json
+import logging
+import time
+
+import fire
+from datasets import load_dataset
+
+
+try:
+    import jieba
+    from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
+    from rouge_chinese import Rouge
+
+    jieba.setLogLevel(logging.CRITICAL)
+    jieba.initialize()
+except ImportError:
+    print("Please install llamafactory with `pip install -e .[metrics]`.")
+    raise
+
+
+def compute_metrics(sample):
+    hypothesis = list(jieba.cut(sample["predict"]))
+    reference = list(jieba.cut(sample["label"]))
+
+    bleu_score = sentence_bleu(
+        [list(sample["label"])],
+        list(sample["predict"]),
+        smoothing_function=SmoothingFunction().method3,
+    )
+
+    if len(" ".join(hypothesis).split()) == 0 or len(" ".join(reference).split()) == 0:
+        result = {"rouge-1": {"f": 0.0}, "rouge-2": {"f": 0.0}, "rouge-l": {"f": 0.0}}
+    else:
+        rouge = Rouge()
+        scores = rouge.get_scores(" ".join(hypothesis), " ".join(reference))
+        result = scores[0]
+
+    metric_result = {}
+    for k, v in result.items():
+        metric_result[k] = round(v["f"] * 100, 4)
+    metric_result["bleu-4"] = round(bleu_score * 100, 4)
+
+    return metric_result
+
+
+def main(filename: str):
+    start_time = time.time()
+    dataset = load_dataset("json", data_files=filename, split="train")
+    dataset = dataset.map(compute_metrics, num_proc=8, remove_columns=dataset.column_names)
+    score_dict = dataset.to_dict()
+
+    average_score = {}
+    for task, scores in sorted(score_dict.items(), key=lambda x: x[0]):
+        print(f"{task}: {sum(scores) / len(scores):.4f}")
+        average_score[task] = sum(scores) / len(scores)
+
+    with open("predictions_score.json", "w", encoding="utf-8") as f:
+        json.dump(average_score, f, indent=4)
+
+    print(f"\nDone in {time.time() - start_time:.3f}s.\nScore file saved to predictions_score.json")
+
+
+if __name__ == "__main__":
+    fire.Fire(main)
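
A minimal usage sketch of the new script, assuming the predictions file is a JSONL whose rows carry "predict" and "label" fields (the format compute_metrics expects), that the script lives at scripts/eval_bleu_rouge.py, and that the [metrics] extras (jieba, nltk, rouge-chinese) are installed. The file name demo_predictions.jsonl is illustrative only.

    # Usage sketch: build a tiny predictions file and score it with the script above.
    import json
    import subprocess
    import sys

    # Each JSONL row needs a "predict" and a "label" field, matching compute_metrics().
    samples = [
        {"predict": "今天天气很好", "label": "今天天气不错"},
        {"predict": "the cat sat on the mat", "label": "the cat is on the mat"},
    ]
    with open("demo_predictions.jsonl", "w", encoding="utf-8") as f:
        for sample in samples:
            f.write(json.dumps(sample, ensure_ascii=False) + "\n")

    # Equivalent to running: python scripts/eval_bleu_rouge.py demo_predictions.jsonl
    subprocess.run(
        [sys.executable, "scripts/eval_bleu_rouge.py", "demo_predictions.jsonl"],
        check=True,
    )

    # The script prints the average of each metric and saves them to predictions_score.json.
    with open("predictions_score.json", encoding="utf-8") as f:
        print(json.load(f))  # keys: "bleu-4", "rouge-1", "rouge-2", "rouge-l"

Since the entry point is wrapped in fire.Fire, the argument can also be passed as a flag, e.g. --filename=demo_predictions.jsonl.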