From 7d4dc25c23cbe0657598df1e14acb9163b15e25a Mon Sep 17 00:00:00 2001
From: SnowFox4004 <101725770+SnowFox4004@users.noreply.github.com>
Date: Sun, 23 Mar 2025 19:21:01 +0800
Subject: [PATCH] [scripts] support computing scores on vLLM predictions (#7419)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* enable manual BLEU & ROUGE evaluation by adding `scripts/eval_bleu_rouge.py`

* added a check for the required libraries

* update: use the datasets library's multiprocessing to speed up processing

* update:
- use fire.Fire
- fix code formatting

* Update eval_bleu_rouge.py: correctly use fire

Deleted the code that read arguments from sys.argv

* Update eval_bleu_rouge.py

---------

Co-authored-by: SnowFox4004
Co-authored-by: hoshi-hiyouga
---
 scripts/eval_bleu_rouge.py | 64 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 scripts/eval_bleu_rouge.py

diff --git a/scripts/eval_bleu_rouge.py b/scripts/eval_bleu_rouge.py
new file mode 100644
index 00000000..58e6c646
--- /dev/null
+++ b/scripts/eval_bleu_rouge.py
@@ -0,0 +1,64 @@
+import json
+import logging
+import time
+
+import fire
+from datasets import load_dataset
+
+
+try:
+    import jieba
+    from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
+    from rouge_chinese import Rouge
+
+    jieba.setLogLevel(logging.CRITICAL)
+    jieba.initialize()
+except ImportError:
+    print("Please install llamafactory with `pip install -e .[metrics]`.")
+    raise
+
+
+def compute_metrics(sample):
+    hypothesis = list(jieba.cut(sample["predict"]))
+    reference = list(jieba.cut(sample["label"]))
+
+    bleu_score = sentence_bleu(
+        [list(sample["label"])],
+        list(sample["predict"]),
+        smoothing_function=SmoothingFunction().method3,
+    )
+
+    if len(" ".join(hypothesis).split()) == 0 or len(" ".join(reference).split()) == 0:
+        result = {"rouge-1": {"f": 0.0}, "rouge-2": {"f": 0.0}, "rouge-l": {"f": 0.0}}
+    else:
+        rouge = Rouge()
+        scores = rouge.get_scores(" ".join(hypothesis), " ".join(reference))
+        result = scores[0]
+
+    metric_result = {}
+    for k, v in result.items():
+        metric_result[k] = round(v["f"] * 100, 4)
+    metric_result["bleu-4"] = round(bleu_score * 100, 4)
+
+    return metric_result
+
+
+def main(filename: str):
+    start_time = time.time()
+    dataset = load_dataset("json", data_files=filename, split="train")
+    dataset = dataset.map(compute_metrics, num_proc=8, remove_columns=dataset.column_names)
+    score_dict = dataset.to_dict()
+
+    average_score = {}
+    for task, scores in sorted(score_dict.items(), key=lambda x: x[0]):
+        print(f"{task}: {sum(scores) / len(scores):.4f}")
+        average_score[task] = sum(scores) / len(scores)
+
+    with open("predictions_score.json", "w", encoding="utf-8") as f:
+        json.dump(average_score, f, indent=4)
+
+    print(f"\nDone in {time.time() - start_time:.3f}s.\nScore file saved to predictions_score.json")
+
+
+if __name__ == "__main__":
+    fire.Fire(main)
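
A minimal usage sketch of the new script, assuming the predictions file is a JSONL whose rows carry "predict" and "label" fields (the format compute_metrics expects), that the script lives at scripts/eval_bleu_rouge.py, and that the [metrics] extras (jieba, nltk, rouge-chinese) are installed. The file name demo_predictions.jsonl is illustrative only.

    # Usage sketch: build a tiny predictions file and score it with the script above.
    import json
    import subprocess
    import sys

    # Each JSONL row needs a "predict" and a "label" field, matching compute_metrics().
    samples = [
        {"predict": "今天天气很好", "label": "今天天气不错"},
        {"predict": "the cat sat on the mat", "label": "the cat is on the mat"},
    ]
    with open("demo_predictions.jsonl", "w", encoding="utf-8") as f:
        for sample in samples:
            f.write(json.dumps(sample, ensure_ascii=False) + "\n")

    # Equivalent to running: python scripts/eval_bleu_rouge.py demo_predictions.jsonl
    subprocess.run(
        [sys.executable, "scripts/eval_bleu_rouge.py", "demo_predictions.jsonl"],
        check=True,
    )

    # The script prints the average of each metric and saves them to predictions_score.json.
    with open("predictions_score.json", encoding="utf-8") as f:
        print(json.load(f))  # keys: "bleu-4", "rouge-1", "rouge-2", "rouge-l"

Since the entry point is wrapped in fire.Fire, the argument can also be passed as a flag, e.g. --filename=demo_predictions.jsonl.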