diff --git a/scripts/eval_bleu_rouge.py b/scripts/eval_bleu_rouge.py
new file mode 100644
index 00000000..58e6c646
--- /dev/null
+++ b/scripts/eval_bleu_rouge.py
@@ -0,0 +1,64 @@
+import json
+import logging
+import time
+
+import fire
+from datasets import load_dataset
+
+
+try:
+    import jieba
+    from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
+    from rouge_chinese import Rouge
+
+    jieba.setLogLevel(logging.CRITICAL)
+    jieba.initialize()
+except ImportError:
+    print("Please install llamafactory with `pip install -e .[metrics]`.")
+    raise
+
+
+def compute_metrics(sample):
+    hypothesis = list(jieba.cut(sample["predict"]))
+    reference = list(jieba.cut(sample["label"]))
+
+    bleu_score = sentence_bleu(
+        [list(sample["label"])],
+        list(sample["predict"]),
+        smoothing_function=SmoothingFunction().method3,
+    )
+
+    if len(" ".join(hypothesis).split()) == 0 or len(" ".join(reference).split()) == 0:
+        result = {"rouge-1": {"f": 0.0}, "rouge-2": {"f": 0.0}, "rouge-l": {"f": 0.0}}
+    else:
+        rouge = Rouge()
+        scores = rouge.get_scores(" ".join(hypothesis), " ".join(reference))
+        result = scores[0]
+
+    metric_result = {}
+    for k, v in result.items():
+        metric_result[k] = round(v["f"] * 100, 4)
+    metric_result["bleu-4"] = round(bleu_score * 100, 4)
+
+    return metric_result
+
+
+def main(filename: str):
+    start_time = time.time()
+    dataset = load_dataset("json", data_files=filename, split="train")
+    dataset = dataset.map(compute_metrics, num_proc=8, remove_columns=dataset.column_names)
+    score_dict = dataset.to_dict()
+
+    average_score = {}
+    for task, scores in sorted(score_dict.items(), key=lambda x: x[0]):
+        print(f"{task}: {sum(scores) / len(scores):.4f}")
+        average_score[task] = sum(scores) / len(scores)
+
+    with open("predictions_score.json", "w", encoding="utf-8") as f:
+        json.dump(average_score, f, indent=4)
+
+    print(f"\nDone in {time.time() - start_time:.3f}s.\nScore file saved to predictions_score.json")
+
+
+if __name__ == "__main__":
+    fire.Fire(main)
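
A minimal usage sketch (not part of the diff): each input record needs `predict` and `label` fields, and `compute_metrics` returns ROUGE-1/2/L F1 and BLEU-4 scores scaled to 0–100. The sample text below is a made-up example, and the import assumes the metric extras (`jieba`, `nltk`, `rouge-chinese`) are installed and the snippet runs from the `scripts/` directory.

```python
# Hypothetical smoke test for compute_metrics (not part of the PR).
# The record mirrors one line of the predictions JSONL that main() loads.
from eval_bleu_rouge import compute_metrics

sample = {"predict": "今天天气很好", "label": "今天天气不错"}  # made-up Chinese example
scores = compute_metrics(sample)
print(scores)  # e.g. {"rouge-1": ..., "rouge-2": ..., "rouge-l": ..., "bleu-4": ...}
```

On the command line, the script is driven by `fire`, so a whole predictions file is scored with `python scripts/eval_bleu_rouge.py <predictions.jsonl>`; per-metric averages are printed and written to `predictions_score.json`.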