Mirror of https://github.com/hiyouga/LLaMA-Factory.git (synced 2025-11-04 18:02:19 +08:00)
[scripts] support computing scores on vLLM's predictions (#7419)
* enable manual BLEU & ROUGE evaluation by adding `scripts/eval_bleu_rouge.py`
* added library checks
* update: use the datasets library's multiprocessing to speed up processing
* update: use fire.Fire and tidy up the code format
* Update eval_bleu_rouge.py: correctly use fire; deleted the code that used sys.argv
* Update eval_bleu_rouge.py

---------

Co-authored-by: SnowFox4004 <manba@out>
Co-authored-by: hoshi-hiyouga <hiyouga@buaa.edu.cn>
This commit is contained in:

parent 05b19d6952
commit 7cfd6e4bb0
scripts/eval_bleu_rouge.py | 64 lines (new file)

@@ -0,0 +1,64 @@
import json
import logging
import time

import fire
from datasets import load_dataset


try:
    import jieba
    from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
    from rouge_chinese import Rouge

    jieba.setLogLevel(logging.CRITICAL)
    jieba.initialize()
except ImportError:
    print("Please install llamafactory with `pip install -e .[metrics]`.")
    raise


def compute_metrics(sample):
    hypothesis = list(jieba.cut(sample["predict"]))
    reference = list(jieba.cut(sample["label"]))

    bleu_score = sentence_bleu(
        [list(sample["label"])],
        list(sample["predict"]),
        smoothing_function=SmoothingFunction().method3,
    )

    if len(" ".join(hypothesis).split()) == 0 or len(" ".join(reference).split()) == 0:
        result = {"rouge-1": {"f": 0.0}, "rouge-2": {"f": 0.0}, "rouge-l": {"f": 0.0}}
    else:
        rouge = Rouge()
        scores = rouge.get_scores(" ".join(hypothesis), " ".join(reference))
        result = scores[0]

    metric_result = {}
    for k, v in result.items():
        metric_result[k] = round(v["f"] * 100, 4)

    metric_result["bleu-4"] = round(bleu_score * 100, 4)

    return metric_result


def main(filename: str):
    start_time = time.time()
    dataset = load_dataset("json", data_files=filename, split="train")
    dataset = dataset.map(compute_metrics, num_proc=8, remove_columns=dataset.column_names)
    score_dict = dataset.to_dict()

    average_score = {}
    for task, scores in sorted(score_dict.items(), key=lambda x: x[0]):
        print(f"{task}: {sum(scores) / len(scores):.4f}")
        average_score[task] = sum(scores) / len(scores)

    with open("predictions_score.json", "w", encoding="utf-8") as f:
        json.dump(average_score, f, indent=4)

    print(f"\nDone in {time.time() - start_time:.3f}s.\nScore file saved to predictions_score.json")


if __name__ == "__main__":
    fire.Fire(main)
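A minimal usage sketch (not part of the commit): the predictions file name and the two sample records below are made up for illustration, while the "predict"/"label" field names, the JSON loading via datasets, and the fire-based CLI follow the script above.

import json
import subprocess

# Hypothetical predictions file; each line is a JSON object with the "predict" and
# "label" fields that compute_metrics() reads. A vLLM prediction run is assumed to
# produce a file of this shape.
samples = [
    {"predict": "今天天气很好", "label": "今天天气不错"},
    {"predict": "The weather is nice today.", "label": "Nice weather today."},
]
with open("generated_predictions.jsonl", "w", encoding="utf-8") as f:
    for sample in samples:
        f.write(json.dumps(sample, ensure_ascii=False) + "\n")

# fire.Fire(main) exposes `filename` as a positional CLI argument; the averaged
# BLEU-4 / ROUGE scores are printed and written to predictions_score.json.
subprocess.run(
    ["python", "scripts/eval_bleu_rouge.py", "generated_predictions.jsonl"],
    check=True,
)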