add CMMLU, update eval script

Former-commit-id: 4dd9b4d982
This commit is contained in:
hiyouga
2023-09-23 21:10:17 +08:00
parent 02549e3162
commit 88f2e99c73
7 changed files with 507 additions and 61 deletions

View File

@@ -92,14 +92,14 @@ task_list = [
]
class CevalExamConfig(datasets.BuilderConfig):
class CevalConfig(datasets.BuilderConfig):
def __init__(self, **kwargs):
super().__init__(version=datasets.Version("1.0.0"), **kwargs)
class CevalExam(datasets.GeneratorBasedBuilder):
class Ceval(datasets.GeneratorBasedBuilder):
BUILDER_CONFIGS = [
CevalExamConfig(
CevalConfig(
name=task_name,
)
for task_name in task_list

163
evaluation/cmmlu/cmmlu.py Normal file
View File

@@ -0,0 +1,163 @@
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import datasets
import pandas as pd
_CITATION = """\
@article{li2023cmmlu,
title={CMMLU: Measuring massive multitask language understanding in Chinese},
author={Haonan Li and Yixuan Zhang and Fajri Koto and Yifei Yang and Hai Zhao and Yeyun Gong and Nan Duan and Timothy Baldwin},
journal={arXiv preprint arXiv:2306.09212},
year={2023}
}
"""
_DESCRIPTION = """\
CMMLU is a comprehensive Chinese assessment suite specifically designed to evaluate the advanced knowledge and reasoning abilities of LLMs within the Chinese language and cultural context.
"""
_HOMEPAGE = "https://github.com/haonan-li/CMMLU"
_LICENSE = "Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License"
_URL = "cmmlu.zip"
task_list = [
'agronomy',
'anatomy',
'ancient_chinese',
'arts',
'astronomy',
'business_ethics',
'chinese_civil_service_exam',
'chinese_driving_rule',
'chinese_food_culture',
'chinese_foreign_policy',
'chinese_history',
'chinese_literature',
'chinese_teacher_qualification',
'clinical_knowledge',
'college_actuarial_science',
'college_education',
'college_engineering_hydrology',
'college_law',
'college_mathematics',
'college_medical_statistics',
'college_medicine',
'computer_science',
'computer_security',
'conceptual_physics',
'construction_project_management',
'economics',
'education',
'electrical_engineering',
'elementary_chinese',
'elementary_commonsense',
'elementary_information_and_technology',
'elementary_mathematics',
'ethnology',
'food_science',
'genetics',
'global_facts',
'high_school_biology',
'high_school_chemistry',
'high_school_geography',
'high_school_mathematics',
'high_school_physics',
'high_school_politics',
'human_sexuality',
'international_law',
'journalism',
'jurisprudence',
'legal_and_moral_basis',
'logical',
'machine_learning',
'management',
'marketing',
'marxist_theory',
'modern_chinese',
'nutrition',
'philosophy',
'professional_accounting',
'professional_law',
'professional_medicine',
'professional_psychology',
'public_relations',
'security_study',
'sociology',
'sports_science',
'traditional_chinese_medicine',
'virology',
'world_history',
'world_religions',
]
class CMMLUConfig(datasets.BuilderConfig):
def __init__(self, **kwargs):
super().__init__(version=datasets.Version("1.0.1"), **kwargs)
class CMMLU(datasets.GeneratorBasedBuilder):
BUILDER_CONFIGS = [
CMMLUConfig(
name=task_name,
)
for task_name in task_list
]
def _info(self):
features = datasets.Features(
{
"question": datasets.Value("string"),
"A": datasets.Value("string"),
"B": datasets.Value("string"),
"C": datasets.Value("string"),
"D": datasets.Value("string"),
"answer": datasets.Value("string"),
}
)
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)
def _split_generators(self, dl_manager):
data_dir = dl_manager.download_and_extract(_URL)
task_name = self.config.name
return [
datasets.SplitGenerator(
name=datasets.Split.TEST,
gen_kwargs={
"filepath": os.path.join(data_dir, f"test/{task_name}.csv"),
},
),
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={
"filepath": os.path.join(data_dir, f"dev/{task_name}.csv"),
},
),
]
def _generate_examples(self, filepath):
df = pd.read_csv(filepath, header=0, index_col=0, encoding="utf-8")
for i, instance in enumerate(df.to_dict(orient="records")):
yield i, instance

BIN
evaluation/cmmlu/cmmlu.zip Normal file

Binary file not shown.

View File

@@ -0,0 +1,270 @@
{
"agronomy": {
"name": "农学",
"category": "Other"
},
"anatomy": {
"name": "解剖学",
"category": "STEM"
},
"ancient_chinese": {
"name": "古汉语",
"category": "Social Sciences"
},
"arts": {
"name": "艺术学",
"category": "Humanities"
},
"astronomy": {
"name": "天文学",
"category": "STEM"
},
"business_ethics": {
"name": "商业伦理",
"category": "Social Sciences"
},
"chinese_civil_service_exam": {
"name": "中国公务员考试",
"category": "Social Sciences"
},
"chinese_driving_rule": {
"name": "中国驾驶规则",
"category": "Other"
},
"chinese_food_culture": {
"name": "中国饮食文化",
"category": "Social Sciences"
},
"chinese_foreign_policy": {
"name": "中国外交政策",
"category": "Social Sciences"
},
"chinese_history": {
"name": "中国历史",
"category": "Humanities"
},
"chinese_literature": {
"name": "中国文学",
"category": "Humanities"
},
"chinese_teacher_qualification": {
"name": "中国教师资格",
"category": "Social Sciences"
},
"college_actuarial_science": {
"name": "大学精算学",
"category": "STEM"
},
"college_education": {
"name": "大学教育学",
"category": "Social Sciences"
},
"college_engineering_hydrology": {
"name": "大学工程水文学",
"category": "STEM"
},
"college_law": {
"name": "大学法律",
"category": "Humanities"
},
"college_mathematics": {
"name": "大学数学",
"category": "STEM"
},
"college_medical_statistics": {
"name": "大学医学统计",
"category": "STEM"
},
"clinical_knowledge": {
"name": "临床知识",
"category": "Other"
},
"college_medicine": {
"name": "大学医学",
"category": "Other"
},
"computer_science": {
"name": "计算机科学",
"category": "STEM"
},
"computer_security": {
"name": "计算机安全",
"category": "Other"
},
"conceptual_physics": {
"name": "概念物理学",
"category": "STEM"
},
"construction_project_management": {
"name": "建设工程管理",
"category": "Other"
},
"economics": {
"name": "经济学",
"category": "Social Sciences"
},
"education": {
"name": "教育学",
"category": "Social Sciences"
},
"elementary_chinese": {
"name": "小学语文",
"category": "Social Sciences"
},
"elementary_commonsense": {
"name": "小学常识",
"category": "Other"
},
"elementary_information_and_technology": {
"name": "小学信息技术",
"category": "Other"
},
"electrical_engineering": {
"name": "电气工程",
"category": "STEM"
},
"elementary_mathematics": {
"name": "初等数学",
"category": "STEM"
},
"ethnology": {
"name": "民族学",
"category": "Social Sciences"
},
"food_science": {
"name": "食品科学",
"category": "Other"
},
"genetics": {
"name": "遗传学",
"category": "STEM"
},
"global_facts": {
"name": "全球事实",
"category": "Humanities"
},
"high_school_biology": {
"name": "高中生物",
"category": "STEM"
},
"high_school_chemistry": {
"name": "高中化学",
"category": "STEM"
},
"high_school_geography": {
"name": "高中地理",
"category": "Social Sciences"
},
"high_school_mathematics": {
"name": "高中数学",
"category": "STEM"
},
"high_school_physics": {
"name": "高中物理学",
"category": "STEM"
},
"high_school_politics": {
"name": "高中政治",
"category": "Social Sciences"
},
"human_sexuality": {
"name": "人类性行为",
"category": "Other"
},
"international_law": {
"name": "国际法学",
"category": "Humanities"
},
"journalism": {
"name": "新闻学",
"category": "Social Sciences"
},
"jurisprudence": {
"name": "法理学",
"category": "Humanities"
},
"legal_and_moral_basis": {
"name": "法律与道德基础",
"category": "Other"
},
"logical": {
"name": "逻辑学",
"category": "Humanities"
},
"machine_learning": {
"name": "机器学习",
"category": "STEM"
},
"management": {
"name": "管理学",
"category": "Social Sciences"
},
"marketing": {
"name": "市场营销",
"category": "Social Sciences"
},
"marxist_theory": {
"name": "马克思主义理论",
"category": "Humanities"
},
"modern_chinese": {
"name": "现代汉语",
"category": "Social Sciences"
},
"nutrition": {
"name": "营养学",
"category": "Other"
},
"philosophy": {
"name": "哲学",
"category": "Humanities"
},
"professional_accounting": {
"name": "专业会计",
"category": "Social Sciences"
},
"professional_law": {
"name": "专业法学",
"category": "Humanities"
},
"professional_medicine": {
"name": "专业医学",
"category": "Other"
},
"professional_psychology": {
"name": "专业心理学",
"category": "Social Sciences"
},
"public_relations": {
"name": "公共关系",
"category": "Social Sciences"
},
"security_study": {
"name": "安全研究",
"category": "Social Sciences"
},
"sociology": {
"name": "社会学",
"category": "Social Sciences"
},
"sports_science": {
"name": "体育学",
"category": "Other"
},
"traditional_chinese_medicine": {
"name": "中医中药",
"category": "Other"
},
"virology": {
"name": "病毒学",
"category": "STEM"
},
"world_history": {
"name": "世界历史",
"category": "Humanities"
},
"world_religions": {
"name": "世界宗教",
"category": "Humanities"
}
}