# Mirror of https://github.com/hiyouga/LLaMA-Factory.git
# Synced 2025-07-31 10:42:50 +08:00
# 164 lines, 4.9 KiB, Python
# Copyright 2025 the LlamaFactory team.
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
|
|
|
|
import os
|
|
|
|
import datasets
|
|
import pandas as pd
|
|
|
|
|
|
# BibTeX entry for the C-Eval paper; surfaced as the `citation` field of the
# DatasetInfo built in `Ceval._info`.
_CITATION = """\
@article{huang2023ceval,
title={C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models},
author={Huang, Yuzhen and Bai, Yuzhuo and Zhu, Zhihao and others},
journal={arXiv preprint arXiv:2305.08322},
year={2023}
}
"""

# Human-readable summary; surfaced as the `description` field of the DatasetInfo.
_DESCRIPTION = """\
C-Eval is a comprehensive Chinese evaluation suite for foundation models.
It consists of 13948 multi-choice questions spanning 52 diverse disciplines and four difficulty levels.
"""

_HOMEPAGE = "https://cevalbenchmark.com"

_LICENSE = "Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License"

# Archive handed to `dl_manager.download_and_extract` in
# `Ceval._split_generators`. It is a bare file name rather than a full URL, so
# it is presumably resolved relative to this script's location -- confirm
# against how the loader is invoked.
_URL = "ceval.zip"

# One entry per C-Eval subject. Each name becomes a builder configuration
# (see `Ceval.BUILDER_CONFIGS`) and is also the file-name prefix of that
# subject's CSV files inside the extracted archive.
task_list = [
    "computer_network",
    "operating_system",
    "computer_architecture",
    "college_programming",
    "college_physics",
    "college_chemistry",
    "advanced_mathematics",
    "probability_and_statistics",
    "discrete_mathematics",
    "electrical_engineer",
    "metrology_engineer",
    "high_school_mathematics",
    "high_school_physics",
    "high_school_chemistry",
    "high_school_biology",
    "middle_school_mathematics",
    "middle_school_biology",
    "middle_school_physics",
    "middle_school_chemistry",
    "veterinary_medicine",
    "college_economics",
    "business_administration",
    "marxism",
    "mao_zedong_thought",
    "education_science",
    "teacher_qualification",
    "high_school_politics",
    "high_school_geography",
    "middle_school_politics",
    "middle_school_geography",
    "modern_chinese_history",
    "ideological_and_moral_cultivation",
    "logic",
    "law",
    "chinese_language_and_literature",
    "art_studies",
    "professional_tour_guide",
    "legal_professional",
    "high_school_chinese",
    "high_school_history",
    "middle_school_history",
    "civil_servant",
    "sports_science",
    "plant_protection",
    "basic_medicine",
    "clinical_medicine",
    "urban_and_rural_planner",
    "accountant",
    "fire_engineer",
    "environmental_impact_assessment_engineer",
    "tax_accountant",
    "physician",
]
|
|
|
|
|
|
class CevalConfig(datasets.BuilderConfig):
    """Builder configuration for a single C-Eval subject.

    Every configuration is pinned to dataset version ``1.0.0``; all keyword
    arguments (for example ``name``) are forwarded unchanged to
    ``datasets.BuilderConfig``.
    """

    def __init__(self, **kwargs):
        """Create the configuration.

        Args:
            **kwargs: Keyword arguments passed through to
                ``datasets.BuilderConfig.__init__``. ``version`` must not be
                supplied because it is fixed here.
        """
        super().__init__(version=datasets.Version("1.0.0"), **kwargs)
|
|
|
|
|
|
class Ceval(datasets.GeneratorBasedBuilder):
    """Dataset builder for C-Eval, with one configuration per entry in ``task_list``."""

    # One CevalConfig per subject; the config name selects which CSV files are read.
    BUILDER_CONFIGS = [CevalConfig(name=task_name) for task_name in task_list]

    def _info(self):
        """Return the dataset metadata together with the per-example feature schema."""
        features = datasets.Features(
            {
                "id": datasets.Value("int32"),
                "question": datasets.Value("string"),
                "A": datasets.Value("string"),
                "B": datasets.Value("string"),
                "C": datasets.Value("string"),
                "D": datasets.Value("string"),
                "answer": datasets.Value("string"),
                "explanation": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download and extract the archive and describe the three splits.

        Args:
            dl_manager: Download manager supplied by ``datasets``; only its
                ``download_and_extract`` method is used.

        Returns:
            A list of ``datasets.SplitGenerator`` objects in the order TEST,
            VALIDATION, TRAIN. Each one carries the ``filepath`` of the CSV for
            the configured subject, ``<data_dir>/<folder>/<task>_<folder>.csv``.
        """
        data_dir = dl_manager.download_and_extract(_URL)
        task_name = self.config.name
        # (split, folder) pairs. The folder name doubles as the CSV file-name
        # suffix. Note that the archive's "dev" folder is exposed as TRAIN.
        split_folders = (
            (datasets.Split.TEST, "test"),
            (datasets.Split.VALIDATION, "val"),
            (datasets.Split.TRAIN, "dev"),
        )
        return [
            datasets.SplitGenerator(
                name=split,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, folder, f"{task_name}_{folder}.csv"),
                },
            )
            for split, folder in split_folders
        ]

    def _generate_examples(self, filepath):
        """Yield ``(index, example)`` pairs read from one subject CSV.

        Args:
            filepath: Path of the UTF-8 encoded CSV file to read.

        Yields:
            Tuples of the zero-based row index and a dict mapping column names
            to that row's values.
        """
        df = pd.read_csv(filepath, encoding="utf-8")
        for i, instance in enumerate(df.to_dict(orient="records")):
            # Fill columns that are absent from the CSV with empty strings so
            # every example carries all keys declared in `_info`. Presumably
            # some splits ship without answers/explanations -- confirm against
            # the archive. Existing values are left untouched.
            instance.setdefault("answer", "")
            instance.setdefault("explanation", "")
            yield i, instance
|