use pre-commit

Former-commit-id: 21db8ed2f4
2026-03-04 18:56:01 +08:00 · 2024-10-29 09:07:46 +00:00
parent 163cf2ba5c
commit 0d8aa6e6ef
86 changed files with 1048 additions and 1064 deletions
--- a/data/alpaca_en_demo.json
+++ b/data/alpaca_en_demo.json
@@ -4999,4 +4999,4 @@
    "input": "Time waits for no one.",
    "output": "No one can stop time from moving forward."
  }
-]
+]
--- a/data/alpaca_zh_demo.json
+++ b/data/alpaca_zh_demo.json
@@ -4999,4 +4999,4 @@
    "input": "",
    "output": "安第斯山脉位于南美洲，横跨七个国家，包括委内瑞拉，哥伦比亚，厄瓜多尔，秘鲁，玻利维亚，智利和阿根廷。安第斯山脉是世界上最长的山脉之一，全长约7,000千米（4,350英里），其山脉沿着南美洲西海岸蜿蜒延伸，平均海拔约为4,000米（13,000英尺）。在其南部，安第斯山脉宽度达到700千米（430英里），在其北部宽度约为500千米（310英里）。"
  }
-]
+]
--- a/data/belle_multiturn/belle_multiturn.py
+++ b/data/belle_multiturn/belle_multiturn.py
@@ -17,9 +17,9 @@ _CITATION = """\
 }
 """

-_HOMEPAGE = "{}/datasets/BelleGroup/multiturn_chat_0.8M".format(_HF_ENDPOINT)
+_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/BelleGroup/multiturn_chat_0.8M"
 _LICENSE = "gpl-3.0"
-_URL = "{}/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0.8M.json".format(_HF_ENDPOINT)
+_URL = f"{_HF_ENDPOINT}/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0.8M.json"


 class BelleMultiturn(datasets.GeneratorBasedBuilder):
@@ -38,7 +38,7 @@ class BelleMultiturn(datasets.GeneratorBasedBuilder):
        return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": file_path})]

    def _generate_examples(self, filepath: str):
-        with open(filepath, "r", encoding="utf-8") as f:
+        with open(filepath, encoding="utf-8") as f:
            for key, row in enumerate(f):
                data = json.loads(row)
                conversations = []
--- a/data/c4_demo.json
+++ b/data/c4_demo.json
--- a/data/dataset_info.json
+++ b/data/dataset_info.json
@@ -625,4 +625,4 @@
    },
    "folder": "python"
  }
-}
+}
--- a/data/dpo_en_demo.json
+++ b/data/dpo_en_demo.json
@@ -7223,4 +7223,4 @@
      "value": "Abstraction is a principle in object-oriented programming that refers to the process of focusing on the essential features of an object or concept without emphasizing the details that are not important for its functionality. It enables programmers to create classes that represent the objects they want to work with, and specifies only the behavior of those objects, leaving out unnecessary details. Abstraction helps to make the code more maintainable, modular, and scalable. It also improves the productivity of developers by reducing the amount of code they need to write."
    }
  }
-]
+]
--- a/data/dpo_zh_demo.json
+++ b/data/dpo_zh_demo.json
@@ -5055,4 +5055,4 @@
      "value": "C. 参与讨论"
    }
  }
-]
+]
--- a/data/glaive_toolcall_en_demo.json
+++ b/data/glaive_toolcall_en_demo.json
@@ -9155,4 +9155,4 @@
    ],
    "tools": "[]"
  }
-]
+]
--- a/data/glaive_toolcall_zh_demo.json
+++ b/data/glaive_toolcall_zh_demo.json
@@ -9019,4 +9019,4 @@
    ],
    "tools": "[]"
  }
-]
+]
--- a/data/hh_rlhf_en/hh_rlhf_en.py
+++ b/data/hh_rlhf_en/hh_rlhf_en.py
@@ -8,9 +8,9 @@ import datasets
 _HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
 _DESCRIPTION = "Human preference data about helpfulness and harmlessness."
 _CITATION = ""
-_HOMEPAGE = "{}/datasets/Anthropic/hh-rlhf".format(_HF_ENDPOINT)
+_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/Anthropic/hh-rlhf"
 _LICENSE = "mit"
-_URL = "{}/datasets/Anthropic/hh-rlhf/resolve/main/".format(_HF_ENDPOINT)
+_URL = f"{_HF_ENDPOINT}/datasets/Anthropic/hh-rlhf/resolve/main/"
 _URLS = {
    "train": [
        _URL + "harmless-base/train.jsonl.gz",
@@ -53,7 +53,7 @@ class HhRlhfEn(datasets.GeneratorBasedBuilder):
    def _generate_examples(self, filepaths: List[str]):
        key = 0
        for filepath in filepaths:
-            with open(filepath, "r", encoding="utf-8") as f:
+            with open(filepath, encoding="utf-8") as f:
                for row in f:
                    data = json.loads(row)
                    chosen = data["chosen"]
--- a/data/identity.json
+++ b/data/identity.json
@@ -454,4 +454,4 @@
    "input": "",
    "output": "抱歉，我不是 OpenAI 开发的 ChatGPT，我是 {{author}} 开发的 {{name}}，旨在为用户提供智能化的回答和帮助。"
  }
-]
+]
--- a/data/kto_en_demo.json
+++ b/data/kto_en_demo.json
@@ -5395,4 +5395,4 @@
    ],
    "label": false
  }
-]
+]
--- a/data/mllm_demo.json
+++ b/data/mllm_demo.json
@@ -137,4 +137,4 @@
      "mllm_demo_data/3.jpg"
    ]
  }
-]
+]
--- a/data/mllm_video_demo.json
+++ b/data/mllm_video_demo.json
@@ -44,4 +44,4 @@
      "mllm_demo_data/3.mp4"
    ]
  }
-]
+]
--- a/data/ultra_chat/ultra_chat.py
+++ b/data/ultra_chat/ultra_chat.py
@@ -20,9 +20,9 @@ _CITATION = """\
 }
 """

-_HOMEPAGE = "{}/datasets/stingning/ultrachat".format(_HF_ENDPOINT)
+_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/stingning/ultrachat"
 _LICENSE = "cc-by-nc-4.0"
-_BASE_DATA_URL = "{}/datasets/stingning/ultrachat/resolve/main/train_{{idx}}.jsonl".format(_HF_ENDPOINT)
+_BASE_DATA_URL = f"{_HF_ENDPOINT}/datasets/stingning/ultrachat/resolve/main/train_{{idx}}.jsonl"


 class UltraChat(datasets.GeneratorBasedBuilder):
@@ -42,7 +42,7 @@ class UltraChat(datasets.GeneratorBasedBuilder):

    def _generate_examples(self, filepaths: List[str]):
        for filepath in filepaths:
-            with open(filepath, "r", encoding="utf-8") as f:
+            with open(filepath, encoding="utf-8") as f:
                for row in f:
                    try:
                        data = json.loads(row)
--- a/data/wiki_demo.txt
+++ b/data/wiki_demo.txt