mirror of
				https://github.com/hiyouga/LLaMA-Factory.git
				synced 2025-11-04 18:02:19 +08:00 
			
		
		
		
	Follow HF_ENDPOINT environment variable
Former-commit-id: 22b36a3cfd2909cb624b1bb7385558eda504defe
This commit is contained in:
		
							parent
							
								
									e93c7cdb80
								
							
						
					
					
						commit
						6fc2d7e063
					
				@ -1,6 +1,8 @@
 | 
			
		||||
import os
 | 
			
		||||
import json
 | 
			
		||||
import datasets
 | 
			
		||||
 | 
			
		||||
_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
 | 
			
		||||
 | 
			
		||||
_DESCRIPTION = "BELLE multiturn chat dataset."
 | 
			
		||||
 | 
			
		||||
@ -13,9 +15,9 @@ _CITATION = """\
 | 
			
		||||
}
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
_HOMEPAGE = "https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M"
 | 
			
		||||
_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/BelleGroup/multiturn_chat_0.8M"
 | 
			
		||||
_LICENSE = "gpl-3.0"
 | 
			
		||||
_URL = "https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0.8M.json"
 | 
			
		||||
_URL = f"{_HF_ENDPOINT}/datasets/BelleGroup/multiturn_chat_0.8M/resolve/main/multiturn_chat_0.8M.json"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class BelleMultiturn(datasets.GeneratorBasedBuilder):
 | 
			
		||||
 | 
			
		||||
@ -1,13 +1,17 @@
 | 
			
		||||
import os
 | 
			
		||||
import json
 | 
			
		||||
import datasets
 | 
			
		||||
from typing import List
 | 
			
		||||
 | 
			
		||||
# Base URL of the Hugging Face hub used to build the dataset URLs below.
# Overridable through the HF_ENDPOINT environment variable (no leading
# underscore); falls back to the official hub when it is unset.
_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
 | 
			
		||||
 | 
			
		||||
_DESCRIPTION = "Human preference data about helpfulness and harmlessness."
 | 
			
		||||
 | 
			
		||||
_CITATION = ""
 | 
			
		||||
_HOMEPAGE = "https://huggingface.co/datasets/Anthropic/hh-rlhf"
 | 
			
		||||
 | 
			
		||||
_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/Anthropic/hh-rlhf"
 | 
			
		||||
_LICENSE = "mit"
 | 
			
		||||
_URL = "https://huggingface.co/datasets/Anthropic/hh-rlhf/resolve/main/"
 | 
			
		||||
_URL = f"{_HF_ENDPOINT}/datasets/Anthropic/hh-rlhf/resolve/main/"
 | 
			
		||||
_URLS = {
 | 
			
		||||
    "train": [
 | 
			
		||||
        _URL + "harmless-base/train.jsonl.gz",
 | 
			
		||||
 | 
			
		||||
@ -1,7 +1,9 @@
 | 
			
		||||
import os
 | 
			
		||||
import json
 | 
			
		||||
import datasets
 | 
			
		||||
from typing import List
 | 
			
		||||
 | 
			
		||||
# Base URL of the Hugging Face hub used to build the dataset URLs below.
# Overridable through the HF_ENDPOINT environment variable (no leading
# underscore); falls back to the official hub when it is unset.
_HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
 | 
			
		||||
 | 
			
		||||
_DESCRIPTION = "UltraChat: Large-scale, Informative, and Diverse Multi-round Dialogue Data."
 | 
			
		||||
 | 
			
		||||
@ -16,9 +18,9 @@ _CITATION = """\
 | 
			
		||||
}
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
_HOMEPAGE = "https://huggingface.co/datasets/stingning/ultrachat"
 | 
			
		||||
_HOMEPAGE = f"{_HF_ENDPOINT}/datasets/stingning/ultrachat"
 | 
			
		||||
_LICENSE = "cc-by-nc-4.0"
 | 
			
		||||
_BASE_DATA_URL = "https://huggingface.co/datasets/stingning/ultrachat/resolve/main/train_{idx}.jsonl"
 | 
			
		||||
_BASE_DATA_URL = "{_HF_ENDPOINT}/datasets/stingning/ultrachat/resolve/main/train_{idx}.jsonl"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class UltraChat(datasets.GeneratorBasedBuilder):
 | 
			
		||||
@ -38,7 +40,7 @@ class UltraChat(datasets.GeneratorBasedBuilder):
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    def _split_generators(self, dl_manager: datasets.DownloadManager):
 | 
			
		||||
        file_paths = [dl_manager.download(_BASE_DATA_URL.format(idx=idx)) for idx in range(10)] # multiple shards
 | 
			
		||||
        file_paths = [dl_manager.download(_BASE_DATA_URL.format(_HF_ENDPOINT=_HF_ENDPOINT,idx=idx)) for idx in range(10)] # multiple shards
 | 
			
		||||
        return [
 | 
			
		||||
            datasets.SplitGenerator(
 | 
			
		||||
                name=datasets.Split.TRAIN,
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user