fix streaming in pt stage #548 #549

This commit is contained in:
hiyouga
2023-08-17 17:59:26 +08:00
parent ff0aa793b6
commit b0ed0dec5e
4 changed files with 43 additions and 40 deletions

View File

@@ -1,4 +1,7 @@
from typing import TYPE_CHECKING, Dict, Union
import hashlib
from typing import TYPE_CHECKING, Dict, List, Optional, Union
from llmtuner.extras.logging import get_logger
if TYPE_CHECKING:
from datasets import Dataset, IterableDataset
@@ -6,6 +9,32 @@ if TYPE_CHECKING:
from llmtuner.hparams import DataArguments
logger = get_logger(__name__)
EXT2TYPE = {
"csv": "csv",
"json": "json",
"jsonl": "json",
"txt": "text"
}
def checksum(data_files: List[str], file_sha1: Optional[str] = None) -> None:
if file_sha1 is None:
logger.warning("Checksum failed: missing SHA-1 hash value in dataset_info.json.")
return
if len(data_files) != 1:
logger.warning("Checksum failed: too many files.")
return
with open(data_files[0], "rb") as f:
sha1 = hashlib.sha1(f.read()).hexdigest()
if sha1 != file_sha1:
logger.warning("Checksum failed: mismatched SHA-1 hash value at {}.".format(data_files[0]))
def split_dataset(
dataset: Union["Dataset", "IterableDataset"],
data_args: "DataArguments",