mirror of
https://github.com/hiyouga/LLaMA-Factory.git
synced 2025-12-16 20:00:36 +08:00
@@ -93,11 +93,13 @@ def get_dataset(
|
||||
dataset = dataset.rename_column(getattr(dataset_attr, column_name), column_name)
|
||||
|
||||
if dataset_attr.source_prefix: # add prefix
|
||||
features = None
|
||||
if data_args.streaming:
|
||||
features = dataset.features
|
||||
features["prefix"] = Value(dtype="string", id=None)
|
||||
dataset = dataset.map(lambda _: {"prefix": dataset_attr.source_prefix}, features=features)
|
||||
dataset = dataset.map(lambda _: {"prefix": dataset_attr.source_prefix}, features=features)
|
||||
else:
|
||||
prefix_data = [dataset_attr.source_prefix] * len(dataset)
|
||||
dataset = dataset.add_column("prefix", prefix_data)
|
||||
|
||||
all_datasets.append(dataset)
|
||||
|
||||
|
||||
@@ -19,7 +19,8 @@ def split_dataset(
|
||||
dataset = dataset.shuffle(buffer_size=data_args.buffer_size, seed=training_args.seed)
|
||||
return {"train_dataset": train_set, "eval_dataset": val_set}
|
||||
else:
|
||||
dataset = dataset.train_test_split(test_size=data_args.val_size, seed=training_args.seed)
|
||||
val_size = int(data_args.val_size) if data_args.val_size > 1 else data_args.val_size
|
||||
dataset = dataset.train_test_split(test_size=val_size, seed=training_args.seed)
|
||||
return {"train_dataset": dataset["train"], "eval_dataset": dataset["test"]}
|
||||
else:
|
||||
if data_args.streaming:
|
||||
|
||||
Reference in New Issue
Block a user