from datasets import load_dataset

# Load the WikiLingua dataset (GEM release)
dataset = load_dataset("GEM/wiki_lingua", "en")           # English articles -> English summaries
dataset_fr = load_dataset("GEM/wiki_lingua", "fr")        # French articles -> French summaries
dataset_cross = load_dataset("GEM/wiki_lingua", "fr_en")  # French articles -> English summaries
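# Quick inspection sketch (not part of the original script): each call above
# returns a DatasetDict, so printing it shows the splits and column names used
# below ("source", "source_language", "target_language").
print(dataset)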
train = dataset["train"]
val = dataset["validation"]
test = dataset["test"]

train_fr = dataset_fr["train"]
val_fr = dataset_fr["validation"]
test_fr = dataset_fr["test"]

train_cross = dataset_cross["train"]
val_cross = dataset_cross["validation"]
test_cross = dataset_cross["test"]
# Keep only French -> English pairs in the cross-lingual splits
def keep_fr_to_en(example):
    return example["source_language"] == "fr" and example["target_language"] == "en"
train_cross = train_cross.filter(keep_fr_to_en)
val_cross = val_cross.filter(keep_fr_to_en)
test_cross = test_cross.filter(keep_fr_to_en)
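# Optional sanity check (a minimal sketch): confirm that only French -> English
# pairs remain after the filter above.
print(train_cross[0]["source_language"], train_cross[0]["target_language"])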
# Keep only short articles: source text of at most 512 characters
# (len() on a string counts characters, not tokens)
max_length = 512
def dataset_sample(dataset):
    return dataset.filter(lambda example: len(example["source"]) <= max_length)
train = dataset_sample(train)
val = dataset_sample(val)
test = dataset_sample(test)

train_fr = dataset_sample(train_fr)
val_fr = dataset_sample(val_fr)
test_fr = dataset_sample(test_fr)

train_cross = dataset_sample(train_cross)
val_cross = dataset_sample(val_cross)
test_cross = dataset_sample(test_cross)
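# Note: the character-length filter above is only a rough proxy for token count.
# A sketch of true token-based filtering, assuming the `transformers` package and
# an illustrative tokenizer checkpoint (neither is part of the original script);
# it is defined here but not applied to the splits below.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")  # assumed checkpoint

def dataset_sample_tokens(dataset):
    # Keep only examples whose tokenized source fits within max_length tokens.
    return dataset.filter(
        lambda example: len(tokenizer(example["source"])["input_ids"]) <= max_length
    )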
# Write each filtered split to a CSV file
train.to_csv("train.csv")
val.to_csv("val.csv")
test.to_csv("test.csv")

train_fr.to_csv("train_fr.csv")
val_fr.to_csv("val_fr.csv")
test_fr.to_csv("test_fr.csv")

train_cross.to_csv("train_cross.csv")
val_cross.to_csv("val_cross.csv")
test_cross.to_csv("test_cross.csv")
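# Sketch of reloading the exported files later with the generic `csv` loader
# (shown for the English splits; file names match the calls above):
reloaded = load_dataset(
    "csv",
    data_files={"train": "train.csv", "validation": "val.csv", "test": "test.csv"},
)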