Spaces:
No application file
No application file
from datasets import Dataset | |
from typing import Dict | |
def format_prompt(example: Dict[str, str]) -> Dict[str, str]:
    """Normalize one record into a ``prompt`` / ``completion`` pair.

    Two record layouts are accepted, checked in this order:

    * ``instruction`` / ``output`` with an optional ``input``: the prompt is
      built from an ``### Instruction:`` section, an ``### Input:`` section
      (only when the input contains non-whitespace text) and a trailing
      ``### Response:`` header; ``output`` becomes the completion.
    * ``prompt`` / ``completion``: both fields are passed through unchanged
      and every other key is dropped.

    Args:
        example: A single record keyed by field name.

    Returns:
        A dict with exactly the keys ``"prompt"`` and ``"completion"``.

    Raises:
        ValueError: If the record matches neither layout.
    """
    if "instruction" in example and "output" in example:
        instruction = example["instruction"]
        # A null "input" (key present, value None) is treated like a missing
        # one, so .strip() is never called on None.
        input_text = example.get("input") or ""

        sections = [f"### Instruction:\n{instruction}\n\n"]
        if input_text.strip():
            sections.append(f"### Input:\n{input_text}\n\n")
        sections.append("### Response:\n")
        return {"prompt": "".join(sections), "completion": example["output"]}

    if "prompt" in example and "completion" in example:
        return {"prompt": example["prompt"], "completion": example["completion"]}

    # Message reads "Unsupported data format: <record>".
    raise ValueError(f"지원하지 않는 데이터 형식: {example}")
def preprocess(dataset):
    """Return *dataset* in ``prompt`` / ``completion`` column form.

    Args:
        dataset: Object exposing ``column_names`` and ``map``
            (presumably a ``datasets.Dataset`` -- TODO confirm against callers).

    Returns:
        ``dataset`` itself when it already has both ``prompt`` and
        ``completion`` columns; otherwise, when it has ``instruction`` and
        ``output`` columns, the result of ``dataset.map`` with
        ``format_prompt`` applied and all original column names passed as
        ``remove_columns``.

    Raises:
        ValueError: If neither column pair is present.
    """
    # Inspect the dataset's columns to decide which layout it uses.
    column_names = dataset.column_names

    if all(k in column_names for k in ("prompt", "completion")):
        return dataset  # already in the target layout; use as-is

    if all(k in column_names for k in ("instruction", "output")):
        return dataset.map(format_prompt, remove_columns=column_names)

    # Message reads "Unsupported column layout: <column names>".
    raise ValueError(f"지원하지 않는 열 구성: {column_names}")
""" | |
# Check the output
print(processed_dataset[0]) # input_ids , attention_mask , labels | |
print("111") | |
print(tokenized_dataset[0]) | |
""" |