# Spaces: Sleeping
"""Generate a question-answer dataset from a CSV catalogue of books.

Reads ``books.csv`` (columns ``title``, ``author``, ``year``, ``summary``),
expands every row into several phrasings of author / year / summary
questions, and writes the result to ``qa_dataset.csv``.
"""

import pandas as pd

# File names used by the script entry point.
INPUT_CSV = "books.csv"
OUTPUT_CSV = "qa_dataset.csv"

# Columns every input row must provide, in the order they are unpacked below.
BOOK_COLUMNS = ["title", "author", "year", "summary"]


def build_qa_pairs(df: pd.DataFrame) -> list:
    """Return the list of question-answer dicts generated from *df*.

    Each row of *df* yields 21 entries, in this order: 6 author questions,
    6 publication-year questions, 6 summary questions and 3 combined
    questions. Every entry is a dict with the keys ``"question"`` and
    ``"answer"``.

    Args:
        df: Frame containing at least the columns listed in ``BOOK_COLUMNS``.

    Returns:
        A flat list of ``{"question": ..., "answer": ...}`` dicts; empty when
        *df* has no rows.

    Raises:
        KeyError: If any of the required columns is missing from *df*.

    Note:
        No filtering of missing values is performed here; cells are
        interpolated into the templates as they are.
    """
    qa_pairs = []

    # itertuples over just the needed columns avoids building a Series per
    # row (as iterrows does) and keeps each column's own dtype.
    rows = df[BOOK_COLUMNS].itertuples(index=False, name=None)
    for title, author, year, summary in rows:
        # Variants of questions about the author.
        qa_pairs.extend([
            {"question": f"Who wrote '{title}'?", "answer": author},
            {"question": f"Can you tell me the author of {title}?", "answer": author},
            {"question": f"Who is the author of {title}?", "answer": author},
            {"question": f"Who is the writer of {title}?", "answer": author},
            {"question": f"Please tell me the author of '{title}'.", "answer": author},
            {"question": f"Who is the person behind '{title}'?", "answer": author},
        ])

        # Variants of questions about the publication year.
        qa_pairs.extend([
            {"question": f"What year was {title} published?", "answer": year},
            {"question": f"When was {title} written?", "answer": year},
            {"question": f"In which year was {title} published?", "answer": year},
            {"question": f"Can you tell me when {title} was released?", "answer": year},
            {"question": f"Please tell me the year of {title}.", "answer": year},
            {"question": f"What is the publication year of {title}?", "answer": year},
        ])

        # Variants of questions about the summary.
        qa_pairs.extend([
            {"question": f"What is '{title}' about?", "answer": summary},
            {"question": f"Can you summarize {title}?", "answer": summary},
            {"question": f"Tell me the plot of {title}.", "answer": summary},
            {"question": f"Give me a summary of {title}.", "answer": summary},
            {"question": f"What is the main idea of {title}?", "answer": summary},
            {"question": f"Explain what '{title}' is about.", "answer": summary},
        ])

        # Combined questions whose answers join two of the fields.
        qa_pairs.extend([
            {
                "question": f"Who is the author and what is the year of '{title}'?",
                "answer": f"The author of {title} is {author} and it was published in {year}.",
            },
            {
                "question": f"What is the year and summary of {title}?",
                "answer": f"{title} was published in {year}. {summary}",
            },
            {
                "question": f"Can you tell me the author and plot of {title}?",
                "answer": f"The author of {title} is {author}. The plot revolves around {summary}.",
            },
        ])

    return qa_pairs


def main() -> None:
    """Load the book catalogue, generate the Q/A pairs and save them as CSV."""
    # Load the dataset.
    df = pd.read_csv(INPUT_CSV)

    # Generate question-answer pairs.
    qa_pairs = build_qa_pairs(df)

    # Save pairs to a CSV file; explicit columns keep the header stable even
    # when no pairs were generated.
    qa_df = pd.DataFrame(qa_pairs, columns=["question", "answer"])
    qa_df.to_csv(OUTPUT_CSV, index=False)
    print(f"Generated {len(qa_pairs)} question-answer pairs.")


if __name__ == "__main__":
    main()