import pandas as pd # Load the dataset df = pd.read_csv("books.csv") # Generate question-answer pairs qa_pairs = [] for _, row in df.iterrows(): title, author, year, summary = row["title"], row["author"], row["year"], row["summary"] # Add variations of questions and answers qa_pairs.extend([ {"question": f"Who wrote '{title}'?", "answer": author}, {"question": f"Can you tell me the author of {title}?", "answer": author}, {"question": f"Who is the author of {title}?", "answer": author}, {"question": f"Who is the writer of {title}?", "answer": author}, {"question": f"Please tell me the author of '{title}'.", "answer": author}, {"question": f"Who is the person behind '{title}'?", "answer": author} ]) # Variants of questions about the publication year qa_pairs.extend([ {"question": f"What year was {title} published?", "answer": year}, {"question": f"When was {title} written?", "answer": year}, {"question": f"In which year was {title} published?", "answer": year}, {"question": f"Can you tell me when {title} was released?", "answer": year}, {"question": f"Please tell me the year of {title}.", "answer": year}, {"question": f"What is the publication year of {title}?", "answer": year} ]) # Variants of questions about the summary qa_pairs.extend([ {"question": f"What is '{title}' about?", "answer": summary}, {"question": f"Can you summarize {title}?", "answer": summary}, {"question": f"Tell me the plot of {title}.", "answer": summary}, {"question": f"Give me a summary of {title}.", "answer": summary}, {"question": f"What is the main idea of {title}?", "answer": summary}, {"question": f"Explain what '{title}' is about.", "answer": summary} ]) # Combined questions about author and year qa_pairs.extend([ {"question": f"Who is the author and what is the year of '{title}'?", "answer": f"The author of {title} is {author} and it was published in {year}."}, {"question": f"What is the year and summary of {title}?", "answer": f"{title} was published in {year}. {summary}"}, {"question": f"Can you tell me the author and plot of {title}?", "answer": f"The author of {title} is {author}. The plot revolves around {summary}."} ]) # Save pairs to a CSV file qa_df = pd.DataFrame(qa_pairs, columns=["question", "answer"]) qa_df.to_csv("qa_dataset.csv", index=False) print(f"Generated {len(qa_pairs)} question-answer pairs.")