| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | import argparse |
| |
|
| | import numpy as np |
| | import pandas as pd |
| |
|
| |
|
def construct_negatives(input_file: str, output_file: str, num_passages: int, num_negatives: int) -> None:
    """Write one line of randomly sampled negative passage ids per qrels row.

    The input is read as a headerless tab-separated file; column 0 is used as
    the query id and column 2 as the relevant (positive) passage id. For every
    row, ``num_negatives`` passage ids are drawn uniformly at random from
    ``[0, num_passages)`` and a tab-separated line
    ``query_id, rel_passage_id, neg_1, ..., neg_k`` is written to
    ``output_file``.

    Sampled ids that coincide with the row's positive passage are redrawn, so
    a passage is never listed as its own negative (unless ``num_passages`` is
    1, in which case no other id exists). Duplicates among the negatives
    themselves are still possible.

    Args:
        input_file: Path to the tab-separated qrels file.
        output_file: Path of the file to (over)write.
        num_passages: Exclusive upper bound of the passage id range.
        num_negatives: Number of negative ids to sample per row.
    """
    qrels = pd.read_csv(input_file, delimiter="\t", header=None)
    with open(output_file, "w") as f:
        # Iterate the two columns in lockstep instead of indexing row by row.
        for query_id, rel_passage_id in zip(qrels[0], qrels[2]):
            negatives = np.random.randint(num_passages, size=num_negatives).tolist()
            # With a single passage there is nothing else to draw, so only
            # reject collisions when an alternative id actually exists.
            if num_passages > 1:
                for pos, passage_id in enumerate(negatives):
                    while passage_id == rel_passage_id:
                        passage_id = int(np.random.randint(num_passages))
                    negatives[pos] = passage_id
            output_ids = [query_id, rel_passage_id, *negatives]
            print("\t".join(str(id_) for id_ in output_ids), file=f)
| |
|
| |
|
def main():
    """Parse command-line options and build negatives for both data splits.

    For each of the "train" and "dev" splits, reads
    ``<data>/qrels.<split>.tsv`` and writes
    ``<data>/query2passages.<split>.tsv`` via ``construct_negatives``.
    """
    cli = argparse.ArgumentParser(description="Negative passages construction")
    cli.add_argument("--data", type=str, default="msmarco_dataset", help="path to folder with data")
    cli.add_argument("--num_passages", type=int, default=8841823, help="total number of passages")
    cli.add_argument("--num_negatives", type=int, default=10, help="number of negatives per positive")
    options = cli.parse_args()

    data_dir = options.data
    for split in ("train", "dev"):
        qrels_path = f"{data_dir}/qrels.{split}.tsv"
        negatives_path = f"{data_dir}/query2passages.{split}.tsv"
        construct_negatives(
            input_file=qrels_path,
            output_file=negatives_path,
            num_passages=options.num_passages,
            num_negatives=options.num_negatives,
        )
| |
|
| |
|
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
| |
|