"""Label tables and helpers for classifying arXiv paper abstracts."""

import ast
import os

# Short category codes; a code's position in this list is its integer class id.
labels = ["CV", "AI", "ML", "NE", "CL"]

# Integer class id -> category code, and the inverse mapping.
id2label = {i: label for i, label in enumerate(labels)}
label2id = {label: i for i, label in enumerate(labels)}

# Category code -> human-readable category name.
category2human = {
    "CV": "Computer Vision",
    "AI": "Artificial Intelligence",
    "ML": "Machine Learning",
    "NE": "Neural and Evolutionary Computing",
    "CL": "Computation and Language",
}


def load_arxiv_dataset():
    """Download the arXiv abstracts CSV from Kaggle and load it as a dataset.

    The Kaggle dataset ``spsayakpaul/arxiv-paper-abstracts`` is downloaded
    with ``kagglehub`` and its ``arxiv_data.csv`` file is loaded through
    ``datasets.load_dataset`` (``train`` split). The ``terms`` column, which
    is stored in the CSV as the string form of a Python list, is parsed into
    an actual list for every row.

    Returns:
        The dataset produced by ``dataset.map`` with ``terms`` parsed.

    Raises:
        ValueError, SyntaxError: If a ``terms`` value is not a valid Python
            literal (propagated from ``ast.literal_eval``).
    """
    # Third-party imports are kept local so that importing this module (for
    # the label tables or ``create_prompt``) does not require these packages.
    import kagglehub
    from datasets import load_dataset

    # Download latest version
    path = kagglehub.dataset_download("spsayakpaul/arxiv-paper-abstracts")

    dataset = load_dataset(
        "csv",
        data_files=os.path.join(path, "arxiv_data.csv"),
        encoding="utf-8",
        split="train",
    )

    def parse_terms(example):
        """Replace the string-encoded ``terms`` field with the parsed list."""
        # literal_eval only accepts Python literals, so arbitrary code in the
        # CSV cannot be executed.
        example["terms"] = ast.literal_eval(example["terms"])
        return example

    return dataset.map(parse_terms)


def create_prompt(title: str, summary: str) -> str:
    """Create a prompt for the model from the title and summary.

    Args:
        title: Paper title, inserted under the ``# title:`` heading.
        summary: Paper abstract, inserted under the ``# abstract:`` heading.

    Returns:
        The prompt text: ``# title:``, the title, ``# abstract:`` and the
        summary, each on its own line.
    """
    return f"# title:\n{title}\n# abstract:\n{summary}"