File size: 1,112 Bytes
2904d0e
7c398ad
2904d0e
 
 
7c398ad
2904d0e
 
 
 
7c398ad
2904d0e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7c398ad
2904d0e
 
 
 
 
 
 
 
 
 
 
7c398ad
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# Category codes handled by this module; a code's list position is its class id.
labels = ["CV", "AI", "ML", "NE", "CL"]

# Two-way lookup between integer class ids and category codes.
id2label = dict(enumerate(labels))
label2id = {code: idx for idx, code in id2label.items()}

# Human-readable name for each category code.
category2human = {
    "CV": "Computer Vision",
    "AI": "Artificial Intelligence",
    "ML": "Machine Learning",
    "NE": "Neural and Evolutionary Computing",
    "CL": "Computation and Language",
}


def load_arxiv_dataset():
    """
    Download the arXiv paper-abstracts dataset via kagglehub and load it.

    Reads ``arxiv_data.csv`` from the downloaded location as the "train"
    split, then replaces each row's ``terms`` string with the Python
    literal it encodes (presumably a list of category terms).

    Returns the mapped dataset.
    """
    import kagglehub
    import ast
    import os
    from datasets import load_dataset

    def _decode_terms(row):
        # "terms" arrives as the string form of a Python literal; parse it
        # with literal_eval, which only accepts literals (no code execution).
        row["terms"] = ast.literal_eval(row["terms"])
        return row

    # Fetch the latest published version of the Kaggle dataset.
    download_dir = kagglehub.dataset_download("spsayakpaul/arxiv-paper-abstracts")
    csv_file = os.path.join(download_dir, "arxiv_data.csv")

    raw = load_dataset("csv", data_files=csv_file, encoding="utf-8", split="train")
    return raw.map(_decode_terms)


def create_prompt(title, summary):
    """
    Build the model prompt from a paper's title and summary.

    The result has four lines: a "# title:" header, the title, an
    "# abstract:" header, and the summary.
    """
    prompt_lines = ["# title:", f"{title}", "# abstract:", f"{summary}"]
    return "\n".join(prompt_lines)