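"""Project-wide configuration for the `turing` package: filesystem paths,
dataset constants, and the registry of available models."""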
from pathlib import Path

from dotenv import load_dotenv
from loguru import logger

# Load environment variables from .env file if it exists
load_dotenv()

# Paths
PROJ_ROOT = Path(__file__).resolve().parents[1]  # repo root, assuming this file sits one level below it
logger.info(f"PROJ_ROOT path is: {PROJ_ROOT}")

DATA_DIR = PROJ_ROOT / "data"
RAW_DATA_DIR = DATA_DIR / "raw"
INTERIM_DATA_DIR = DATA_DIR / "interim"
PROCESSED_DATA_DIR = DATA_DIR / "processed"
EXTERNAL_DATA_DIR = DATA_DIR / "external"

MODELS_DIR = PROJ_ROOT / "models"

REPORTS_DIR = PROJ_ROOT / "reports"
FIGURES_DIR = REPORTS_DIR / "figures"
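
# None of these directories are created at import time; a fresh checkout can
# bootstrap them with a sketch like the following (shown as a comment so that
# importing this config stays side-effect free):
#
#   for d in (RAW_DATA_DIR, INTERIM_DATA_DIR, PROCESSED_DATA_DIR,
#             EXTERNAL_DATA_DIR, MODELS_DIR, FIGURES_DIR):
#       d.mkdir(parents=True, exist_ok=True)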

# Dataset
DATASET_HF_ID = "NLBSE/nlbse26-code-comment-classification"
LANGS = ["java", "python", "pharo"]
INPUT_COLUMN = "combo"
LABEL_COLUMN = "labels"
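
# A minimal sketch of how these constants are meant to be consumed; the split
# naming ("java_train" etc.) is an assumption about the Hugging Face dataset
# layout, not something this module verifies:
#
#   from datasets import load_dataset
#
#   ds = load_dataset(DATASET_HF_ID)
#   texts = ds["java_train"][INPUT_COLUMN]   # assumed split name
#   labels = ds["java_train"][LABEL_COLUMN]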

# Per-language category names, copied verbatim from the dataset; the mixed
# casing and the spelling "rational" are the dataset's own label strings.
LABELS_MAP = {
    "java": ["summary", "Ownership", "Expand", "usage", "Pointer", "deprecation", "rational"],
    "python": ["Usage", "Parameters", "DevelopmentNotes", "Expand", "Summary"],
    "pharo": [
        "Keyimplementationpoints",
        "Example",
        "Responsibilities",
        "Intent",
        "Keymessages",
        "Collaborators",
    ],
}

TOTAL_CATEGORIES = sum(len(v) for v in LABELS_MAP.values())  # 7 + 5 + 6 = 18

# Score parameters
MAX_AVG_RUNTIME = 5.0  # seconds
MAX_AVG_FLOPS = 5000.0  # GFLOPS
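
# Illustrative only: one way the caps above could normalize a combined score.
# The weights (0.6/0.2/0.2) and the exact formula are placeholders here, not
# the official competition scoring:
#
#   def submission_score(avg_f1: float, avg_runtime_s: float, avg_gflops: float) -> float:
#       runtime_term = max(0.0, (MAX_AVG_RUNTIME - avg_runtime_s) / MAX_AVG_RUNTIME)
#       flops_term = max(0.0, (MAX_AVG_FLOPS - avg_gflops) / MAX_AVG_FLOPS)
#       return 0.6 * avg_f1 + 0.2 * runtime_term + 0.2 * flops_term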

# Training parameters
DEFAULT_BATCH_SIZE = 32
DEFAULT_NUM_ITERATIONS = 20

# Model configuration mapping
MODEL_CONFIG = {
    "codeberta": {
        "model_name": "fine-tuned-CodeBERTa",
        "exp_name": "fine-tuned-CodeBERTa",
        "model_class_module": "turing.modeling.models.codeBerta",
        "model_class_name": "CodeBERTa",
    },
    "graphcodebert": {
        "model_name": "GraphCodeBERT",
        "exp_name": "fine-tuned-GraphCodeBERT",
        "model_class_module": "turing.modeling.models.graphCodeBert",
        "model_class_name": "GraphCodeBERTClassifier",
    },
    "tinybert": {
        "model_name": "TinyBERT",
        "exp_name": "fine-tuned-TinyBERT",
        "model_class_module": "turing.modeling.models.tinyBert",
        "model_class_name": "TinyBERTClassifier",
    },
    "randomforest": {
        "model_name": "RandomForest-TfIdf",
        "exp_name": "RandomForest-TfIdf",
        "model_class_module": "turing.modeling.models.randomForestTfIdf",
        "model_class_name": "RandomForestTfIdf",
    },
}
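
# The registry above separates model identity from implementation so callers
# can load classes dynamically; a minimal consumer sketch (the project's real
# loader may differ):
#
#   import importlib
#
#   def load_model_class(key: str):
#       cfg = MODEL_CONFIG[key]
#       module = importlib.import_module(cfg["model_class_module"])
#       return getattr(module, cfg["model_class_name"])
#
#   ModelClass = load_model_class("tinybert")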

# Existing model modules
EXISTING_MODELS = [
    "randomForestTfIdf",
    "codeBerta",
]

# If tqdm is installed, configure loguru with tqdm.write
# https://github.com/Delgan/loguru/issues/135
try:
    from tqdm import tqdm

    logger.remove(0)
    logger.add(lambda msg: tqdm.write(msg, end=""), colorize=True)
except (ModuleNotFoundError, ValueError):
    pass
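
# With the handler above, log records are routed through tqdm.write, so
# logging during a loop no longer garbles an active progress bar, e.g.:
#
#   for item in tqdm(range(100)):
#       logger.debug(f"processing {item}")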