File size: 2,209 Bytes
506a510
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import pandas as pd
import pickle
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import xgboost as xgb
from config import (
    DATA_PATH, TEXT_COLUMN, LABEL_COLUMNS,
    TFIDF_MAX_FEATURES, NGRAM_RANGE, USE_STOPWORDS,
    RANDOM_STATE, TEST_SIZE,
    MODEL_SAVE_DIR, LABEL_ENCODERS_PATH, TFIDF_VECTORIZER_PATH
)

def load_data(path):
    """Read the CSV at *path* and keep only fully labelled rows.

    A row is discarded when its ``TEXT_COLUMN`` value or any of its
    ``LABEL_COLUMNS`` values is missing. The original row index is kept
    (it is not reset after dropping rows).

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        A ``pandas.DataFrame`` with the incomplete rows removed.
    """
    required_columns = [TEXT_COLUMN] + LABEL_COLUMNS
    frame = pd.read_csv(path)
    return frame.dropna(subset=required_columns)

def save_pickle(obj, path):
    """Pickle *obj* to the file at *path*, creating parent directories.

    Args:
        obj: The object to serialize with ``pickle.dump``.
        path: Destination file path. An existing file is overwritten.

    Raises:
        OSError: If a parent directory cannot be created or the file cannot
            be opened for writing.
        pickle.PicklingError: If *obj* cannot be pickled (pickle may also
            raise other exception types for unsupported objects).
    """
    parent_dir = os.path.dirname(path)
    # A bare filename has an empty dirname, and os.makedirs("") would raise,
    # so only create the directory when there actually is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(obj, f)

def train():
    """Train one XGBoost classifier per label column and save all artifacts.

    Pipeline:
      1. Load the CSV at ``DATA_PATH`` through ``load_data``.
      2. Fit a ``TfidfVectorizer`` on ``TEXT_COLUMN`` and pickle it to
         ``TFIDF_VECTORIZER_PATH``.
      3. For each column in ``LABEL_COLUMNS``: fit a ``LabelEncoder`` on the
         column, split the TF-IDF matrix and encoded labels with
         ``train_test_split``, and fit an ``XGBClassifier`` on the training
         portion.
      4. Pickle the ``{label: model}`` dict to
         ``MODEL_SAVE_DIR/xgb_models.pkl`` and the ``{label: encoder}`` dict
         to ``LABEL_ENCODERS_PATH``.

    All output directories are created before any work starts, so a missing
    directory cannot cause the trained models to be lost when they are
    written at the very end.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    models_path = os.path.join(MODEL_SAVE_DIR, "xgb_models.pkl")

    # Prepare every destination up front: the models are only written after
    # all training has finished, so failing there would waste the whole run.
    for out_path in (TFIDF_VECTORIZER_PATH, models_path, LABEL_ENCODERS_PATH):
        out_dir = os.path.dirname(out_path)
        if out_dir:  # a bare filename has no directory to create
            os.makedirs(out_dir, exist_ok=True)

    print(" Loading data...")
    df = load_data(DATA_PATH)
    X = df[TEXT_COLUMN]

    print(" Fitting TF-IDF...")
    stop_words = 'english' if USE_STOPWORDS else None
    tfidf = TfidfVectorizer(
        max_features=TFIDF_MAX_FEATURES,
        ngram_range=NGRAM_RANGE,
        stop_words=stop_words
    )
    # NOTE: the vectorizer is fitted on all rows, i.e. before the
    # train/test split performed for each label below.
    X_tfidf = tfidf.fit_transform(X)

    print(f" Saving TF-IDF vectorizer to {TFIDF_VECTORIZER_PATH}")
    save_pickle(tfidf, TFIDF_VECTORIZER_PATH)

    models = {}
    label_encoders = {}

    for label in LABEL_COLUMNS:
        print(f"\n Processing label: {label}")
        le = LabelEncoder()
        y = le.fit_transform(df[label])

        print(" Splitting train/test...")
        # Only the training portion is used; the held-out portion is not
        # evaluated anywhere in this function.
        X_train, _, y_train, _ = train_test_split(
            X_tfidf, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
        )

        print(" Training XGBoost model...")
        # NOTE(review): `use_label_encoder` appears to be deprecated in
        # recent xgboost releases -- confirm against the pinned version.
        model = xgb.XGBClassifier(
            use_label_encoder=False,
            eval_metric="mlogloss",
            random_state=RANDOM_STATE
        )
        model.fit(X_train, y_train)

        models[label] = model
        label_encoders[label] = le
        print(f" Finished training for: {label}")

    print(f"\n Saving all models to: {models_path}")
    save_pickle(models, models_path)

    print(f" Saving label encoders to: {LABEL_ENCODERS_PATH}")
    save_pickle(label_encoders, LABEL_ENCODERS_PATH)

    print("\n Training complete.")

# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    train()