"""Train one XGBoost classifier per label column on TF-IDF text features.

Running this module as a script loads the CSV at ``DATA_PATH``, fits a single
TF-IDF vectorizer on ``TEXT_COLUMN``, trains an independent ``XGBClassifier``
for every column in ``LABEL_COLUMNS`` and pickles the vectorizer, the models
and the per-label ``LabelEncoder`` objects.
"""

import os
import pickle

import pandas as pd
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from config import (
    DATA_PATH,
    TEXT_COLUMN,
    LABEL_COLUMNS,
    TFIDF_MAX_FEATURES,
    NGRAM_RANGE,
    USE_STOPWORDS,
    RANDOM_STATE,
    TEST_SIZE,
    MODEL_SAVE_DIR,
    LABEL_ENCODERS_PATH,
    TFIDF_VECTORIZER_PATH,
)


def load_data(path) -> pd.DataFrame:
    """Read the CSV at *path* and drop rows lacking the text or any label.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded frame without rows that have a missing value in
        ``TEXT_COLUMN`` or in any of ``LABEL_COLUMNS``.
    """
    df = pd.read_csv(path)
    # Unpacking (rather than ``[TEXT_COLUMN] + LABEL_COLUMNS``) also works
    # when LABEL_COLUMNS is configured as a tuple instead of a list.
    required_columns = [TEXT_COLUMN, *LABEL_COLUMNS]
    df.dropna(subset=required_columns, inplace=True)
    return df


def save_pickle(obj, path) -> None:
    """Pickle *obj* to *path*, creating the parent directory if needed.

    Args:
        obj: Any picklable object.
        path: Destination file path.
    """
    parent_dir = os.path.dirname(path)
    if parent_dir:
        # Without this, a missing output directory would only surface as a
        # FileNotFoundError after the (expensive) training has finished.
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(obj, f)


def _fit_vectorizer(texts):
    """Fit a TF-IDF vectorizer on *texts*; return it with the feature matrix."""
    stop_words = 'english' if USE_STOPWORDS else None
    tfidf = TfidfVectorizer(
        max_features=TFIDF_MAX_FEATURES,
        ngram_range=NGRAM_RANGE,
        stop_words=stop_words,
    )
    features = tfidf.fit_transform(texts)
    return tfidf, features


def _train_label_model(features, raw_labels):
    """Encode *raw_labels*, fit a classifier and score it on the hold-out split.

    Returns:
        A ``(model, label_encoder, holdout_accuracy)`` tuple.
    """
    le = LabelEncoder()
    y = le.fit_transform(raw_labels)

    print(" Splitting train/test...")
    X_train, X_test, y_train, y_test = train_test_split(
        features, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    print(" Training XGBoost model...")
    # NOTE(review): ``use_label_encoder`` appears to be deprecated in recent
    # XGBoost releases -- confirm against the pinned version before removing.
    model = xgb.XGBClassifier(
        use_label_encoder=False,
        eval_metric="mlogloss",
        random_state=RANDOM_STATE,
    )
    model.fit(X_train, y_train)

    # The hold-out split was previously created but never used. Note that
    # the vectorizer is fit on the full corpus before the split, so this
    # figure is a slightly optimistic estimate.
    holdout_accuracy = model.score(X_test, y_test)
    return model, le, holdout_accuracy


def train() -> None:
    """Run the full pipeline: load data, vectorize, train and persist.

    Side effects: prints progress to stdout and writes three pickle files
    (``TFIDF_VECTORIZER_PATH``, ``xgb_models.pkl`` inside ``MODEL_SAVE_DIR``
    and ``LABEL_ENCODERS_PATH``).
    """
    print(" Loading data...")
    df = load_data(DATA_PATH)

    print(" Fitting TF-IDF...")
    tfidf, X_tfidf = _fit_vectorizer(df[TEXT_COLUMN])

    print(f" Saving TF-IDF vectorizer to {TFIDF_VECTORIZER_PATH}")
    save_pickle(tfidf, TFIDF_VECTORIZER_PATH)

    models = {}
    label_encoders = {}

    for label in LABEL_COLUMNS:
        print(f"\n Processing label: {label}")
        model, le, holdout_accuracy = _train_label_model(X_tfidf, df[label])
        models[label] = model
        label_encoders[label] = le
        print(f" Holdout accuracy for {label}: {holdout_accuracy:.4f}")
        print(f" Finished training for: {label}")

    models_path = os.path.join(MODEL_SAVE_DIR, "xgb_models.pkl")
    print(f"\n Saving all models to: {models_path}")
    save_pickle(models, models_path)

    print(f" Saving label encoders to: {LABEL_ENCODERS_PATH}")
    save_pickle(label_encoders, LABEL_ENCODERS_PATH)

    print("\n Training complete.")


if __name__ == "__main__":
    train()