"""Train a TF-IDF + multi-output logistic-regression text classifier.

Reads the dataset at ``DATA_PATH``, label-encodes every column listed in
``LABEL_COLUMNS``, fits a ``TfidfVectorizer`` -> ``MultiOutputClassifier``
pipeline on the training split, reports per-label accuracy on the held-out
split, and persists the pipeline, the label encoders and the fitted
vectorizer with joblib.
"""

import os

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

from config import (
    DATA_PATH,
    TEXT_COLUMN,
    LABEL_COLUMNS,
    MODEL_SAVE_DIR,
    LABEL_ENCODERS_PATH,
    TFIDF_MAX_FEATURES,
    NGRAM_RANGE,
    USE_STOPWORDS,
    RANDOM_STATE,
    TEST_SIZE,
)

# Normalise to a list so list concatenation and DataFrame column selection
# behave the same whether the config declares a list or a tuple.
label_columns = list(LABEL_COLUMNS)

# Ensure required directories are created. The label encoders are written to
# their own path, so its parent directory is created as well (skipped when
# the path has no directory component, because os.makedirs("") raises).
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
encoders_dir = os.path.dirname(LABEL_ENCODERS_PATH)
if encoders_dir:
    os.makedirs(encoders_dir, exist_ok=True)

print(" Loading dataset...")
df = pd.read_csv(DATA_PATH)
# Rows missing the text or any label cannot be used for training.
df = df.dropna(subset=[TEXT_COLUMN, *label_columns])

# Encode each target label; the fitted encoders are kept so they can be
# saved alongside the model.
label_encoders = {}
for col in label_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

X = df[TEXT_COLUMN]
Y = df[label_columns]

print("✂️ Splitting train/test...")
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

print(" Building pipeline...")
stop_words = "english" if USE_STOPWORDS else None
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=TFIDF_MAX_FEATURES,
        ngram_range=NGRAM_RANGE,
        stop_words=stop_words,
    )),
    ('clf', MultiOutputClassifier(
        LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
    )),
])

print(" Training model...")
pipeline.fit(X_train, y_train)

# Use the held-out split that was previously computed but never consumed:
# predictions come back as one column per label, in label_columns order.
y_pred = pipeline.predict(X_test)
per_label_accuracy = (y_pred == y_test.to_numpy()).mean(axis=0)
for col, accuracy in zip(label_columns, per_label_accuracy):
    print(f"Held-out accuracy for {col}: {accuracy:.4f}")

# Save full pipeline
model_path = os.path.join(MODEL_SAVE_DIR, "logreg_model.pkl")
print(f" Saving model to {model_path}")
joblib.dump(pipeline, model_path)

# Save label encoders
print(f" Saving label encoders to {LABEL_ENCODERS_PATH}")
joblib.dump(label_encoders, LABEL_ENCODERS_PATH)

# Save TF-IDF separately (optional)
tfidf_path = os.path.join(MODEL_SAVE_DIR, "tfidf_vectorizer.pkl")
print(f" Saving TF-IDF vectorizer to {tfidf_path}")
joblib.dump(pipeline.named_steps["tfidf"], tfidf_path)

print(" Training complete.")