# tfidf_based_models/tfidf_lgbm.py
from sklearn.feature_extraction.text import TfidfVectorizer
import lightgbm as lgb
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import numpy as np
import joblib
import os

from config import TEXT_COLUMN, LABEL_COLUMNS, TFIDF_MAX_FEATURES, MODEL_SAVE_DIR


class TfidfLightGBM:
    """
    TF-IDF based LightGBM model for multi-output classification.

    Trains a separate LightGBM classifier for each target label after
    converting the text data into TF-IDF features.
    """

    def __init__(self, label_encoders):
        """
        Initializes the TfidfLightGBM model.

        Args:
            label_encoders (dict): A dictionary of fitted LabelEncoder objects,
                keyed by label column name.
        """
        self.label_encoders = label_encoders
        self.models = {}  # Stores the trained Pipeline for each label

    def train(self, X_train_text, y_train_df):
        """
        Trains a TF-IDF + LightGBM pipeline for each label.

        Args:
            X_train_text (pd.Series): Training text data.
            y_train_df (pd.DataFrame): DataFrame of training labels (encoded).
        """
        print("Training TF-IDF + LightGBM models...")
        for col in LABEL_COLUMNS:
            print(f"  Training for {col}...")
            num_classes = len(self.label_encoders[col].classes_)

            # Determine the LightGBM objective from the number of classes.
            objective = 'multiclass' if num_classes > 2 else 'binary'
            # The `num_class` parameter is only needed for the 'multiclass' objective.
            num_class_param = {'num_class': num_classes} if num_classes > 2 else {}

            pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)),
                ('lgbm', lgb.LGBMClassifier(
                    objective=objective,
                    **num_class_param,
                    random_state=42,
                    n_estimators=100
                ))
            ])

            # Fit the vectorizer and classifier together on the raw text.
            # Class imbalance is not handled explicitly here; LGBMClassifier
            # accepts `class_weight='balanced'`, or `is_unbalance` /
            # `scale_pos_weight` for binary objectives, if that is needed.
            pipeline.fit(X_train_text, y_train_df[col])
            self.models[col] = pipeline
        print("TF-IDF + LightGBM training complete.")

    def predict(self, X_test_text):
        """
        Makes class predictions for all labels.

        Args:
            X_test_text (pd.Series): Test text data.

        Returns:
            dict: A dictionary mapping label names to NumPy arrays of
                predicted class indices.
        """
        predictions = {}
        for col, model_pipeline in self.models.items():
            predictions[col] = model_pipeline.predict(X_test_text)
        return predictions

    def predict_proba(self, X_test_text):
        """
        Returns prediction probabilities for each class for all labels.

        Args:
            X_test_text (pd.Series): Test text data.

        Returns:
            list: A list of NumPy arrays, one per label column, each holding
                the probability distribution over classes for every sample.
        """
        probabilities = []
        for col in LABEL_COLUMNS:
            if col in self.models:
                probabilities.append(self.models[col].predict_proba(X_test_text))
            else:
                print(f"Warning: Model for {col} not found, cannot predict probabilities.")
                probabilities.append(np.array([]))
        return probabilities

    def evaluate(self, X_test_text, y_test_df):
        """
        Evaluates the models and returns classification reports.

        Args:
            X_test_text (pd.Series): Test text data.
            y_test_df (pd.DataFrame): DataFrame of true test labels (encoded).

        Returns:
            tuple: A tuple containing:
                - reports (dict): Classification reports for each label column.
                - truths (list): List of true label arrays.
                - preds (list): List of predicted label arrays.
        """
        reports = {}
        truths = [[] for _ in range(len(LABEL_COLUMNS))]
        preds = [[] for _ in range(len(LABEL_COLUMNS))]

        for i, col in enumerate(LABEL_COLUMNS):
            if col in self.models:
                y_pred = self.models[col].predict(X_test_text)
                y_true = y_test_df[col].values
                try:
                    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
                    reports[col] = report
                except ValueError:
                    print(f"Warning: Could not generate classification report for {col}. Skipping.")
                    reports[col] = {'accuracy': 0,
                                    'weighted avg': {'precision': 0, 'recall': 0, 'f1-score': 0, 'support': 0}}
                truths[i].extend(y_true)
                preds[i].extend(y_pred)
            else:
                print(f"Warning: Model for {col} not found for evaluation.")
        return reports, truths, preds

    def save_model(self, model_name="tfidf_lgbm", save_format='pickle'):
        """
        Saves the trained TF-IDF LightGBM models.

        Args:
            model_name (str): The base name for the saved model file.
            save_format (str): Format to save the model in; only 'pickle' is supported.
        """
        if save_format != 'pickle':
            raise ValueError("TF-IDF models only support 'pickle' format")
        os.makedirs(MODEL_SAVE_DIR, exist_ok=True)  # Ensure the save directory exists
        save_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")
        joblib.dump(self.models, save_path)
        print(f"TF-IDF LightGBM models saved to {save_path}")

    def load_model(self, model_name="tfidf_lgbm"):
        """
        Loads trained TF-IDF LightGBM models from a file.

        Args:
            model_name (str): The base name of the model file to load.
        """
        load_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")
        if os.path.exists(load_path):
            self.models = joblib.load(load_path)
            print(f"TF-IDF LightGBM models loaded from {load_path}")
        else:
            print(f"Error: Model file not found at {load_path}. Initializing empty model dictionary.")
            self.models = {}