Spaces:

point9
/

LGBM

Sleeping

File size: 6,536 Bytes

22539ef

# tfidf_based_models/tfidf_lgbm.py

from sklearn.feature_extraction.text import TfidfVectorizer
import lightgbm as lgb
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import pandas as pd
import joblib
import os

from config import TEXT_COLUMN, LABEL_COLUMNS, TFIDF_MAX_FEATURES, MODEL_SAVE_DIR

class TfidfLightGBM:
    """
    TF-IDF based LightGBM model for multi-output classification.

    One independent ``TfidfVectorizer`` -> ``LGBMClassifier`` pipeline is
    trained for every entry of ``LABEL_COLUMNS``. The fitted pipelines are
    kept in ``self.models``, keyed by label column name.
    """

    def __init__(self, label_encoders):
        """
        Initializes the TfidfLightGBM model.

        Args:
            label_encoders (dict): Maps each label column name to an object
                exposing a ``classes_`` attribute (e.g. a fitted
                LabelEncoder); its length decides the LightGBM objective.
        """
        self.label_encoders = label_encoders
        self.models = {}  # label column name -> trained Pipeline

    @staticmethod
    def _build_pipeline(num_classes):
        """Builds an unfitted TF-IDF + LightGBM pipeline for `num_classes` classes."""
        lgbm_params = {
            'objective': 'binary',
            'random_state': 42,
            'n_estimators': 100,
        }
        if num_classes > 2:
            # `num_class` is required by LightGBM's 'multiclass' objective.
            lgbm_params['objective'] = 'multiclass'
            lgbm_params['num_class'] = num_classes

        return Pipeline([
            ('tfidf', TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)),
            ('lgbm', lgb.LGBMClassifier(**lgbm_params)),
        ])

    @staticmethod
    def _model_path(model_name):
        """Returns the pickle path used by both save_model and load_model."""
        return os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")

    def train(self, X_train_text, y_train_df):
        """
        Trains a TF-IDF + LightGBM pipeline for each label.

        Each trained pipeline is stored in ``self.models[col]``, replacing any
        previous entry for that column.

        Args:
            X_train_text (pd.Series): Training text data.
            y_train_df (pd.DataFrame): DataFrame of training labels (encoded);
                must contain every column listed in LABEL_COLUMNS.

        Raises:
            KeyError: If a label column has no entry in ``self.label_encoders``
                or is missing from ``y_train_df``.
        """
        print("Training TF-IDF + LightGBM models...")
        for col in LABEL_COLUMNS:
            print(f"  Training for {col}...")
            num_classes = len(self.label_encoders[col].classes_)
            pipeline = self._build_pipeline(num_classes)
            # NOTE: no class re-weighting is applied here. LightGBM only
            # compensates for class imbalance when `class_weight`,
            # `is_unbalance` or `scale_pos_weight` is set explicitly; the
            # default settings used above treat every sample equally.
            pipeline.fit(X_train_text, y_train_df[col])
            self.models[col] = pipeline
        print("TF-IDF + LightGBM training complete.")

    def predict(self, X_test_text):
        """
        Makes class predictions for all labels that have a trained model.

        Args:
            X_test_text (pd.Series): Test text data.

        Returns:
            dict: Keys are label names (in the order the models were stored)
                  and values are the outputs of each pipeline's ``predict``.
        """
        return {
            col: model_pipeline.predict(X_test_text)
            for col, model_pipeline in self.models.items()
        }

    def predict_proba(self, X_test_text):
        """
        Returns prediction probabilities for each class for all labels.

        Args:
            X_test_text (pd.Series): Test text data.

        Returns:
            list: One entry per column in LABEL_COLUMNS, in that order. Each
                  entry is the pipeline's ``predict_proba`` output, or an empty
                  NumPy array when no model exists for that column.
        """
        probabilities = []
        for col in LABEL_COLUMNS:
            model_pipeline = self.models.get(col)
            if model_pipeline is None:
                print(f"Warning: Model for {col} not found, cannot predict probabilities.")
                # Keep positional alignment with LABEL_COLUMNS.
                probabilities.append(np.array([]))
                continue
            probabilities.append(model_pipeline.predict_proba(X_test_text))
        return probabilities

    def evaluate(self, X_test_text, y_test_df):
        """
        Evaluates the models and returns classification reports.

        Args:
            X_test_text (pd.Series): Test text data.
            y_test_df (pd.DataFrame): DataFrame of true test labels (encoded).

        Returns:
            tuple: A tuple containing:
                - reports (dict): Classification report (as a dict) for each
                  label column that has a model. If the report cannot be
                  computed, a zero-filled placeholder is stored instead.
                - truths (list): One list of true labels per LABEL_COLUMNS
                  entry; left empty for columns without a model.
                - preds (list): One list of predicted labels per LABEL_COLUMNS
                  entry; left empty for columns without a model.
        """
        reports = {}
        truths = [[] for _ in LABEL_COLUMNS]
        preds = [[] for _ in LABEL_COLUMNS]

        for i, col in enumerate(LABEL_COLUMNS):
            if col not in self.models:
                print(f"Warning: Model for {col} not found for evaluation.")
                continue

            y_pred = self.models[col].predict(X_test_text)
            y_true = y_test_df[col].values
            try:
                reports[col] = classification_report(
                    y_true, y_pred, output_dict=True, zero_division=0
                )
            except ValueError:
                # Deliberate best-effort: keep evaluating the other labels and
                # record an all-zero report for this one.
                print(f"Warning: Could not generate classification report for {col}. Skipping.")
                reports[col] = {'accuracy': 0, 'weighted avg': {'precision': 0, 'recall': 0, 'f1-score': 0, 'support': 0}}
            truths[i].extend(y_true)
            preds[i].extend(y_pred)

        return reports, truths, preds

    def save_model(self, model_name="tfidf_lgbm", save_format='pickle'):
        """
        Saves the trained TF-IDF LightGBM models.

        The whole ``self.models`` dict is written to
        ``MODEL_SAVE_DIR/<model_name>.pkl``; the directory is created if it
        does not exist yet.

        Args:
            model_name (str): The base name for the saved model file.
            save_format (str): Format to save the model in (default: 'pickle').

        Raises:
            ValueError: If `save_format` is anything other than 'pickle'.
        """
        if save_format != 'pickle':
            raise ValueError("TF-IDF models only support 'pickle' format")
        # joblib.dump does not create parent directories; without this the
        # first save into a fresh MODEL_SAVE_DIR fails with FileNotFoundError.
        os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
        save_path = self._model_path(model_name)
        joblib.dump(self.models, save_path)
        print(f"TF-IDF LightGBM models saved to {save_path}")

    def load_model(self, model_name="tfidf_lgbm"):
        """
        Loads trained TF-IDF LightGBM models from a file.

        If the file does not exist, a message is printed and ``self.models``
        is reset to an empty dict (no exception is raised).

        Args:
            model_name (str): The base name of the model file to load.
        """
        load_path = self._model_path(model_name)
        if not os.path.exists(load_path):
            print(f"Error: Model file not found at {load_path}. Initialize models as empty.")
            self.models = {}
            return

        # SECURITY: joblib.load unpickles the file, which can execute arbitrary
        # code. Only load model files that come from a trusted source.
        self.models = joblib.load(load_path)
        print(f"TF-IDF LightGBM models loaded from {load_path}")