File size: 6,536 Bytes
22539ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# tfidf_based_models/tfidf_lgbm.py

from sklearn.feature_extraction.text import TfidfVectorizer
import lightgbm as lgb
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import pandas as pd
import joblib
import os

from config import TEXT_COLUMN, LABEL_COLUMNS, TFIDF_MAX_FEATURES, MODEL_SAVE_DIR

class TfidfLightGBM:
    """
    TF-IDF based LightGBM model for multi-output classification.

    One independent ``Pipeline`` (``TfidfVectorizer`` followed by
    ``lgb.LGBMClassifier``) is trained per entry of ``LABEL_COLUMNS``; the
    fitted pipelines are kept in ``self.models`` keyed by label column name.
    """

    def __init__(self, label_encoders):
        """
        Initializes the TfidfLightGBM model.

        Args:
            label_encoders (dict): A dictionary of LabelEncoder objects keyed
                by label column name. Only ``classes_`` is read (in `train`)
                to decide between a binary and a multiclass objective.
        """
        self.label_encoders = label_encoders
        self.models = {}  # label column name -> fitted Pipeline

    def train(self, X_train_text, y_train_df):
        """
        Trains a TF-IDF + LightGBM pipeline for each label.

        Args:
            X_train_text (pd.Series): Training text data.
            y_train_df (pd.DataFrame): DataFrame of training labels (encoded).

        Raises:
            KeyError: If a column of ``LABEL_COLUMNS`` is missing from
                ``self.label_encoders`` or from ``y_train_df``.
        """
        print("Training TF-IDF + LightGBM models...")
        for col in LABEL_COLUMNS:
            print(f"  Training for {col}...")
            num_classes = len(self.label_encoders[col].classes_)

            # Binary objective by default; switch to multiclass (which
            # additionally requires `num_class`) when there are 3+ classes.
            lgbm_params = {
                'objective': 'binary',
                'random_state': 42,
                'n_estimators': 100,
            }
            if num_classes > 2:
                lgbm_params['objective'] = 'multiclass'
                lgbm_params['num_class'] = num_classes

            pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)),
                ('lgbm', lgb.LGBMClassifier(**lgbm_params)),
            ])
            # NOTE: no class-imbalance handling is configured here. LightGBM
            # does not re-weight classes unless asked to (e.g. via
            # `class_weight='balanced'`, or `is_unbalance=True` /
            # `scale_pos_weight` for the binary objective).
            pipeline.fit(X_train_text, y_train_df[col])
            self.models[col] = pipeline
        print("TF-IDF + LightGBM training complete.")

    def predict(self, X_test_text):
        """
        Makes class predictions for all labels.

        Args:
            X_test_text (pd.Series): Test text data.

        Returns:
            dict: A dictionary where keys are label names and values are the
                  outputs of each pipeline's ``predict`` (predicted class
                  indices), in the insertion order of ``self.models``.
        """
        return {
            col: model_pipeline.predict(X_test_text)
            for col, model_pipeline in self.models.items()
        }

    def predict_proba(self, X_test_text):
        """
        Returns prediction probabilities for each class for all labels.

        Args:
            X_test_text (pd.Series): Test text data.

        Returns:
            list: A list of NumPy arrays, one per entry of ``LABEL_COLUMNS``
                  and in that order. Each array is the pipeline's
                  ``predict_proba`` output; a label without a trained model
                  yields an empty array (and a printed warning) so positions
                  stay aligned with ``LABEL_COLUMNS``.
        """
        probabilities = []
        for col in LABEL_COLUMNS:
            model_pipeline = self.models.get(col)
            if model_pipeline is None:
                print(f"Warning: Model for {col} not found, cannot predict probabilities.")
                probabilities.append(np.array([]))
                continue
            probabilities.append(model_pipeline.predict_proba(X_test_text))
        return probabilities

    def evaluate(self, X_test_text, y_test_df):
        """
        Evaluates the models and returns classification reports.

        Args:
            X_test_text (pd.Series): Test text data.
            y_test_df (pd.DataFrame): DataFrame of true test labels (encoded).

        Returns:
            tuple: A tuple containing:
                - reports (dict): Classification reports for each label column
                  that has a trained model. If the report cannot be computed
                  (``ValueError``), a zero-filled placeholder is stored.
                - truths (list): List of true label lists, one per entry of
                  ``LABEL_COLUMNS`` (empty for labels without a model).
                - preds (list): List of predicted label lists, aligned with
                  ``truths``.
        """
        reports = {}
        truths = [[] for _ in LABEL_COLUMNS]
        preds = [[] for _ in LABEL_COLUMNS]

        for i, col in enumerate(LABEL_COLUMNS):
            if col not in self.models:
                print(f"Warning: Model for {col} not found for evaluation.")
                continue

            y_pred = self.models[col].predict(X_test_text)
            y_true = y_test_df[col].values
            try:
                reports[col] = classification_report(
                    y_true, y_pred, output_dict=True, zero_division=0
                )
            except ValueError:
                # Best-effort: keep evaluating the remaining labels and record
                # a zeroed report with the keys the fallback always provides.
                print(f"Warning: Could not generate classification report for {col}. Skipping.")
                reports[col] = {'accuracy': 0, 'weighted avg': {'precision': 0, 'recall': 0, 'f1-score': 0, 'support': 0}}
            truths[i].extend(y_true)
            preds[i].extend(y_pred)

        return reports, truths, preds

    def save_model(self, model_name="tfidf_lgbm", save_format='pickle'):
        """
        Saves the trained TF-IDF LightGBM models.

        The whole ``self.models`` dict is written with ``joblib.dump`` to
        ``<MODEL_SAVE_DIR>/<model_name>.pkl``. The target directory is created
        if it does not exist yet.

        Args:
            model_name (str): The base name for the saved model file.
            save_format (str): Format to save the model in (default: 'pickle').

        Raises:
            ValueError: If ``save_format`` is anything other than 'pickle'.
        """
        if save_format != 'pickle':
            raise ValueError("TF-IDF models only support 'pickle' format")
        # joblib.dump does not create missing parent directories; make sure
        # the save directory exists. An empty MODEL_SAVE_DIR means "current
        # directory", for which there is nothing to create.
        if MODEL_SAVE_DIR:
            os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
        save_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")
        joblib.dump(self.models, save_path)
        print(f"TF-IDF LightGBM models saved to {save_path}")

    def load_model(self, model_name="tfidf_lgbm"):
        """
        Loads trained TF-IDF LightGBM models from a file.

        Reads ``<MODEL_SAVE_DIR>/<model_name>.pkl`` into ``self.models``. If
        the file does not exist, an error is printed and ``self.models`` is
        reset to an empty dict; no exception is raised.

        Args:
            model_name (str): The base name of the model file to load.
        """
        load_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")
        if os.path.exists(load_path):
            # SECURITY: joblib.load unpickles the file and can execute
            # arbitrary code; only load model files from a trusted source.
            self.models = joblib.load(load_path)
            print(f"TF-IDF LightGBM models loaded from {load_path}")
        else:
            print(f"Error: Model file not found at {load_path}. Initialize models as empty.")
            self.models = {}