from sklearn.feature_extraction.text import TfidfVectorizer
import lightgbm as lgb
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import pandas as pd
import joblib
import os

from config import TEXT_COLUMN, LABEL_COLUMNS, TFIDF_MAX_FEATURES, MODEL_SAVE_DIR
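
# The values below are only an illustrative sketch of what config.py is assumed
# to provide; the real project settings may differ.
#
#   TEXT_COLUMN = "text"                      # name of the input text column
#   LABEL_COLUMNS = ["label_a", "label_b"]    # target columns, one model per column
#   TFIDF_MAX_FEATURES = 10000                # vocabulary size cap for TF-IDF
#   MODEL_SAVE_DIR = "saved_models"           # directory for serialized models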


class TfidfLightGBM:
    """
    TF-IDF based LightGBM model for multi-output classification.

    A separate TF-IDF + LightGBM pipeline is trained for each target label
    after the text data has been converted into TF-IDF features.
    """

    def __init__(self, label_encoders):
        """
        Initializes the TfidfLightGBM model.

        Args:
            label_encoders (dict): A dictionary mapping each label column name
                to its fitted LabelEncoder.
        """
        self.label_encoders = label_encoders
        self.models = {}

    def train(self, X_train_text, y_train_df):
        """
        Trains a TF-IDF + LightGBM pipeline for each label.

        Args:
            X_train_text (pd.Series): Training text data.
            y_train_df (pd.DataFrame): DataFrame of training labels (encoded).
        """
        print("Training TF-IDF + LightGBM models...")
        for col in LABEL_COLUMNS:
            print(f"  Training for {col}...")
            num_classes = len(self.label_encoders[col].classes_)

            # LightGBM needs an explicit multiclass objective (and num_class)
            # when a label has more than two classes; otherwise use binary.
            objective = 'multiclass' if num_classes > 2 else 'binary'
            num_class_param = {'num_class': num_classes} if num_classes > 2 else {}

            pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)),
                ('lgbm', lgb.LGBMClassifier(
                    objective=objective,
                    **num_class_param,
                    random_state=42,
                    n_estimators=100
                ))
            ])

            pipeline.fit(X_train_text, y_train_df[col])
            self.models[col] = pipeline
        print("TF-IDF + LightGBM training complete.")

    def predict(self, X_test_text):
        """
        Makes class predictions for all labels.

        Args:
            X_test_text (pd.Series): Test text data.

        Returns:
            dict: A dictionary where keys are label names and values are NumPy
                arrays of predicted class indices.
        """
        predictions = {}
        for col, model_pipeline in self.models.items():
            predictions[col] = model_pipeline.predict(X_test_text)
        return predictions

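    # Illustrative sketch (not part of the class API): the arrays returned by
    # predict() hold encoded class indices. Assuming the encoders passed to
    # __init__ were fitted on these same columns, they can be mapped back to
    # the original string labels like this:
    #
    #   preds = model.predict(X_test_text)
    #   decoded = {col: model.label_encoders[col].inverse_transform(p)
    #              for col, p in preds.items()}
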
    def predict_proba(self, X_test_text):
        """
        Returns prediction probabilities for each class for all labels.

        Args:
            X_test_text (pd.Series): Test text data.

        Returns:
            list: A list of NumPy arrays, one per label column in LABEL_COLUMNS
                order, each containing the probability distribution over classes
                for every sample.
        """
        probabilities = []
        for col in LABEL_COLUMNS:
            if col in self.models:
                probabilities.append(self.models[col].predict_proba(X_test_text))
            else:
                print(f"Warning: Model for {col} not found, cannot predict probabilities.")
                probabilities.append(np.array([]))
        return probabilities

    def evaluate(self, X_test_text, y_test_df):
        """
        Evaluates the models and returns classification reports.

        Args:
            X_test_text (pd.Series): Test text data.
            y_test_df (pd.DataFrame): DataFrame of true test labels (encoded).

        Returns:
            tuple: A tuple containing:
                - reports (dict): Classification reports for each label column.
                - truths (list): List of true label arrays.
                - preds (list): List of predicted label arrays.
        """
        reports = {}
        truths = [[] for _ in range(len(LABEL_COLUMNS))]
        preds = [[] for _ in range(len(LABEL_COLUMNS))]

        for i, col in enumerate(LABEL_COLUMNS):
            if col in self.models:
                y_pred = self.models[col].predict(X_test_text)
                y_true = y_test_df[col].values
                try:
                    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
                    reports[col] = report
                except ValueError:
                    print(f"Warning: Could not generate classification report for {col}. Skipping.")
                    reports[col] = {'accuracy': 0, 'weighted avg': {'precision': 0, 'recall': 0, 'f1-score': 0, 'support': 0}}
                truths[i].extend(y_true)
                preds[i].extend(y_pred)
            else:
                print(f"Warning: Model for {col} not found for evaluation.")

        return reports, truths, preds

    def save_model(self, model_name="tfidf_lgbm", save_format='pickle'):
        """
        Saves the trained TF-IDF LightGBM models.

        Args:
            model_name (str): The base name for the saved model file.
            save_format (str): Format to save the model in (only 'pickle' is supported).
        """
        if save_format != 'pickle':
            raise ValueError("TF-IDF models only support 'pickle' format")
        os.makedirs(MODEL_SAVE_DIR, exist_ok=True)  # ensure the target directory exists
        save_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")
        joblib.dump(self.models, save_path)
        print(f"TF-IDF LightGBM models saved to {save_path}")

    def load_model(self, model_name="tfidf_lgbm"):
        """
        Loads trained TF-IDF LightGBM models from a file.

        Args:
            model_name (str): The base name of the model file to load.
        """
        load_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")
        if os.path.exists(load_path):
            self.models = joblib.load(load_path)
            print(f"TF-IDF LightGBM models loaded from {load_path}")
        else:
            print(f"Error: Model file not found at {load_path}. Initializing models as empty.")
            self.models = {}
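

# A minimal usage sketch. The data loading below is purely illustrative
# (train.csv/test.csv and the assumption that the label columns hold raw string
# labels are not part of this module); it only shows how the class is wired up.
if __name__ == "__main__":
    # Hypothetical DataFrames with the configured text column and label columns.
    train_df = pd.read_csv("train.csv")   # placeholder path
    test_df = pd.read_csv("test.csv")     # placeholder path

    # Fit one LabelEncoder per label column and encode the targets.
    label_encoders = {col: LabelEncoder().fit(train_df[col]) for col in LABEL_COLUMNS}
    y_train = pd.DataFrame({col: label_encoders[col].transform(train_df[col])
                            for col in LABEL_COLUMNS})
    y_test = pd.DataFrame({col: label_encoders[col].transform(test_df[col])
                           for col in LABEL_COLUMNS})

    model = TfidfLightGBM(label_encoders)
    model.train(train_df[TEXT_COLUMN], y_train)

    reports, truths, preds = model.evaluate(test_df[TEXT_COLUMN], y_test)
    for col, report in reports.items():
        print(col, report.get('accuracy'))

    model.save_model()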