# Snapshot of commit 22539ef (file size: 6,536 bytes).
# tfidf_based_models/tfidf_lgbm.py
from sklearn.feature_extraction.text import TfidfVectorizer
import lightgbm as lgb
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import pandas as pd
import joblib
import os
from config import TEXT_COLUMN, LABEL_COLUMNS, TFIDF_MAX_FEATURES, MODEL_SAVE_DIR
class TfidfLightGBM:
    """
    TF-IDF based LightGBM model for multi-output classification.

    One independent ``Pipeline`` (``TfidfVectorizer`` -> ``LGBMClassifier``)
    is trained per entry of ``LABEL_COLUMNS``; the fitted pipelines are kept
    in ``self.models`` keyed by label column name.
    """

    def __init__(self, label_encoders):
        """
        Initializes the TfidfLightGBM model.

        Args:
            label_encoders (dict): A dictionary of LabelEncoder objects,
                keyed by label column name. Only the ``classes_`` attribute
                of each encoder is read (during training).
        """
        self.label_encoders = label_encoders
        self.models = {}  # Stores the trained Pipeline for each label

    def train(self, X_train_text, y_train_df):
        """
        Trains a TF-IDF + LightGBM pipeline for each label.

        Args:
            X_train_text (pd.Series): Training text data.
            y_train_df (pd.DataFrame): DataFrame of training labels (encoded).
        """
        print("Training TF-IDF + LightGBM models...")
        for col in LABEL_COLUMNS:
            print(f" Training for {col}...")
            num_classes = len(self.label_encoders[col].classes_)

            # LightGBM needs the 'multiclass' objective (together with an
            # explicit `num_class`) for more than two classes; otherwise the
            # 'binary' objective is used.
            if num_classes > 2:
                lgbm_params = {'objective': 'multiclass', 'num_class': num_classes}
            else:
                lgbm_params = {'objective': 'binary'}

            pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)),
                ('lgbm', lgb.LGBMClassifier(
                    random_state=42,
                    n_estimators=100,
                    **lgbm_params,
                )),
            ])

            # NOTE: no class re-weighting is configured here. LightGBM only
            # compensates for class imbalance when asked to (e.g.
            # `class_weight='balanced'`, or `is_unbalance` / `scale_pos_weight`
            # for the binary objective), so every class is weighted equally.
            pipeline.fit(X_train_text, y_train_df[col])
            self.models[col] = pipeline
        print("TF-IDF + LightGBM training complete.")

    def predict(self, X_test_text):
        """
        Makes class predictions for all labels.

        Args:
            X_test_text (pd.Series): Test text data.

        Returns:
            dict: A dictionary where keys are label names and values are the
                predictions returned by that label's pipeline (one entry per
                model currently held in ``self.models``).
        """
        return {
            col: model_pipeline.predict(X_test_text)
            for col, model_pipeline in self.models.items()
        }

    def predict_proba(self, X_test_text):
        """
        Returns prediction probabilities for each class for all labels.

        Args:
            X_test_text (pd.Series): Test text data.

        Returns:
            list: A list with one entry per column of ``LABEL_COLUMNS``, in
                that order. Each entry is the ``predict_proba`` output of the
                corresponding pipeline, or an empty NumPy array (after a
                printed warning) when no model exists for that column.
        """
        probabilities = []
        for col in LABEL_COLUMNS:
            if col not in self.models:
                print(f"Warning: Model for {col} not found, cannot predict probabilities.")
                # Keep the positional alignment with LABEL_COLUMNS.
                probabilities.append(np.array([]))
                continue
            probabilities.append(self.models[col].predict_proba(X_test_text))
        return probabilities

    def evaluate(self, X_test_text, y_test_df):
        """
        Evaluates the models and returns classification reports.

        Args:
            X_test_text (pd.Series): Test text data.
            y_test_df (pd.DataFrame): DataFrame of true test labels (encoded).

        Returns:
            tuple: A tuple containing:
                - reports (dict): Classification reports for each label column
                  that has a trained model. If the report cannot be computed
                  (``ValueError``), a zero-filled placeholder is stored.
                - truths (list): List of true label lists, one per entry of
                  ``LABEL_COLUMNS`` (empty for columns without a model).
                - preds (list): List of predicted label lists, aligned with
                  ``truths``.
        """
        reports = {}
        truths = [[] for _ in LABEL_COLUMNS]
        preds = [[] for _ in LABEL_COLUMNS]

        for i, col in enumerate(LABEL_COLUMNS):
            if col not in self.models:
                print(f"Warning: Model for {col} not found for evaluation.")
                continue

            y_pred = self.models[col].predict(X_test_text)
            y_true = y_test_df[col].values
            try:
                reports[col] = classification_report(
                    y_true, y_pred, output_dict=True, zero_division=0
                )
            except ValueError:
                # Best effort: keep evaluating the remaining labels and leave
                # a placeholder with the keys downstream code reads.
                print(f"Warning: Could not generate classification report for {col}. Skipping.")
                reports[col] = {
                    'accuracy': 0,
                    'weighted avg': {'precision': 0, 'recall': 0, 'f1-score': 0, 'support': 0},
                }
            truths[i].extend(y_true)
            preds[i].extend(y_pred)
        return reports, truths, preds

    def save_model(self, model_name="tfidf_lgbm", save_format='pickle'):
        """
        Saves the trained TF-IDF LightGBM models.

        The whole ``self.models`` dictionary is written with ``joblib.dump``
        to ``<MODEL_SAVE_DIR>/<model_name>.pkl``. The target directory is
        created first if it does not exist yet.

        Args:
            model_name (str): The base name for the saved model file.
            save_format (str): Format to save the model in (default: 'pickle').

        Raises:
            ValueError: If ``save_format`` is anything other than 'pickle'.
        """
        if save_format != 'pickle':
            raise ValueError("TF-IDF models only support 'pickle' format")
        # joblib.dump does not create missing parent directories.
        os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
        save_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")
        joblib.dump(self.models, save_path)
        print(f"TF-IDF LightGBM models saved to {save_path}")

    def load_model(self, model_name="tfidf_lgbm"):
        """
        Loads trained TF-IDF LightGBM models from a file.

        Reads ``<MODEL_SAVE_DIR>/<model_name>.pkl`` into ``self.models``. If
        the file does not exist, an error is printed and ``self.models`` is
        reset to an empty dictionary (no exception is raised).

        Args:
            model_name (str): The base name of the model file to load.
        """
        load_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")
        if not os.path.exists(load_path):
            print(f"Error: Model file not found at {load_path}. Initialize models as empty.")
            self.models = {}
            return
        # SECURITY: joblib.load unpickles the file, which can execute
        # arbitrary code. Only load model files from a trusted source.
        self.models = joblib.load(load_path)
        print(f"TF-IDF LightGBM models loaded from {load_path}")