Spaces:

point9
/

LGBM

Sleeping

App Files Files Community

LGBM / models /tfidf_lgbm.py

namanpenguin

Upload 9 files

22539ef verified 3 months ago

raw

history blame contribute delete

6.54 kB

	# tfidf_based_models/tfidf_lgbm.py

	from sklearn.feature_extraction.text import TfidfVectorizer
	import lightgbm as lgb
	from sklearn.pipeline import Pipeline
	from sklearn.metrics import classification_report
	from sklearn.preprocessing import LabelEncoder
	from sklearn.utils.class_weight import compute_class_weight
	import numpy as np
	import pandas as pd
	import joblib
	import os

	from config import TEXT_COLUMN, LABEL_COLUMNS, TFIDF_MAX_FEATURES, MODEL_SAVE_DIR

	class TfidfLightGBM:
	    """
	    TF-IDF based LightGBM model for multi-output classification.

	    One independent ``TfidfVectorizer`` -> ``LGBMClassifier`` pipeline is
	    trained for each target label listed in ``LABEL_COLUMNS``. The fitted
	    pipelines are kept in ``self.models``, keyed by label column name.
	    """

	    def __init__(self, label_encoders):
	        """
	        Initializes the TfidfLightGBM model.

	        Args:
	            label_encoders (dict): Maps each label column name to its
	                LabelEncoder. Only ``classes_`` is read (in ``train``) to
	                determine how many classes the label has.
	        """
	        self.label_encoders = label_encoders
	        self.models = {}  # label column name -> fitted Pipeline

	    def train(self, X_train_text, y_train_df):
	        """
	        Trains a TF-IDF + LightGBM pipeline for each label.

	        Every pipeline fits its own vectorizer on ``X_train_text``, so the
	        vocabularies are independent across labels.

	        Args:
	            X_train_text (pd.Series): Training text data.
	            y_train_df (pd.DataFrame): DataFrame of training labels (encoded),
	                with one column per entry of ``LABEL_COLUMNS``.

	        Raises:
	            KeyError: If a label column is missing from ``label_encoders``
	                or from ``y_train_df``.
	        """
	        print("Training TF-IDF + LightGBM models...")
	        for col in LABEL_COLUMNS:
	            print(f" Training for {col}...")
	            num_classes = len(self.label_encoders[col].classes_)

	            # The 'multiclass' objective needs `num_class`; 'binary' must not
	            # receive it, hence the conditional keyword dict.
	            if num_classes > 2:
	                objective = 'multiclass'
	                num_class_param = {'num_class': num_classes}
	            else:
	                objective = 'binary'
	                num_class_param = {}

	            pipeline = Pipeline([
	                ('tfidf', TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)),
	                ('lgbm', lgb.LGBMClassifier(
	                    objective=objective,
	                    **num_class_param,
	                    random_state=42,
	                    n_estimators=100
	                ))
	            ])

	            # NOTE: no class re-weighting is configured here (neither
	            # `is_unbalance`, `scale_pos_weight` nor `class_weight` is passed),
	            # so imbalanced labels are fit with the classifier's defaults.
	            pipeline.fit(X_train_text, y_train_df[col])
	            self.models[col] = pipeline
	        print("TF-IDF + LightGBM training complete.")

	    def predict(self, X_test_text):
	        """
	        Makes class predictions for all labels that have a trained model.

	        Args:
	            X_test_text (pd.Series): Test text data.

	        Returns:
	            dict: Keys are label names, values are whatever the stored
	                pipeline's ``predict`` returns for ``X_test_text``
	                (predicted class indices). Empty if no model is loaded.
	        """
	        return {
	            col: model_pipeline.predict(X_test_text)
	            for col, model_pipeline in self.models.items()
	        }

	    def predict_proba(self, X_test_text):
	        """
	        Returns prediction probabilities for each class for all labels.

	        Args:
	            X_test_text (pd.Series): Test text data.

	        Returns:
	            list: One entry per column of ``LABEL_COLUMNS``, in that order.
	                Each entry is the pipeline's ``predict_proba`` output; a label
	                without a trained model yields an empty NumPy array so that
	                positions stay aligned with ``LABEL_COLUMNS``.
	        """
	        probabilities = []
	        for col in LABEL_COLUMNS:
	            if col in self.models:
	                probabilities.append(self.models[col].predict_proba(X_test_text))
	            else:
	                print(f"Warning: Model for {col} not found, cannot predict probabilities.")
	                probabilities.append(np.array([]))
	        return probabilities

	    def evaluate(self, X_test_text, y_test_df):
	        """
	        Evaluates the models and returns classification reports.

	        Args:
	            X_test_text (pd.Series): Test text data.
	            y_test_df (pd.DataFrame): DataFrame of true test labels (encoded).

	        Returns:
	            tuple: A tuple containing:
	                - reports (dict): Classification report (as a dict) for each
	                  label column that has a trained model.
	                - truths (list): One list of true labels per label column.
	                - preds (list): One list of predicted labels per label column.
	              ``truths`` and ``preds`` are indexed like ``LABEL_COLUMNS``;
	              entries for labels without a model stay empty.
	        """
	        reports = {}
	        truths = [[] for _ in LABEL_COLUMNS]
	        preds = [[] for _ in LABEL_COLUMNS]

	        for i, col in enumerate(LABEL_COLUMNS):
	            if col not in self.models:
	                print(f"Warning: Model for {col} not found for evaluation.")
	                continue

	            y_pred = self.models[col].predict(X_test_text)
	            y_true = y_test_df[col].values
	            try:
	                reports[col] = classification_report(
	                    y_true, y_pred, output_dict=True, zero_division=0
	                )
	            except ValueError:
	                # Best effort: keep a zeroed placeholder so callers can still
	                # read the usual summary keys for this label.
	                print(f"Warning: Could not generate classification report for {col}. Skipping.")
	                reports[col] = {'accuracy': 0, 'weighted avg': {'precision': 0, 'recall': 0, 'f1-score': 0, 'support': 0}}
	            truths[i].extend(y_true)
	            preds[i].extend(y_pred)

	        return reports, truths, preds

	    def save_model(self, model_name="tfidf_lgbm", save_format='pickle'):
	        """
	        Saves the trained TF-IDF LightGBM models.

	        The whole ``self.models`` dict is written to
	        ``MODEL_SAVE_DIR/<model_name>.pkl``. The target directory is created
	        if it does not exist yet.

	        Args:
	            model_name (str): The base name for the saved model file.
	            save_format (str): Format to save the model in (default: 'pickle').

	        Raises:
	            ValueError: If ``save_format`` is anything other than 'pickle'.
	        """
	        if save_format != 'pickle':
	            raise ValueError("TF-IDF models only support 'pickle' format")
	        # joblib.dump does not create missing parent directories; an empty
	        # MODEL_SAVE_DIR means "current directory" and needs no creation.
	        if MODEL_SAVE_DIR:
	            os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
	        save_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")
	        joblib.dump(self.models, save_path)
	        print(f"TF-IDF LightGBM models saved to {save_path}")

	    def load_model(self, model_name="tfidf_lgbm"):
	        """
	        Loads trained TF-IDF LightGBM models from a file.

	        If ``MODEL_SAVE_DIR/<model_name>.pkl`` does not exist, an error is
	        printed and ``self.models`` is reset to an empty dict (no exception).

	        Args:
	            model_name (str): The base name of the model file to load.
	        """
	        load_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")
	        if os.path.exists(load_path):
	            # SECURITY: joblib.load unpickles the file and can execute
	            # arbitrary code; only load model files from a trusted source.
	            self.models = joblib.load(load_path)
	            print(f"TF-IDF LightGBM models loaded from {load_path}")
	        else:
	            print(f"Error: Model file not found at {load_path}. Initialize models as empty.")
	            self.models = {}