# ml_tfidf_lgbm_projects/train_lgbm.py
# Trains a TF-IDF + multi-output LogisticRegression text classifier and saves
# the fitted pipeline, the per-label encoders, and the TF-IDF vectorizer.
import os
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from config import (
    DATA_PATH, TEXT_COLUMN, LABEL_COLUMNS,
    MODEL_SAVE_DIR, LABEL_ENCODERS_PATH,
    TFIDF_MAX_FEATURES, NGRAM_RANGE,
    USE_STOPWORDS, RANDOM_STATE, TEST_SIZE
)
# Ensure required directories are created
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
print(" Loading dataset...")
df = pd.read_csv(DATA_PATH)
df.dropna(subset=[TEXT_COLUMN] + LABEL_COLUMNS, inplace=True)
# Encode each target label column to integer classes; keep the fitted encoders
# so predictions can be decoded back to the original label names at inference time.
label_encoders = {}
for col in LABEL_COLUMNS:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
X = df[TEXT_COLUMN]
Y = df[LABEL_COLUMNS]
print("✂️ Splitting train/test...")
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)
print(" Building pipeline...")
stop_words = "english" if USE_STOPWORDS else None
pipeline = Pipeline([
('tfidf', TfidfVectorizer(
max_features=TFIDF_MAX_FEATURES,
ngram_range=NGRAM_RANGE,
stop_words=stop_words
)),
('clf', MultiOutputClassifier(LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)))
])
print(" Training model...")
pipeline.fit(X_train, y_train)
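# Optional sanity check (a minimal sketch, not required for saving): report
# per-label accuracy on the held-out split before persisting the pipeline.
from sklearn.metrics import accuracy_score

y_pred = pipeline.predict(X_test)
for i, col in enumerate(LABEL_COLUMNS):
    acc = accuracy_score(y_test[col], y_pred[:, i])
    print(f"  {col}: accuracy = {acc:.3f}")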
# Save the full pipeline (TF-IDF vectorizer + classifier) as a single artifact
model_path = os.path.join(MODEL_SAVE_DIR, "logreg_model.pkl")
print(f"Saving model to {model_path}")
joblib.dump(pipeline, model_path)
# Save label encoders so predictions can be decoded at inference time
print(f"Saving label encoders to {LABEL_ENCODERS_PATH}")
joblib.dump(label_encoders, LABEL_ENCODERS_PATH)
# Save the fitted TF-IDF vectorizer separately (optional; it is already inside the pipeline)
tfidf_path = os.path.join(MODEL_SAVE_DIR, "tfidf_vectorizer.pkl")
print(f"Saving TF-IDF vectorizer to {tfidf_path}")
joblib.dump(pipeline.named_steps["tfidf"], tfidf_path)
print(" Training complete.")