# Page-header residue from the hosting UI ("Spaces: Running"); not part of the program.
"""Train a multi-output TF-IDF + logistic-regression text classifier.

Steps, in order:

1. Read the CSV at ``DATA_PATH`` and drop rows with a missing text or label.
2. Label-encode every column listed in ``LABEL_COLUMNS``.
3. Split into train/test and fit a ``TfidfVectorizer`` ->
   ``MultiOutputClassifier(LogisticRegression)`` pipeline on the train part.
4. Report accuracy on the held-out part.
5. Persist the pipeline, the label encoders and the fitted vectorizer
   with joblib.

All settings are read from the project's ``config`` module.
"""
import os

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

from config import (
    DATA_PATH, TEXT_COLUMN, LABEL_COLUMNS,
    MODEL_SAVE_DIR, LABEL_ENCODERS_PATH,
    TFIDF_MAX_FEATURES, NGRAM_RANGE,
    USE_STOPWORDS, RANDOM_STATE, TEST_SIZE
)

# Normalise once: a tuple in config would break list concatenation and would
# be interpreted by pandas as a single (multi-level) column key.
label_columns = list(LABEL_COLUMNS)

# Ensure required directories are created
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
# LABEL_ENCODERS_PATH is configured separately from MODEL_SAVE_DIR, so its
# parent directory is not guaranteed to exist; "." covers a bare filename.
os.makedirs(os.path.dirname(LABEL_ENCODERS_PATH) or ".", exist_ok=True)

print(" Loading dataset...")
df = pd.read_csv(DATA_PATH)
df = df.dropna(subset=[TEXT_COLUMN, *label_columns])
if df.empty:
    # Fail early with a readable message instead of an opaque error from
    # train_test_split further down.
    raise ValueError(
        f"No usable rows in {DATA_PATH!r}: every row is missing "
        f"{TEXT_COLUMN!r} or one of {label_columns!r}."
    )

# Encode each target label
label_encoders = {}
for col in label_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    # Kept so predictions can be mapped back to the original label values.
    label_encoders[col] = le

X = df[TEXT_COLUMN]
Y = df[label_columns]

print("✂️ Splitting train/test...")
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

print(" Building pipeline...")
stop_words = "english" if USE_STOPWORDS else None
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=TFIDF_MAX_FEATURES,
        ngram_range=NGRAM_RANGE,
        stop_words=stop_words
    )),
    ('clf', MultiOutputClassifier(
        LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
    ))
])

print(" Training model...")
pipeline.fit(X_train, y_train)

# Evaluate on the held-out split so it is not created for nothing.
# predict() returns one column of encoded labels per entry in label_columns.
print(" Evaluating on held-out data...")
y_pred = pipeline.predict(X_test)
for idx, col in enumerate(label_columns):
    accuracy = (y_pred[:, idx] == y_test[col].to_numpy()).mean()
    print(f"   {col}: accuracy={accuracy:.4f}")

# Save full pipeline
model_path = os.path.join(MODEL_SAVE_DIR, "logreg_model.pkl")
print(f" Saving model to {model_path}")
joblib.dump(pipeline, model_path)

# Save label encoders
print(f" Saving label encoders to {LABEL_ENCODERS_PATH}")
joblib.dump(label_encoders, LABEL_ENCODERS_PATH)

# Save TF-IDF separately (optional)
tfidf_path = os.path.join(MODEL_SAVE_DIR, "tfidf_vectorizer.pkl")
print(f" Saving TF-IDF vectorizer to {tfidf_path}")
joblib.dump(pipeline.named_steps["tfidf"], tfidf_path)

print(" Training complete.")