Spaces:

point9
/

ml_tfidf_lgbm_projects

Running

File size: 2,046 Bytes

12e3bc5
b5b4a63
 
 
 
 
12e3bc5
b5b4a63
 
 
 
 
 
 
 
 
 
 
12e3bc5
 
 
b5b4a63
 
 
 
12e3bc5
b5b4a63
 
 
 
 
 
 
 
 
12e3bc5
b5b4a63
 
 
 
12e3bc5
b5b4a63
 
 
 
 
 
 
12e3bc5
b5b4a63
 
12e3bc5
b5b4a63
 
12e3bc5
 
b5b4a63
 
 
 
 
 
 
12e3bc5
b5b4a63
 
12e3bc5
b5b4a63


import os
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

from config import (
    DATA_PATH, TEXT_COLUMN, LABEL_COLUMNS,
    MODEL_SAVE_DIR, LABEL_ENCODERS_PATH,
    TFIDF_MAX_FEATURES, NGRAM_RANGE,
    USE_STOPWORDS, RANDOM_STATE, TEST_SIZE
)

# Create every output directory up front so that a missing folder fails fast
# instead of surfacing only after training has finished.
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
# LABEL_ENCODERS_PATH is written at the end of the script and is not
# guaranteed to live inside MODEL_SAVE_DIR, so make sure its parent exists
# too. A bare filename has an empty dirname, which os.makedirs rejects,
# hence the guard.
_encoders_dir = os.path.dirname(LABEL_ENCODERS_PATH)
if _encoders_dir:
    os.makedirs(_encoders_dir, exist_ok=True)

print(" Loading dataset...")
df = pd.read_csv(DATA_PATH)
# Rows missing the text or any of the target labels cannot be used.
df.dropna(subset=[TEXT_COLUMN] + LABEL_COLUMNS, inplace=True)
if df.empty:
    # Fail here with an actionable message rather than deeper inside
    # scikit-learn when the split is attempted on zero samples.
    raise ValueError(
        f"No rows left in {DATA_PATH!r} after dropping rows with missing "
        f"values in {[TEXT_COLUMN] + LABEL_COLUMNS!r}"
    )

# Replace every target column with integer class codes. One LabelEncoder is
# fitted per column and kept in a dict keyed by the column name.
label_encoders = {}
for target in LABEL_COLUMNS:
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(df[target])
    df[target] = encoded_values
    label_encoders[target] = encoder

# Features are the text column; targets are all label columns together.
X, Y = df[TEXT_COLUMN], df[LABEL_COLUMNS]

print("✂️ Splitting train/test...")
# RANDOM_STATE fixes the shuffle so the split is reproducible.
# NOTE(review): X_test / y_test are not used anywhere else in this script
# as shown — confirm whether an evaluation step is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

print(" Building pipeline...")
# Use scikit-learn's built-in English stop-word list only when enabled.
if USE_STOPWORDS:
    stop_words = "english"
else:
    stop_words = None

# Step 1: turn text into TF-IDF features.
tfidf_step = TfidfVectorizer(
    max_features=TFIDF_MAX_FEATURES,
    ngram_range=NGRAM_RANGE,
    stop_words=stop_words,
)
# Step 2: MultiOutputClassifier wraps a single LogisticRegression so that
# several label columns can be predicted by one pipeline.
clf_step = MultiOutputClassifier(
    LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
)
pipeline = Pipeline(steps=[("tfidf", tfidf_step), ("clf", clf_step)])

print(" Training model...")
pipeline.fit(X_train, y_train)

# Destinations for the two artifacts stored under MODEL_SAVE_DIR.
model_path = os.path.join(MODEL_SAVE_DIR, "logreg_model.pkl")
tfidf_path = os.path.join(MODEL_SAVE_DIR, "tfidf_vectorizer.pkl")

# Persist, in order: the full fitted pipeline, the label encoders, and the
# fitted TF-IDF step on its own. Each write is announced before it happens.
artifacts = (
    (f" Saving model to {model_path}", pipeline, model_path),
    (
        f" Saving label encoders to {LABEL_ENCODERS_PATH}",
        label_encoders,
        LABEL_ENCODERS_PATH,
    ),
    (
        f" Saving TF-IDF vectorizer to {tfidf_path}",
        pipeline.named_steps["tfidf"],
        tfidf_path,
    ),
)
for message, artifact, destination in artifacts:
    print(message)
    joblib.dump(artifact, destination)

print(" Training complete.")