File size: 4,855 Bytes
5fc6e5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import warnings
from typing import Any

from loguru import logger
from numpy import ndarray
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline

from ..baseModel import BaseModel

# Installs a blanket "ignore" filter as soon as this module is imported: no
# category or module restriction is given, so every warning is silenced.
warnings.filterwarnings("ignore")


class RandomForestTfIdf(BaseModel):
    """
    TF-IDF + RandomForest text classifier with an integrated grid search.

    Raw text is vectorized with a ``TfidfVectorizer`` (uni- and bi-grams) and the
    resulting features are passed to a ``RandomForestClassifier`` wrapped in a
    ``MultiOutputClassifier`` for multi-output targets.

    Attributes:
        params: Fixed settings (stop words, random seed, number of CV folds).
        grid_params: Hyperparameter grid explored by ``train()``.
        pipeline: Unfitted pipeline built by ``setup_model()``.
        model: Estimator used by ``predict()``; ``train()`` replaces it with the
            best estimator found by the grid search.
    """

    def __init__(self, language, path=None):
        """
        Initialize the RandomForestTfIdf model with configuration parameters.

        Args:
            language (str): Language for the model.
            path (str, optional): Path to load a pre-trained model. Defaults to None.
                                    If None, a new model is initialized.
        """

        # NOTE(review): stop words are always English, independent of `language` —
        # confirm this is intended for non-English models.
        self.params = {"stop_words": "english", "random_state": 42, "cv_folds": 5}

        # Keys follow the Pipeline step names ("tfidf", "clf") and the
        # MultiOutputClassifier's inner "estimator" parameter.
        self.grid_params = {
            "clf__estimator__n_estimators": [50, 100, 200],
            "clf__estimator__max_depth": [None, 10, 20],
            "tfidf__max_features": [3000, 5000, 8000],
        }

        # The configuration above is assigned before delegating to the base
        # constructor (presumably because it triggers setup_model()/loading —
        # verify against BaseModel).
        super().__init__(language, path)

    def setup_model(self):
        """
        Initialize the scikit-learn pipeline with TF-IDF vectorizer and RandomForest classifier.

        Sets both ``self.pipeline`` and ``self.model`` to the new, unfitted pipeline.
        """

        base_estimator = RandomForestClassifier(
            random_state=self.params["random_state"], n_jobs=-1
        )

        self.pipeline = Pipeline(
            [
                (
                    "tfidf",
                    TfidfVectorizer(ngram_range=(1, 2), stop_words=self.params["stop_words"]),
                ),
                ("clf", MultiOutputClassifier(base_estimator, n_jobs=-1)),
            ]
        )

        self.model = self.pipeline
        logger.info("Scikit-learn pipeline initialized.")

    def train(self, X_train, y_train) -> dict[str, Any]:
        """
        Train the model using Grid Search to find the best hyperparameters.

        After the search, ``self.model`` is replaced by the best estimator found.

        Args:
            X_train: Input training data.
            y_train: True labels for training data.

        Returns:
            The best ``max_features``, ``n_estimators`` and ``max_depth`` values
            selected by the grid search, keyed by those names.

        Raises:
            ValueError: If no model is available on the instance.
        """

        if self.model is None:
            raise ValueError(
                "Model pipeline is not initialized. Call setup_model() before training."
            )

        # `self.pipeline` is only assigned by setup_model(). The guard above checks
        # `self.model`, so fall back to it instead of failing with AttributeError
        # when setup_model() has not run on this instance.
        estimator = getattr(self, "pipeline", None)
        if estimator is None:
            estimator = self.model

        logger.info(f"Starting training for: {self.language.upper()}")
        logger.info("Performing Grid Search for best hyperparameters...")
        grid_search = GridSearchCV(
            estimator,
            param_grid=self.grid_params,
            cv=self.params["cv_folds"],
            scoring="f1_weighted",
            n_jobs=-1,
            verbose=1,
        )
        grid_search.fit(X_train, y_train)

        best_params = grid_search.best_params_
        logger.success(f"Best params found: {best_params}")

        parameters_to_log = {
            "max_features": best_params["tfidf__max_features"],
            "n_estimators": best_params["clf__estimator__n_estimators"],
            "max_depth": best_params["clf__estimator__max_depth"],
        }

        self.model = grid_search.best_estimator_
        logger.success(f"Training for {self.language.upper()} completed.")

        return parameters_to_log

    def evaluate(self, X_test, y_test) -> dict[str, Any]:
        """
        Evaluate model on test data and return metrics.

        Prints a classification report to stdout and logs a short summary.

        Args:
            X_test: Input test data.
            y_test: True labels for test data.

        Returns:
            Dictionary with ``accuracy``, macro-averaged ``precision`` and ``recall``,
            and weighted ``f1_score``.

        Raises:
            ValueError: If no model is available (raised by ``predict()``).
        """

        y_pred = self.predict(X_test)

        report = classification_report(y_test, y_pred, zero_division=0)
        print("\n" + "=" * 50)
        print("CLASSIFICATION REPORT")
        print(report)
        print("=" * 50 + "\n")

        metrics = {
            "accuracy": accuracy_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred, average="macro", zero_division=0),
            "recall": recall_score(y_test, y_pred, average="macro", zero_division=0),
            # zero_division=0 matches the other metrics: labels with no support
            # score 0 explicitly instead of relying on the warn-and-zero default.
            "f1_score": f1_score(y_test, y_pred, average="weighted", zero_division=0),
        }

        logger.info(
            f"Evaluation completed — Accuracy: {metrics['accuracy']:.3f}, F1: {metrics['f1_score']:.3f}"
        )
        return metrics

    def predict(self, X) -> ndarray:
        """
        Make predictions using the trained model.

        Args:
            X: Input data for prediction.

        Returns:
            Predictions made by the model.

        Raises:
            ValueError: If ``self.model`` is None.
        """

        if self.model is None:
            raise ValueError("Model is not trained. Call train() or load() before prediction.")

        return self.model.predict(X)