"""
UFC Fight Prediction Pipeline

Copyright (C) 2025 Alvaro

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""

import csv
import os
from datetime import datetime
from collections import OrderedDict
import json
import joblib
from ..config import FIGHTS_CSV_PATH, MODEL_RESULTS_PATH, MODELS_DIR, LAST_EVENT_JSON_PATH
from .models import BaseModel
from sklearn.model_selection import KFold
import mlflow
import mlflow.sklearn

class PredictionPipeline:
    """
    Orchestrates the model training, evaluation, and reporting pipeline.
    """
    def __init__(self, models, use_existing_models=True, force_retrain=False):
        if not all(isinstance(m, BaseModel) for m in models):
            raise TypeError("All models must be instances of BaseModel.")
        self.models = models
        self.train_fights = []
        self.test_fights = []
        self.results = {}
        self.use_existing_models = use_existing_models
        self.force_retrain = force_retrain

    def _get_last_trained_event(self):
        """Get the last event that models were trained on."""
        if not os.path.exists(LAST_EVENT_JSON_PATH):
            return None, None
        try:
            with open(LAST_EVENT_JSON_PATH, 'r', encoding='utf-8') as f:
                last_event_data = json.load(f)
                if isinstance(last_event_data, list) and len(last_event_data) > 0:
                    return last_event_data[0].get('name'), last_event_data[0].get('date')
                return None, None
        except (json.JSONDecodeError, FileNotFoundError):
            return None, None

    def _save_last_trained_event(self, event_name, event_date):
        """Save the last event that models were trained on."""
        last_event_data = [{
            "name": event_name,
            "date": event_date,
            "training_timestamp": datetime.now().isoformat()
        }]
        try:
            with open(LAST_EVENT_JSON_PATH, 'w', encoding='utf-8') as f:
                json.dump(last_event_data, f, indent=4)
        except Exception as e:
            print(f"Warning: Could not save last trained event: {e}")

    def _has_new_data_since_last_training(self):
        """Check if there's new fight data since the last training."""
        last_event_name, last_event_date = self._get_last_trained_event()
        if not last_event_name or not last_event_date:
            return True  # No previous training record, consider as new data
        
        if not os.path.exists(FIGHTS_CSV_PATH):
            return False
        
        with open(FIGHTS_CSV_PATH, 'r', encoding='utf-8') as f:
            fights = list(csv.DictReader(f))
        
        if not fights:
            return False
        
        # Sort fights by date to get the latest event
        fights.sort(key=lambda x: datetime.strptime(x['event_date'], '%B %d, %Y'))
        latest_fight = fights[-1]
        latest_event_name = latest_fight['event_name']
        latest_event_date = latest_fight['event_date']
        
        # Check if we have new events since last training
        if latest_event_name != last_event_name:
            print(f"New data detected: Latest event '{latest_event_name}' differs from last trained event '{last_event_name}'")
            return True
        
        return False

    def _model_exists(self, model):
        """Check if a saved model file exists and can be loaded successfully."""
        model_name = model.__class__.__name__
        file_name = f"{model_name}.joblib"
        save_path = os.path.join(MODELS_DIR, file_name)
        
        if not os.path.exists(save_path):
            return False
        
        # Verify the model can actually be loaded
        try:
            joblib.load(save_path)
            return True
        except Exception as e:
            print(f"Warning: Model file {file_name} exists but cannot be loaded ({e}). Will retrain.")
            return False

    def _load_existing_model(self, model_class):
        """Load an existing model from disk."""
        model_name = model_class.__name__
        file_name = f"{model_name}.joblib"
        load_path = os.path.join(MODELS_DIR, file_name)
        
        try:
            loaded_model = joblib.load(load_path)
            print(f"Loaded existing model: {model_name}")
            return loaded_model
        except Exception as e:
            print(f"Error loading model {model_name}: {e}")
            return None

    def _should_retrain_models(self):
        """Determine if models should be retrained."""
        if self.force_retrain:
            print("Force retrain flag is set. Retraining all models.")
            return True
        
        if not self.use_existing_models:
            print("Use existing models flag is disabled. Retraining all models.")
            return True
        
        # Check if any model files are missing
        missing_models = [m for m in self.models if not self._model_exists(m)]
        if missing_models:
            missing_names = [m.__class__.__name__ for m in missing_models]
            print(f"Missing model files for: {missing_names}. Retraining all models.")
            return True
        
        # Check if there's new data since last training
        if self._has_new_data_since_last_training():
            return True
        
        print("No new data detected and all model files exist. Using existing models.")
        return False

    def _load_and_split_data(self, num_test_events: int = 1) -> None:
        """Loads and splits the data into chronological training and testing sets."""
        print("\n--- Loading and Splitting Data ---")
        if not os.path.exists(FIGHTS_CSV_PATH):
            raise FileNotFoundError(f"Fights data not found at '{FIGHTS_CSV_PATH}'.")

        fights = self._load_fights()
        
        all_events = list(OrderedDict.fromkeys(f['event_name'] for f in fights))
        if len(all_events) < num_test_events:
            print(f"Warning: Fewer than {num_test_events} events found. Adjusting test set size.")
            num_test_events = len(all_events)
            
        test_event_names = all_events[-num_test_events:]
        self.train_fights = [f for f in fights if f['event_name'] not in test_event_names]
        self.test_fights = [f for f in fights if f['event_name'] in test_event_names]
        print(f"Data loaded. {len(self.train_fights)} training fights, {len(self.test_fights)} testing fights.")
        print(f"Testing on the last {num_test_events} event(s): {', '.join(test_event_names)}")

    def _load_fights(self) -> list:
        """Helper method to load and sort fights from CSV."""
        with open(FIGHTS_CSV_PATH, 'r', encoding='utf-8') as f:
            fights = list(csv.DictReader(f))
        
        fights.sort(key=lambda x: datetime.strptime(x['event_date'], '%B %d, %Y'))
        return fights

    def run(self, detailed_report: bool = True) -> None:
        """Executes the full pipeline: load, train, evaluate, report and save models."""
        self._load_and_split_data()
        
        eval_fights = [f for f in self.test_fights if f['winner'] not in ["Draw", "NC", ""]]
        if not eval_fights:
            print("No fights with definitive outcomes in the test set. Aborting.")
            return

        should_retrain = self._should_retrain_models()
        
        for i, model in enumerate(self.models):
            model_name = model.__class__.__name__
            print(f"\n--- Evaluating Model: {model_name} ---")
            
            if should_retrain:
                print(f"Training {model_name}...")
                model.train(self.train_fights)
                model_status = "retrained"
            else:
                # Try to load existing model, fall back to training if loading fails
                loaded_model = self._load_existing_model(model.__class__)
                if loaded_model is not None:
                    # Replace the model instance with the loaded one
                    self.models[i] = loaded_model
                    model = loaded_model
                    model_status = "loaded from disk"
                else:
                    print(f"Failed to load {model_name}, training new model...")
                    model.train(self.train_fights)
                    model_status = "retrained"
            
            correct_predictions = 0
            predictions = []
            
            for fight in eval_fights:
                f1_name, f2_name = fight['fighter_1'], fight['fighter_2']
                actual_winner = fight['winner']
                event_name = fight.get('event_name', 'Unknown Event')
                
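                # BaseModel.predict is assumed to return a dict shaped like
                # {"winner": <fighter name>, "probability": <float or None>}
                # (inferred from the .get() calls below).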
                prediction_result = model.predict(fight)
                predicted_winner = prediction_result.get('winner')
                probability = prediction_result.get('probability')

                is_correct = (predicted_winner == actual_winner)
                if is_correct:
                    correct_predictions += 1
                
                predictions.append({
                    'fight': f"{f1_name} vs. {f2_name}",
                    'event': event_name,
                    'predicted_winner': predicted_winner,
                    'probability': f"{probability:.1%}" if probability is not None else "N/A",
                    'actual_winner': actual_winner,
                    'is_correct': is_correct
                })
                
            accuracy = (correct_predictions / len(eval_fights)) * 100
            self.results[model_name] = {
                'accuracy': accuracy, 
                'predictions': predictions,
                'total_fights': len(eval_fights),
                'model_status': model_status
            }

        if detailed_report:
            self._report_detailed_results()
        else:
            self._report_summary()

        # Only train and save models if retraining was performed
        if should_retrain:
            self._train_and_save_models()

    def run_kfold_cv(self, k: int = 3, holdout_events: int = 1):
        """Performs k-fold cross-validation where each fold is a set of events.
        Within each fold, we keep the last *holdout_events* for testing."""
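        # Example (SomeModel is a hypothetical BaseModel subclass):
        #   pipeline = PredictionPipeline([SomeModel()])
        #   fold_metrics = pipeline.run_kfold_cv(k=5, holdout_events=2)
        #   # -> one {model_name: accuracy} dict per fold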
        fights = self._load_fights()

        # Build an ordered list of unique events
        event_list = list(OrderedDict.fromkeys(f['event_name'] for f in fights))

        # Initialize KFold splitter on events
        kf = KFold(n_splits=k, shuffle=True, random_state=42)

        # Select the experiment once before the folds; set_experiment is idempotent
        mlflow.set_experiment("UFC_KFold_CV")

        all_fold_metrics = []
        # KFold only samples subsets of events here; its own test indices are
        # discarded because each fold holds out its chronologically last events.
        for fold_idx, (train_event_idx, _) in enumerate(kf.split(event_list), start=1):
            train_events = [event_list[i] for i in train_event_idx]

            # Collect fights that belong to the training events
            fold_fights = [f for f in fights if f['event_name'] in train_events]

            # Inside this fold, reserve the last `holdout_events` events for testing
            fold_events_ordered = list(OrderedDict.fromkeys(f['event_name'] for f in fold_fights))
            test_events = fold_events_ordered[-holdout_events:]

            train_set = [f for f in fold_fights if f['event_name'] not in test_events]
            test_set  = [f for f in fold_fights if f['event_name'] in test_events]

            # Start an MLflow run for the current fold
            with mlflow.start_run(run_name=f"fold_{fold_idx}"):
                # Log meta information about the fold
                mlflow.log_param("fold", fold_idx)
                mlflow.log_param("train_events", len(train_events))
                mlflow.log_param("test_events", holdout_events)

                fold_results = {}
                for model in self.models:
                    model_name = model.__class__.__name__

                    # Train and evaluate
                    model.train(train_set)
                    correct = 0
                    total_fights = 0
                    for fight in test_set:
                        if fight['winner'] not in ["Draw", "NC", ""]:
                            prediction = model.predict(fight)
                            if prediction.get('winner') == fight['winner']:
                                correct += 1
                            total_fights += 1

                    acc = correct / total_fights if total_fights > 0 else 0.0
                    fold_results[model_name] = acc

                    # Log metrics and register model to appear in MLflow Models tab
                    mlflow.log_metric(f"accuracy_{model_name}", acc)
                    mlflow.log_metric(f"total_fights_{model_name}", total_fights)
                    
                    # Register the model with MLflow to appear in Models tab
                    mlflow.sklearn.log_model(
                        model, 
                        f"model_{model_name}",
                        registered_model_name=f"{model_name}_UFC_Model"
                    )

                all_fold_metrics.append(fold_results)

        return all_fold_metrics
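
    # Example usage (SomeModel is a hypothetical BaseModel subclass):
    #   pipeline = PredictionPipeline([SomeModel()])
    #   fold_metrics = pipeline.run_kfold_cv(k=5, holdout_events=2)
    #   # -> [{"SomeModel": 0.62}, {"SomeModel": 0.58}, ...] (one dict per fold)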

    def update_models_if_new_data(self):
        """
        Checks for new data and retrains/saves all models on the full dataset if needed.
        This does not run any evaluation.
        """
        print("\n--- Checking for Model Updates ---")
        
        # Check if any model files are missing or invalid
        missing_models = [m for m in self.models if not self._model_exists(m)]
        has_new_data = self._has_new_data_since_last_training()

        if missing_models:
            missing_names = [m.__class__.__name__ for m in missing_models]
            print(f"Missing or invalid model files found for: {missing_names}.")
            self._train_and_save_models()
        elif has_new_data:
            print("New data detected, retraining all models...")
            self._train_and_save_models()
        else:
            print("No new data detected. Models are already up-to-date.")

    def _train_and_save_models(self):
        """Trains all models on the full dataset and saves them."""
        print("\n\n--- Training and Saving All Models on Full Dataset ---")

        if not os.path.exists(FIGHTS_CSV_PATH):
            print(f"Error: Fights data not found at '{FIGHTS_CSV_PATH}'. Cannot save models.")
            return
            
        with open(FIGHTS_CSV_PATH, 'r', encoding='utf-8') as f:
            all_fights = list(csv.DictReader(f))
        
        print(f"Training models on all {len(all_fights)} available fights...")

        if not os.path.exists(MODELS_DIR):
            os.makedirs(MODELS_DIR)
            print(f"Created directory: {MODELS_DIR}")

        # Get the latest event info for tracking
        if all_fights:
            all_fights.sort(key=lambda x: datetime.strptime(x['event_date'], '%B %d, %Y'))
            latest_fight = all_fights[-1]
            latest_event_name = latest_fight['event_name']
            latest_event_date = latest_fight['event_date']

        for model in self.models:
            model_name = model.__class__.__name__
            print(f"\n--- Training: {model_name} ---")
            model.train(all_fights)
            
            # Serialize the trained model to disk
            file_name = f"{model_name}.joblib"
            save_path = os.path.join(MODELS_DIR, file_name)
            joblib.dump(model, save_path)
            print(f"Model saved successfully to {save_path}")

        # Save the last trained event info
        if all_fights:
            self._save_last_trained_event(latest_event_name, latest_event_date)
            print(f"Updated last trained event: {latest_event_name} ({latest_event_date})")

    def _report_summary(self):
        """Prints a concise summary of model performance."""
        print("\n\n--- Prediction Pipeline Summary ---")
        print(f"{'Model':<25} | {'Accuracy':<10} | {'Fights Evaluated':<20} | {'Status':<15}")
        print("-" * 80)
        for model_name, result in self.results.items():
            status = result.get('model_status', 'unknown')
            print(f"{model_name:<25} | {result['accuracy']:<9.2f}% | {result['total_fights']:<20} | {status:<15}")
        print("-" * 80)

    def _save_report_to_json(self, file_path=MODEL_RESULTS_PATH):
        """Saves the detailed prediction results to a JSON file."""
        print(f"\nSaving detailed report to {file_path}...")
        try:
            # Create a report structure that is clean and JSON-friendly
            report = {}
            for model_name, result in self.results.items():
                
                # Group predictions by event for a more organized report
                predictions_by_event = {}
                for p in result['predictions']:
                    p = dict(p)  # copy first so self.results is not mutated by the pop below
                    event_name = p.pop('event')  # extract the event to use as the grouping key
                    if event_name not in predictions_by_event:
                        predictions_by_event[event_name] = []
                    predictions_by_event[event_name].append(p)

                report[model_name] = {
                    "overall_accuracy": f"{result['accuracy']:.2f}%",
                    "total_fights_evaluated": result['total_fights'],
                    "model_status": result.get('model_status', 'unknown'),
                    "predictions_by_event": predictions_by_event
                }

            with open(file_path, 'w', encoding='utf-8') as f:
                json.dump(report, f, indent=4)
            print("Report saved successfully.")
        except (IOError, TypeError) as e:
            print(f"Error saving report to JSON file: {e}")

    def _report_detailed_results(self):
        """Prints a summary and saves the detailed report to a file."""
        print("\n\n--- Prediction Pipeline Finished: Detailed Report ---")
        # A summary is printed to the console for convenience.
        self._report_summary()
        # The detailed report is now saved to a JSON file.
        self._save_report_to_json()
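
# Typical end-to-end usage from elsewhere in the package (module and model
# names below are illustrative, not part of this file):
#
#   from .pipeline import PredictionPipeline
#   from .models import SomeConcreteModel   # any BaseModel subclass
#
#   pipeline = PredictionPipeline([SomeConcreteModel()], use_existing_models=True)
#   pipeline.run(detailed_report=True)      # evaluate on the most recent event(s)
#   pipeline.update_models_if_new_data()    # retrain and persist if new fights landed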