# AlvaroMros's picture
# Add k-fold cross-validation to prediction pipeline
# eb615ca
import argparse
from .pipeline import PredictionPipeline
from .models import (
EloBaselineModel,
LogisticRegressionModel,
XGBoostModel,
SVCModel,
RandomForestModel,
BernoulliNBModel,
LGBMModel
)
# --- Models evaluated by the pipeline ---
# Every class listed below is instantiated once, in order, with no
# constructor arguments. Add or remove a class to change what gets run.
MODELS_TO_RUN = [
    model_cls()
    for model_cls in (
        EloBaselineModel,
        LogisticRegressionModel,
        XGBoostModel,
        SVCModel,
        RandomForestModel,
        BernoulliNBModel,
        LGBMModel,
    )
]
# --- End of model list ---
def main(argv=None):
    """
    Parse command-line options and run the prediction pipeline.

    Depending on ``--kfold`` this either runs 3-fold cross-validation and
    prints the result, or runs the standard pipeline with the report format
    selected by ``--report``.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``.

    Returns:
        0 when the pipeline ran to completion, 1 when it stopped because a
        required data file was missing (``FileNotFoundError``).
    """
    parser = argparse.ArgumentParser(description="UFC Fight Prediction Pipeline")
    parser.add_argument(
        '--report',
        type=str,
        default='detailed',
        choices=['detailed', 'summary'],
        help="Type of report to generate: 'detailed' (file) or 'summary' (console)."
    )
    # NOTE: store_true combined with default=True means this flag never
    # changes the parsed value; it is kept so existing invocations still parse.
    parser.add_argument(
        '--use-existing-models',
        action='store_true',
        default=True,
        help="Use existing saved models if available and no new data (default: True)."
    )
    parser.add_argument(
        '--no-use-existing-models',
        action='store_true',
        default=False,
        help="Force retrain all models from scratch, ignoring existing saved models."
    )
    parser.add_argument(
        '--force-retrain',
        action='store_true',
        default=False,
        help="Force retrain all models even if no new data is available."
    )
    parser.add_argument(
        '--kfold',
        action='store_true',
        help='Run 3-fold CV instead of standard split.'
    )
    # Passing argv through lets callers drive main() programmatically;
    # None keeps the original behaviour of reading sys.argv.
    args = parser.parse_args(argv)

    # Resolve the two opposing flags: --no-use-existing-models always wins.
    use_existing_models = not args.no_use_existing_models and args.use_existing_models
    force_retrain = args.force_retrain

    if args.no_use_existing_models:
        print("No-use-existing-models flag set: All models will be retrained from scratch.")
    elif force_retrain:
        print("Force-retrain flag set: All models will be retrained regardless of new data.")
    elif use_existing_models:
        print("Using existing models if available and no new data detected.")

    # MODELS_TO_RUN is the module-level list of model instances.
    pipeline = PredictionPipeline(
        models=MODELS_TO_RUN,
        use_existing_models=use_existing_models,
        force_retrain=force_retrain
    )

    try:
        if args.kfold:
            cv_results = pipeline.run_kfold_cv(k=3, holdout_events=1)
            print(cv_results)
        else:
            pipeline.run(detailed_report=(args.report == 'detailed'))
    except FileNotFoundError as e:
        print(f"Error: {e}")
        print("Please ensure the required data files exist. You may need to run the scraping and ELO analysis first.")
        # Report the failure to the caller instead of looking like a success.
        return 1
    return 0
if __name__ == '__main__':
    # Use main()'s return value as the process exit status. SystemExit(None)
    # exits with status 0, so a main() that returns nothing is unaffected.
    raise SystemExit(main())