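"""Train a Random Forest classifier on the enhanced heating-mantle dataset.

Loads data/enhanced_mantle_training.csv, tunes hyperparameters with
GridSearchCV, evaluates on a held-out test set, and saves the best model to
models/heating_model_with_risk_score.pkl.
"""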
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from joblib import dump
import pandas as pd
import os
import sys
import logging

# Setup logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Define paths
DATA_PATH = os.path.join(os.path.dirname(__file__), "data", "enhanced_mantle_training.csv")
MODEL_PATH = os.path.join(os.path.dirname(__file__), "models", "heating_model_with_risk_score.pkl")

# Load the enhanced dataset
try:
    df = pd.read_csv(DATA_PATH)
    logger.info("Dataset loaded successfully!")
    logger.info(f"Dataset head:\n{df.head().to_string()}")
except FileNotFoundError:
    logger.error(f"Error: '{DATA_PATH}' not found. Please generate the dataset using generate_data.py.")
    exit(1)

# Verify that required columns are present
required_columns = ["temperature", "duration", "risk_level"]
missing = [col for col in required_columns if col not in df.columns]
if missing:
    logger.error(f"Dataset is missing required columns: {missing}.")
    sys.exit(1)

# Check for missing values
if df.isnull().any().any():
    logger.warning("Dataset contains missing values. Dropping rows with missing data.")
    df = df.dropna()

# Prepare features and target
X = df[["temperature", "duration"]]
y = df["risk_level"]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize Random Forest model
model = RandomForestClassifier(random_state=42)

# Define hyperparameter grid for tuning
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2],
}

# Perform GridSearchCV
try:
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring="accuracy", n_jobs=-1)
    grid_search.fit(X_train, y_train)
    logger.info("Grid search completed successfully!")
    logger.info(f"Best parameters: {grid_search.best_params_}")
    logger.info(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")
except Exception as e:
    logger.error(f"Error during model training: {e}")
    sys.exit(1)

# Use the best model
best_model = grid_search.best_estimator_

# Evaluate on test set
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
logger.info(f"Test set accuracy: {accuracy:.4f}")
logger.info("\nClassification Report:")
logger.info(classification_report(y_test, y_pred))

# Warn if accuracy falls below the 95% target
if accuracy < 0.95:
    logger.warning("Model accuracy is below 95%. Consider generating more data or adjusting model parameters.")

# Save the best model
try:
    os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
    dump(best_model, MODEL_PATH)
    logger.info(f"Model training complete! Model saved as '{MODEL_PATH}'.")
except Exception as e:
    logger.error(f"Error saving the model: {e}")
    sys.exit(1)
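
# Usage sketch (illustrative only; the sample values are hypothetical): load
# the saved model for inference, passing features with the same column names
# used during training.
#
#   from joblib import load
#   model = load(MODEL_PATH)
#   sample = pd.DataFrame([[350.0, 120.0]], columns=["temperature", "duration"])
#   print(model.predict(sample))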