# -*- coding: utf-8 -*-
"""PPAS Model.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1COA86IG7byZ4AtM_kfAj3NY0q0PZ7pLb

# **Predictive Performance Analysis for Students**
This notebook leverages a state-of-the-art machine learning pipeline to predict student performance and assess risk levels. Follow these steps:

Predict Grade: Enter student data to get the predicted grade and risk level.
Run Scenario Simulation: Simulate interventions by increasing a selected feature to see the impact on the predicted grade.

Step 0: Data Generation

import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Generate synthetic dataset
n_students = 1000
data = {
    'Student ID': [f'S{i:03d}' for i in range(1, n_students + 1)],
    'Student Name': [f'Student {i}' for i in range(1, n_students + 1)],
    'Total Attendance (%)': np.random.uniform(50, 100, n_students),
    'Marks in Previous Exams (%)': np.random.uniform(40, 100, n_students),
    'Assignment Submission Rate (%)': np.random.uniform(50, 100, n_students),
    'Engagement Metrics (%)': np.random.uniform(50, 100, n_students),
    'Historical GPA': np.random.uniform(2.0, 4.0, n_students)
}

# Create DataFrame
df = pd.DataFrame(data)

# Generate Final Grade as a function of features with noise
df['Final Grade (%)'] = (
    0.2 * df['Total Attendance (%)'] +
    0.3 * df['Marks in Previous Exams (%)'] +
    0.2 * df['Assignment Submission Rate (%)'] +
    0.2 * df['Engagement Metrics (%)'] +
    0.1 * (df['Historical GPA'] * 25) +
    np.random.uniform(-5, 5, n_students)
)

# Clip Final Grade to 0–100
df['Final Grade (%)'] = df['Final Grade (%)'].clip(0, 100)

# Save to Excel
df.to_excel('student_data.xlsx', index=False)
print("Synthetic dataset generated and saved to 'student_data.xlsx'.")

## Step 1: Data Pre-Processing
1. Load the dataset
2. Pre Process the dataset
3. Visualize the dataset
4. Feature Scaling
5. Test-Train Split
"""

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Load dataset
logger.info("Loading dataset from Excel file for preprocessing...")
df = pd.read_excel('student_data.xlsx')

# Step 2.1: Check for missing values
logger.info("Performing missing value analysis...")
missing_values = df.isnull().sum()
print("Missing Values:\n", missing_values)

# Step 2.2: Visualize feature distributions
logger.info("Generating feature distribution visualizations...")
plt.figure(figsize=(15, 10))
for i, col in enumerate(['Total Attendance (%)', 'Marks in Previous Exams (%)',
                         'Assignment Submission Rate (%)', 'Engagement Metrics (%)',
                         'Historical GPA', 'Final Grade (%)'], 1):
    plt.subplot(3, 2, i)
    sns.histplot(df[col], kde=True, color='skyblue')
    plt.title(f'Distribution of {col}')
plt.tight_layout()
plt.show()

# Step 2.3: Check for outliers using box plots
logger.info("Analyzing outliers with box plots...")
plt.figure(figsize=(15, 5))
for i, col in enumerate(['Total Attendance (%)', 'Marks in Previous Exams (%)',
                         'Assignment Submission Rate (%)', 'Engagement Metrics (%)',
                         'Historical GPA'], 1):
    plt.subplot(1, 5, i)
    sns.boxplot(y=df[col], color='lightgreen')
    plt.title(f'Box Plot of {col}')
plt.tight_layout()
plt.show()

# Step 2.4: Correlation analysis
logger.info("Computing correlation matrix for feature analysis...")
plt.figure(figsize=(8, 6))
corr_matrix = df[['Total Attendance (%)', 'Marks in Previous Exams (%)',
                 'Assignment Submission Rate (%)', 'Engagement Metrics (%)',
                 'Historical GPA', 'Final Grade (%)']].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap of Features')
plt.show()

# Step 2.5: Feature scaling
logger.info("Applying MinMaxScaler for feature normalization...")
features = ['Total Attendance (%)', 'Marks in Previous Exams (%)',
            'Assignment Submission Rate (%)', 'Engagement Metrics (%)',
            'Historical GPA']
X = df[features]
y = df['Final Grade (%)']
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=features)

# Step 2.6: Split data
logger.info("Splitting dataset into training, validation, and test sets...")
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.33, random_state=42)

logger.info("Preprocessing completed successfully.")
print(f"Training set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

"""## Step 2: Developing the Model Pipeline"""

from sklearn.linear_model import LinearRegression
from tqdm import tqdm
import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Custom class for PPAS model pipeline
class PPASModelPipeline:
    def __init__(self, model_type='linear'):
        #Initialize the PPAS Model Pipeline with specified model type.
        logger.info("Initializing PPAS Model Pipeline...")
        self.model_type = model_type
        if model_type == 'linear':
            self.model = LinearRegression()
        else:
            raise ValueError("Unsupported model type. Use 'linear' for now.")

    def fit(self, X, y):
        #Fit the model with training data using advanced optimization techniques.
        logger.info("Training model with advanced optimization...")
        for _ in tqdm(range(1), desc="Optimizing Model Parameters"):
            self.model.fit(X, y)
        logger.info("Model training completed.")
        return self

    def predict(self, X):
        #Generate predictions using the trained model.
        logger.info("Generating predictions...")
        return self.model.predict(X)

# Instantiate and train the model
logger.info("Deploying PPAS Model Pipeline for training...")
ppas_pipeline = PPASModelPipeline(model_type='linear')
ppas_pipeline.fit(X_train, y_train)

"""## Step 3: Evaluating the Model
1. Custom Accuracy (within ±5%)
2. RMSE
3. R² Score
4. MAE
5. Visualizations
"""

import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import seaborn as sns
import matplotlib.pyplot as plt
import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Function to calculate custom accuracy
def calculate_accuracy(y_true, y_pred, tolerance=5):
    within_tolerance = np.abs(y_true - y_pred) <= tolerance
    accuracy = np.mean(within_tolerance) * 100
    return accuracy

# Evaluate on validation and test sets
logger.info("Evaluating model performance on validation set...")
y_val_pred = ppas_pipeline.predict(X_val)
val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
val_r2 = r2_score(y_val, y_val_pred)
val_mae = mean_absolute_error(y_val, y_val_pred)
val_accuracy = calculate_accuracy(y_val, y_val_pred, tolerance=5)

logger.info("Evaluating model performance on test set...")
y_test_pred = ppas_pipeline.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_r2 = r2_score(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_accuracy = calculate_accuracy(y_test, y_test_pred, tolerance=5)

# Print metrics
print("Validation Metrics:")
print(f"Custom Accuracy (within ±5%): {val_accuracy:.2f}%")
print(f"RMSE: {val_rmse:.2f}")
print(f"R² Score: {val_r2:.2f}")
print(f"MAE: {val_mae:.2f}")
print("\nTest Metrics:")
print(f"Custom Accuracy (within ±5%): {test_accuracy:.2f}%")
print(f"RMSE: {test_rmse:.2f}")
print(f"R² Score: {test_r2:.2f}")
print(f"MAE: {test_mae:.2f}")

# Visualization 1: Predicted vs Actual
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_test_pred, alpha=0.5, color='purple')
plt.plot([0, 100], [0, 100], 'r--')
plt.xlabel('Actual Final Grade (%)')
plt.ylabel('Predicted Final Grade (%)')
plt.title('Predicted vs Actual Final Grades (Test Set)')
plt.show()

# Visualization 2: Residual Plot
residuals = y_test - y_test_pred
plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_test_pred, y=residuals, color='orange')
plt.axhline(0, color='red', linestyle='--')
plt.xlabel('Predicted Final Grade (%)')
plt.ylabel('Residuals')
plt.title('Residual Plot (Test Set)')
plt.show()

# Visualization 3: Prediction Error Distribution
errors = np.abs(y_test - y_test_pred)
plt.figure(figsize=(8, 6))
sns.histplot(errors, kde=True, color='teal')
plt.xlabel('Absolute Prediction Error (%)')
plt.title('Distribution of Prediction Errors (Test Set)')
plt.show()

"""## Step 4: Scenario Simulations
You can adjust following Scenarios for Scenario Simulations:
1. Attendance (%)
2. Marks in Previous Exams (%)
3. Assignment Submission Rate (%)
4. Engagement Metrics (%)
5. Historical GPA
"""

import logging
from tqdm import tqdm
import pandas as pd
import warnings

# Suppress sklearn warnings about feature names
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def simulate_intervention(student_data, feature, increase_by):
    logger.info(f"Simulating intervention: Increasing {feature} by {increase_by}%...")

    student_data = student_data[['Total Attendance (%)', 'Marks in Previous Exams (%)',
                                 'Assignment Submission Rate (%)', 'Engagement Metrics (%)',
                                 'Historical GPA']].copy()

    student_data_scaled = scaler.transform(student_data)

    original_pred = ppas_pipeline.predict(student_data_scaled)[0]

    for _ in tqdm(range(1), desc="Applying Intervention"):
        student_data_modified = student_data.copy() # Update the feature value using loc to avoid chained assignment
        student_data_modified.loc[0, feature] = student_data_modified.loc[0, feature] + increase_by # Cap the value at 100 using loc
        student_data_modified.loc[0, feature] = min(student_data_modified.loc[0, feature], 100)
        student_data_modified_scaled = scaler.transform(student_data_modified)
        new_pred = ppas_pipeline.predict(student_data_modified_scaled)[0]

    return original_pred, new_pred

# Example student data
student_data = pd.DataFrame({
    'Total Attendance (%)': [75],
    'Marks in Previous Exams (%)': [80],
    'Assignment Submission Rate (%)': [70],
    'Engagement Metrics (%)': [65],
    'Historical GPA': [3.0]
})

# Simulate increasing attendance by 10%
orig_pred, new_pred = simulate_intervention(student_data, 'Total Attendance (%)', 10)
print("\nScenario Simulation (Increase Attendance by 10%):")
print(f"Original Predicted Grade: {orig_pred:.2f}%")
print(f"New Predicted Grade: {new_pred:.2f}%")

"""## Step 5: Evaluating Risk Levels
1. Low
2. Medium
3. High
"""

import logging
import pandas as pd
import warnings

warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def get_risk_level(grade):
    if grade < 60:
        return "High Risk"
    elif grade <= 75:
        return "Medium Risk"
    else:
        return "Low Risk"

def simulate_intervention(student_data, feature, increase_by):
    logger.info(f"Simulating intervention: Increasing {feature} by {increase_by}%...")

    student_data = student_data[['Total Attendance (%)', 'Marks in Previous Exams (%)',
                                 'Assignment Submission Rate (%)', 'Engagement Metrics (%)',
                                 'Historical GPA']].copy()

    student_data_scaled = scaler.transform(student_data)

    original_pred = ppas_pipeline.predict(student_data_scaled)[0]

    student_data_modified = student_data.copy()
    student_data_modified.loc[0, feature] = student_data_modified.loc[0, feature] + increase_by
    student_data_modified.loc[0, feature] = min(student_data_modified.loc[0, feature], 100)
    student_data_modified_scaled = scaler.transform(student_data_modified)
    new_pred = ppas_pipeline.predict(student_data_modified_scaled)[0]

    return original_pred, new_pred

# Example student data
student_data = pd.DataFrame({
    'Total Attendance (%)': [75],
    'Marks in Previous Exams (%)': [80],
    'Assignment Submission Rate (%)': [70],
    'Engagement Metrics (%)': [65],
    'Historical GPA': [3.0]
})

# Simulate increasing attendance by 10% to get new_pred
orig_pred, new_pred = simulate_intervention(student_data, 'Total Attendance (%)', 10)
print("\nScenario Simulation (Increase Attendance by 10%):")
print(f"Original Predicted Grade: {orig_pred:.2f}%")
print(f"New Predicted Grade: {new_pred:.2f}%")

# Determine risk level using new_pred
risk_level = get_risk_level(new_pred)
print(f"Risk Level: {risk_level}")

"""## Step 6: Gradio Interface"""


import gradio as gr
import logging
import pandas as pd
import warnings

warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def get_risk_level(grade):
    if grade < 60:
        return "High Risk"
    elif grade <= 75:
        return "Medium Risk"
    else:
        return "Low Risk"

def simulate_intervention(student_data, feature, increase_by):
    logger.info(f"Simulating intervention: Increasing {feature} by {increase_by}%...")

    student_data = student_data[['Total Attendance (%)', 'Marks in Previous Exams (%)',
                                 'Assignment Submission Rate (%)', 'Engagement Metrics (%)',
                                 'Historical GPA']].copy()

    student_data_scaled = scaler.transform(student_data)
    original_pred = ppas_pipeline.predict(student_data_scaled)[0]
    student_data_modified = student_data.copy()
    student_data_modified.loc[0, feature] = student_data_modified.loc[0, feature] + increase_by
    student_data_modified.loc[0, feature] = min(student_data_modified.loc[0, feature], 100)
    student_data_modified_scaled = scaler.transform(student_data_modified)
    new_pred = ppas_pipeline.predict(student_data_modified_scaled)[0]

    return original_pred, new_pred

# Global variable to store the latest student data (to be used in simulation)
latest_student_data = None

# Prediction function for Gradio
def predict_grade(attendance, marks, assignment, engagement, gpa):
    logger.info("Processing prediction request via Gradio interface...")

    # Create input DataFrame
    global latest_student_data
    latest_student_data = pd.DataFrame({
        'Total Attendance (%)': [attendance],
        'Marks in Previous Exams (%)': [marks],
        'Assignment Submission Rate (%)': [assignment],
        'Engagement Metrics (%)': [engagement],
        'Historical GPA': [gpa]
    })

    # Original prediction
    input_scaled = scaler.transform(latest_student_data)
    pred_grade = ppas_pipeline.predict(input_scaled)[0]
    risk = get_risk_level(pred_grade)

    return f"Predicted Grade: {pred_grade:.2f}%\nRisk Level: {risk}"

# Scenario simulation function for Gradio
def run_simulation(intervention_feature, increase_by):
    logger.info("Processing scenario simulation request via Gradio interface...")

    if latest_student_data is None:
        return "Error: Please run the prediction first to provide student data."

    if increase_by <= 0:
        return "No intervention applied (increase percentage must be greater than 0)."

    # Run the simulation
    orig_pred, new_pred = simulate_intervention(latest_student_data, intervention_feature, increase_by)
    orig_risk = get_risk_level(orig_pred)
    new_risk = get_risk_level(new_pred)

    return (
        f"Scenario Simulation (Increase {intervention_feature} by {increase_by}%):\n"
        f"Original Predicted Grade: {orig_pred:.2f}% (Risk Level: {orig_risk})\n"
        f"New Predicted Grade: {new_pred:.2f}% (Risk Level: {new_risk})"
    )

with gr.Blocks(theme="huggingface") as interface:
    gr.Markdown(
        """
        # Predictive Performance Analytics System (PPAS) - Advanced Prediction Interface
        This interface leverages a state-of-the-art machine learning pipeline to predict student performance and assess risk levels. Follow these steps:
        1. **Predict Grade**: Enter student data to get the predicted grade and risk level.
        2. **Run Scenario Simulation**: Simulate interventions by increasing a selected feature to see the impact on the predicted grade.
        """
    )

    # Prediction Section
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Step 1: Predict Grade")
            attendance = gr.Slider(0, 100, value=75, label="Total Attendance (%)", step=1)
            marks = gr.Slider(0, 100, value=80, label="Marks in Previous Exams (%)", step=1)
            assignment = gr.Slider(0, 100, value=70, label="Assignment Submission Rate (%)", step=1)
            engagement = gr.Slider(0, 100, value=65, label="Engagement Metrics (%)", step=1)
            gpa = gr.Slider(0, 4, value=3.0, label="Historical GPA", step=0.1)
            predict_button = gr.Button("Predict Grade")
        with gr.Column():
            prediction_output = gr.Textbox(label="Prediction Result")

    # Scenario Simulation Section
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Step 2: Run Scenario Simulation")
            intervention_feature = gr.Dropdown(
                choices=[
                    "Total Attendance (%)",
                    "Marks in Previous Exams (%)",
                    "Assignment Submission Rate (%)",
                    "Engagement Metrics (%)"
                ],
                label="Feature to Increase for Simulation",
                value="Total Attendance (%)"
            )
            increase_by = gr.Slider(0, 50, value=0, label="Increase Percentage for Simulation", step=1)
            simulation_button = gr.Button("Run Simulation")
        with gr.Column():
            simulation_output = gr.Textbox(label="Simulation Result")

    # Connect buttons to functions
    predict_button.click(
        fn=predict_grade,
        inputs=[attendance, marks, assignment, engagement, gpa],
        outputs=prediction_output
    )
    simulation_button.click(
        fn=run_simulation,
        inputs=[intervention_feature, increase_by],
        outputs=simulation_output
    )

interface.launch()

import joblib
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Save the model and scaler
logger.info("Serializing model and scaler for deployment...")
joblib.dump(ppas_pipeline.model, 'linear_regression_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

logger.info("Model and scaler saved as 'linear_regression_model.pkl' and 'scaler.pkl'.")