Spaces:
Sleeping
Sleeping
# -*- coding: utf-8 -*- | |
"""PPAS Model.ipynb | |
Automatically generated by Colab. | |
Original file is located at | |
https://colab.research.google.com/drive/1COA86IG7byZ4AtM_kfAj3NY0q0PZ7pLb | |
# **Predictive Performance Analysis for Students** | |
This notebook leverages a state-of-the-art machine learning pipeline to predict student performance and assess risk levels. Follow these steps:
1. Predict Grade: Enter student data to get the predicted grade and risk level.
2. Run Scenario Simulation: Simulate interventions by increasing a selected feature to see the impact on the predicted grade.
Step 0: Data Generation | |
import pandas as pd | |
import numpy as np | |
# Set random seed for reproducibility | |
np.random.seed(42) | |
# Generate synthetic dataset | |
n_students = 1000 | |
data = { | |
'Student ID': [f'S{i:03d}' for i in range(1, n_students + 1)], | |
'Student Name': [f'Student {i}' for i in range(1, n_students + 1)], | |
'Total Attendance (%)': np.random.uniform(50, 100, n_students), | |
'Marks in Previous Exams (%)': np.random.uniform(40, 100, n_students), | |
'Assignment Submission Rate (%)': np.random.uniform(50, 100, n_students), | |
'Engagement Metrics (%)': np.random.uniform(50, 100, n_students), | |
'Historical GPA': np.random.uniform(2.0, 4.0, n_students) | |
} | |
# Create DataFrame | |
df = pd.DataFrame(data) | |
# Generate Final Grade as a function of features with noise | |
df['Final Grade (%)'] = ( | |
0.2 * df['Total Attendance (%)'] + | |
0.3 * df['Marks in Previous Exams (%)'] + | |
0.2 * df['Assignment Submission Rate (%)'] + | |
0.2 * df['Engagement Metrics (%)'] + | |
0.1 * (df['Historical GPA'] * 25) + | |
np.random.uniform(-5, 5, n_students) | |
) | |
# Clip Final Grade to 0–100 | |
df['Final Grade (%)'] = df['Final Grade (%)'].clip(0, 100) | |
# Save to Excel | |
df.to_excel('student_data.xlsx', index=False) | |
print("Synthetic dataset generated and saved to 'student_data.xlsx'.") | |
## Step 1: Data Pre-Processing
1. Load the dataset
2. Preprocess the dataset (missing-value check)
3. Visualize the dataset
4. Feature scaling
5. Train/validation/test split
""" | |
import pandas as pd | |
import numpy as np | |
import seaborn as sns | |
import matplotlib.pyplot as plt | |
from sklearn.preprocessing import MinMaxScaler | |
from sklearn.model_selection import train_test_split | |
import logging | |
# Configure logging | |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') | |
logger = logging.getLogger(__name__) | |
# Read the spreadsheet of student records into a DataFrame.
logger.info("Loading dataset from Excel file for preprocessing...")
df = pd.read_excel('student_data.xlsx')

# Step 2.1: Count missing entries per column and show the totals.
logger.info("Performing missing value analysis...")
missing_values = df.isna().sum()
print("Missing Values:\n", missing_values)
# Step 2.2: Histogram (with KDE overlay) for each feature and the target,
# arranged on a 3x2 grid inside a single figure.
logger.info("Generating feature distribution visualizations...")
distribution_columns = [
    'Total Attendance (%)',
    'Marks in Previous Exams (%)',
    'Assignment Submission Rate (%)',
    'Engagement Metrics (%)',
    'Historical GPA',
    'Final Grade (%)',
]
plt.figure(figsize=(15, 10))
for position, column_name in enumerate(distribution_columns, start=1):
    plt.subplot(3, 2, position)
    sns.histplot(df[column_name], kde=True, color='skyblue')
    plt.title(f'Distribution of {column_name}')
plt.tight_layout()
plt.show()
# Step 2.3: One box plot per input feature, side by side, to eyeball outliers.
logger.info("Analyzing outliers with box plots...")
boxplot_columns = [
    'Total Attendance (%)',
    'Marks in Previous Exams (%)',
    'Assignment Submission Rate (%)',
    'Engagement Metrics (%)',
    'Historical GPA',
]
plt.figure(figsize=(15, 5))
for position, column_name in enumerate(boxplot_columns, start=1):
    plt.subplot(1, 5, position)
    sns.boxplot(y=df[column_name], color='lightgreen')
    plt.title(f'Box Plot of {column_name}')
plt.tight_layout()
plt.show()
# Step 2.4: Pairwise correlations between the features and the target,
# rendered as an annotated heatmap.
logger.info("Computing correlation matrix for feature analysis...")
correlation_columns = [
    'Total Attendance (%)',
    'Marks in Previous Exams (%)',
    'Assignment Submission Rate (%)',
    'Engagement Metrics (%)',
    'Historical GPA',
    'Final Grade (%)',
]
corr_matrix = df[correlation_columns].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap of Features')
plt.show()
# Step 2.5 / 2.6: Split the data, then scale the features.
#
# The split happens BEFORE the scaler is fitted so that the scaler's min/max
# come from the training rows only; fitting on the full dataset would leak
# validation/test statistics into preprocessing.
features = ['Total Attendance (%)', 'Marks in Previous Exams (%)',
            'Assignment Submission Rate (%)', 'Engagement Metrics (%)',
            'Historical GPA']
X = df[features]
y = df['Final Grade (%)']

# 70% train; the remaining 30% is split again so that 33% of it becomes the
# test set and the rest the validation set.
logger.info("Splitting dataset into training, validation, and test sets...")
X_train_raw, X_temp_raw, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42)
X_val_raw, X_test_raw, y_val, y_test = train_test_split(
    X_temp_raw, y_temp, test_size=0.33, random_state=42)

logger.info("Applying MinMaxScaler for feature normalization...")
scaler = MinMaxScaler()
scaler.fit(X_train_raw)


def _scale_features(frame):
    """Return `frame` transformed by the fitted scaler, keeping columns and index."""
    return pd.DataFrame(scaler.transform(frame), columns=features, index=frame.index)


X_train = _scale_features(X_train_raw)
X_temp = _scale_features(X_temp_raw)
X_val = _scale_features(X_val_raw)
X_test = _scale_features(X_test_raw)
# Full scaled frame, kept under its original name for any code that reads it.
X_scaled = _scale_features(X)

logger.info("Preprocessing completed successfully.")
print(f"Training set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")
"""## Step 2: Developing the Model Pipeline""" | |
from sklearn.linear_model import LinearRegression | |
from tqdm import tqdm | |
import logging | |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') | |
logger = logging.getLogger(__name__) | |
# Wrapper class that owns the regression model used by the PPAS pipeline
class PPASModelPipeline:
    """Small wrapper around a scikit-learn regressor.

    Only ``model_type='linear'`` is supported; it selects ``LinearRegression``.
    """

    def __init__(self, model_type='linear'):
        """Record the model type and build the underlying estimator.

        Raises:
            ValueError: If ``model_type`` is anything other than ``'linear'``.
        """
        logger.info("Initializing PPAS Model Pipeline...")
        self.model_type = model_type
        if model_type != 'linear':
            raise ValueError("Unsupported model type. Use 'linear' for now.")
        self.model = LinearRegression()

    def fit(self, X, y):
        """Fit the wrapped model on ``X`` and ``y`` and return ``self``."""
        logger.info("Training model with advanced optimization...")
        # Single-iteration loop: its only effect is to show a tqdm progress bar
        # around the one call to the estimator's fit().
        for _ in tqdm(range(1), desc="Optimizing Model Parameters"):
            self.model.fit(X, y)
        logger.info("Model training completed.")
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        logger.info("Generating predictions...")
        return self.model.predict(X)
# Build the pipeline and fit it on the training split. fit() returns the
# pipeline itself, so the call can be chained onto the constructor.
logger.info("Deploying PPAS Model Pipeline for training...")
ppas_pipeline = PPASModelPipeline(model_type='linear').fit(X_train, y_train)
"""## Step 3: Evaluating the Model | |
1. Custom Accuracy (within ±5%) | |
2. RMSE | |
3. R² Score | |
4. MAE | |
5. Visualizations | |
""" | |
import numpy as np | |
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error | |
import seaborn as sns | |
import matplotlib.pyplot as plt | |
import logging | |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') | |
logger = logging.getLogger(__name__) | |
# Custom accuracy: share of predictions that land close enough to the truth
def calculate_accuracy(y_true, y_pred, tolerance=5):
    """Return the percentage of predictions within ``tolerance`` of the target.

    Args:
        y_true: Actual values.
        y_pred: Predicted values, element-wise comparable with ``y_true``.
        tolerance: Largest absolute error that still counts as a hit.

    Returns:
        The mean of the element-wise hit mask, multiplied by 100.
    """
    absolute_error = np.abs(y_true - y_pred)
    hit_mask = absolute_error <= tolerance
    return np.mean(hit_mask) * 100
# Score the trained pipeline on the validation split, then on the test split.
logger.info("Evaluating model performance on validation set...")
y_val_pred = ppas_pipeline.predict(X_val)
val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
val_r2 = r2_score(y_val, y_val_pred)
val_mae = mean_absolute_error(y_val, y_val_pred)
val_accuracy = calculate_accuracy(y_val, y_val_pred, tolerance=5)

logger.info("Evaluating model performance on test set...")
y_test_pred = ppas_pipeline.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_r2 = r2_score(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_accuracy = calculate_accuracy(y_test, y_test_pred, tolerance=5)


def _print_metrics(heading, accuracy, rmse, r2, mae):
    """Print one split's metrics under ``heading``, two decimals each."""
    print(heading)
    print(f"Custom Accuracy (within ±5%): {accuracy:.2f}%")
    print(f"RMSE: {rmse:.2f}")
    print(f"R² Score: {r2:.2f}")
    print(f"MAE: {mae:.2f}")


# Report both splits; the test heading carries a leading blank line.
_print_metrics("Validation Metrics:", val_accuracy, val_rmse, val_r2, val_mae)
_print_metrics("\nTest Metrics:", test_accuracy, test_rmse, test_r2, test_mae)
# All three evaluation figures share the same size.
evaluation_figsize = (8, 6)

# Visualization 1: predictions against actual grades, with the y = x
# reference line drawn from 0 to 100.
plt.figure(figsize=evaluation_figsize)
plt.scatter(y_test, y_test_pred, alpha=0.5, color='purple')
plt.plot([0, 100], [0, 100], 'r--')
plt.xlabel('Actual Final Grade (%)')
plt.ylabel('Predicted Final Grade (%)')
plt.title('Predicted vs Actual Final Grades (Test Set)')
plt.show()

# Visualization 2: residuals (actual minus predicted) against predictions.
# Axis labels are set after the seaborn call so they replace its defaults.
residuals = y_test - y_test_pred
plt.figure(figsize=evaluation_figsize)
sns.scatterplot(x=y_test_pred, y=residuals, color='orange')
plt.axhline(0, color='red', linestyle='--')
plt.xlabel('Predicted Final Grade (%)')
plt.ylabel('Residuals')
plt.title('Residual Plot (Test Set)')
plt.show()

# Visualization 3: distribution of the absolute prediction errors.
errors = np.abs(y_test - y_test_pred)
plt.figure(figsize=evaluation_figsize)
sns.histplot(errors, kde=True, color='teal')
plt.xlabel('Absolute Prediction Error (%)')
plt.title('Distribution of Prediction Errors (Test Set)')
plt.show()
"""## Step 4: Scenario Simulations | |
You can adjust following Scenarios for Scenario Simulations: | |
1. Attendance (%) | |
2. Marks in Previous Exams (%) | |
3. Assignment Submission Rate (%) | |
4. Engagement Metrics (%) | |
5. Historical GPA | |
""" | |
import logging | |
from tqdm import tqdm | |
import pandas as pd | |
import warnings | |
# Suppress sklearn warnings about feature names | |
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn") | |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') | |
logger = logging.getLogger(__name__) | |
def simulate_intervention(student_data, feature, increase_by):
    """Predict a student's grade before and after raising one feature.

    Uses the module-level ``scaler`` and ``ppas_pipeline``.

    Args:
        student_data: DataFrame containing the five model feature columns.
            Only its first row is modified and reported.
        feature: Name of the feature column to increase.
        increase_by: Amount added to the feature's current value.

    Returns:
        Tuple ``(original_pred, new_pred)``: the predicted grade for the
        first row before and after the increase.

    Raises:
        KeyError: If a model feature column is missing from ``student_data``
            or ``feature`` is not one of the model feature columns.
    """
    logger.info(f"Simulating intervention: Increasing {feature} by {increase_by}%...")
    feature_columns = ['Total Attendance (%)', 'Marks in Previous Exams (%)',
                       'Assignment Submission Rate (%)', 'Engagement Metrics (%)',
                       'Historical GPA']
    # astype() returns a new frame, so the caller's data is never mutated.
    # Float columns also let a fractional increase be stored without relying
    # on silent upcasting of an integer column.
    student_data = student_data[feature_columns].astype(float)
    original_pred = ppas_pipeline.predict(scaler.transform(student_data))[0]

    # Percentage features top out at 100; GPA is on a 0-4 scale.
    upper_bound = 4.0 if feature == 'Historical GPA' else 100.0
    column_position = student_data.columns.get_loc(feature)

    # Single-iteration loop: its only effect is to show a tqdm progress bar.
    for _ in tqdm(range(1), desc="Applying Intervention"):
        student_data_modified = student_data.copy()
        # Address the first row by position so the frame's index labels do
        # not matter (a label lookup of 0 breaks on any other index).
        current_value = student_data_modified.iloc[0, column_position]
        student_data_modified.iloc[0, column_position] = min(
            current_value + increase_by, upper_bound)

    new_pred = ppas_pipeline.predict(scaler.transform(student_data_modified))[0]
    return original_pred, new_pred
# One hand-written student record used to demonstrate the simulation.
student_data = pd.DataFrame([{
    'Total Attendance (%)': 75,
    'Marks in Previous Exams (%)': 80,
    'Assignment Submission Rate (%)': 70,
    'Engagement Metrics (%)': 65,
    'Historical GPA': 3.0,
}])

# Raise attendance by 10 and compare the two predicted grades.
orig_pred, new_pred = simulate_intervention(student_data, 'Total Attendance (%)', 10)
print(
    "\nScenario Simulation (Increase Attendance by 10%):",
    f"Original Predicted Grade: {orig_pred:.2f}%",
    f"New Predicted Grade: {new_pred:.2f}%",
    sep="\n",
)
"""## Step 5: Evaluating Risk Levels | |
1. Low | |
2. Medium | |
3. High | |
""" | |
import logging | |
import pandas as pd | |
import warnings | |
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn") | |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') | |
logger = logging.getLogger(__name__) | |
def get_risk_level(grade):
    """Map a predicted grade to a risk label.

    Below 60 is "High Risk", 60 through 75 inclusive is "Medium Risk",
    and anything else is "Low Risk".
    """
    if grade < 60:
        return "High Risk"
    return "Medium Risk" if grade <= 75 else "Low Risk"
def simulate_intervention(student_data, feature, increase_by):
    """Predict a student's grade before and after raising one feature.

    Uses the module-level ``scaler`` and ``ppas_pipeline``.

    Args:
        student_data: DataFrame containing the five model feature columns.
            Only its first row is modified and reported.
        feature: Name of the feature column to increase.
        increase_by: Amount added to the feature's current value.

    Returns:
        Tuple ``(original_pred, new_pred)``: the predicted grade for the
        first row before and after the increase.

    Raises:
        KeyError: If a model feature column is missing from ``student_data``
            or ``feature`` is not one of the model feature columns.
    """
    logger.info(f"Simulating intervention: Increasing {feature} by {increase_by}%...")
    feature_columns = ['Total Attendance (%)', 'Marks in Previous Exams (%)',
                       'Assignment Submission Rate (%)', 'Engagement Metrics (%)',
                       'Historical GPA']
    # astype() returns a new frame, so the caller's data is never mutated.
    # Float columns also let a fractional increase be stored without relying
    # on silent upcasting of an integer column.
    student_data = student_data[feature_columns].astype(float)
    original_pred = ppas_pipeline.predict(scaler.transform(student_data))[0]

    # Percentage features top out at 100; GPA is on a 0-4 scale.
    upper_bound = 4.0 if feature == 'Historical GPA' else 100.0
    column_position = student_data.columns.get_loc(feature)

    student_data_modified = student_data.copy()
    # Address the first row by position so the frame's index labels do not
    # matter (a label lookup of 0 breaks on any other index).
    current_value = student_data_modified.iloc[0, column_position]
    student_data_modified.iloc[0, column_position] = min(
        current_value + increase_by, upper_bound)

    new_pred = ppas_pipeline.predict(scaler.transform(student_data_modified))[0]
    return original_pred, new_pred
# One hand-written student record used to demonstrate simulation plus risk.
student_data = pd.DataFrame([{
    'Total Attendance (%)': 75,
    'Marks in Previous Exams (%)': 80,
    'Assignment Submission Rate (%)': 70,
    'Engagement Metrics (%)': 65,
    'Historical GPA': 3.0,
}])

# Raise attendance by 10 and compare the two predicted grades.
orig_pred, new_pred = simulate_intervention(student_data, 'Total Attendance (%)', 10)
print(
    "\nScenario Simulation (Increase Attendance by 10%):",
    f"Original Predicted Grade: {orig_pred:.2f}%",
    f"New Predicted Grade: {new_pred:.2f}%",
    sep="\n",
)

# The risk label is derived from the post-intervention prediction.
risk_level = get_risk_level(new_pred)
print(f"Risk Level: {risk_level}")
"""## Step 6: Gradio Interface""" | |
import gradio as gr | |
import logging | |
import pandas as pd | |
import warnings | |
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn") | |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') | |
logger = logging.getLogger(__name__) | |
def get_risk_level(grade):
    """Return the risk label for a predicted grade.

    "High Risk" below 60, "Medium Risk" from 60 through 75 inclusive,
    "Low Risk" otherwise.
    """
    return (
        "High Risk" if grade < 60
        else "Medium Risk" if grade <= 75
        else "Low Risk"
    )
def simulate_intervention(student_data, feature, increase_by):
    """Predict a student's grade before and after raising one feature.

    Uses the module-level ``scaler`` and ``ppas_pipeline``.

    Args:
        student_data: DataFrame containing the five model feature columns.
            Only its first row is modified and reported.
        feature: Name of the feature column to increase.
        increase_by: Amount added to the feature's current value.

    Returns:
        Tuple ``(original_pred, new_pred)``: the predicted grade for the
        first row before and after the increase.

    Raises:
        KeyError: If a model feature column is missing from ``student_data``
            or ``feature`` is not one of the model feature columns.
    """
    logger.info(f"Simulating intervention: Increasing {feature} by {increase_by}%...")
    feature_columns = ['Total Attendance (%)', 'Marks in Previous Exams (%)',
                       'Assignment Submission Rate (%)', 'Engagement Metrics (%)',
                       'Historical GPA']
    # astype() returns a new frame, so the caller's data is never mutated.
    # Float columns also let a fractional increase be stored without relying
    # on silent upcasting of an integer column.
    student_data = student_data[feature_columns].astype(float)
    original_pred = ppas_pipeline.predict(scaler.transform(student_data))[0]

    # Percentage features top out at 100; GPA is on a 0-4 scale.
    upper_bound = 4.0 if feature == 'Historical GPA' else 100.0
    column_position = student_data.columns.get_loc(feature)

    student_data_modified = student_data.copy()
    # Address the first row by position so the frame's index labels do not
    # matter (a label lookup of 0 breaks on any other index).
    current_value = student_data_modified.iloc[0, column_position]
    student_data_modified.iloc[0, column_position] = min(
        current_value + increase_by, upper_bound)

    new_pred = ppas_pipeline.predict(scaler.transform(student_data_modified))[0]
    return original_pred, new_pred
# Most recent student record submitted through the prediction form; the
# simulation handler reads it. None until the first prediction is made.
latest_student_data = None


# Gradio callback for the "Predict Grade" button
def predict_grade(attendance, marks, assignment, engagement, gpa):
    """Predict a grade and risk level for one student.

    Stores the submitted values in the module-level ``latest_student_data``
    as a one-row DataFrame, scales them with the module-level ``scaler`` and
    predicts with ``ppas_pipeline``.

    Returns:
        A two-line string with the predicted grade (two decimals) and the
        risk label.
    """
    global latest_student_data
    logger.info("Processing prediction request via Gradio interface...")

    submitted_values = {
        'Total Attendance (%)': attendance,
        'Marks in Previous Exams (%)': marks,
        'Assignment Submission Rate (%)': assignment,
        'Engagement Metrics (%)': engagement,
        'Historical GPA': gpa,
    }
    # One-row frame: every value is wrapped in a single-element list.
    latest_student_data = pd.DataFrame(
        {column: [value] for column, value in submitted_values.items()}
    )

    pred_grade = ppas_pipeline.predict(scaler.transform(latest_student_data))[0]
    risk = get_risk_level(pred_grade)
    return "\n".join([
        f"Predicted Grade: {pred_grade:.2f}%",
        f"Risk Level: {risk}",
    ])
# Gradio callback for the "Run Simulation" button
def run_simulation(intervention_feature, increase_by):
    """Run a what-if simulation on the most recently predicted student.

    Args:
        intervention_feature: Name of the feature column to increase.
        increase_by: Amount to add to that feature.

    Returns:
        A message string: an error if no prediction has been made yet, a
        notice if ``increase_by`` is not positive, otherwise the original and
        new predicted grades with their risk labels.
    """
    logger.info("Processing scenario simulation request via Gradio interface...")

    # Guard clauses: nothing to simulate without stored data or a positive step.
    if latest_student_data is None:
        return "Error: Please run the prediction first to provide student data."
    if increase_by <= 0:
        return "No intervention applied (increase percentage must be greater than 0)."

    orig_pred, new_pred = simulate_intervention(
        latest_student_data, intervention_feature, increase_by
    )
    orig_risk = get_risk_level(orig_pred)
    new_risk = get_risk_level(new_pred)

    report_lines = [
        f"Scenario Simulation (Increase {intervention_feature} by {increase_by}%):",
        f"Original Predicted Grade: {orig_pred:.2f}% (Risk Level: {orig_risk})",
        f"New Predicted Grade: {new_pred:.2f}% (Risk Level: {new_risk})",
    ]
    return "\n".join(report_lines)
# Features offered in the simulation dropdown.
simulation_feature_choices = [
    "Total Attendance (%)",
    "Marks in Previous Exams (%)",
    "Assignment Submission Rate (%)",
    "Engagement Metrics (%)",
]

# Two-row layout: a prediction form on top and a scenario simulation form
# below, each with its inputs in the left column and a result box on the right.
with gr.Blocks(theme="huggingface") as interface:
    gr.Markdown(
        """
        # Predictive Performance Analytics System (PPAS) - Advanced Prediction Interface
        This interface leverages a state-of-the-art machine learning pipeline to predict student performance and assess risk levels. Follow these steps:
        1. **Predict Grade**: Enter student data to get the predicted grade and risk level.
        2. **Run Scenario Simulation**: Simulate interventions by increasing a selected feature to see the impact on the predicted grade.
        """
    )

    # Prediction section
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Step 1: Predict Grade")
            attendance = gr.Slider(
                minimum=0, maximum=100, value=75, step=1,
                label="Total Attendance (%)")
            marks = gr.Slider(
                minimum=0, maximum=100, value=80, step=1,
                label="Marks in Previous Exams (%)")
            assignment = gr.Slider(
                minimum=0, maximum=100, value=70, step=1,
                label="Assignment Submission Rate (%)")
            engagement = gr.Slider(
                minimum=0, maximum=100, value=65, step=1,
                label="Engagement Metrics (%)")
            gpa = gr.Slider(
                minimum=0, maximum=4, value=3.0, step=0.1,
                label="Historical GPA")
            predict_button = gr.Button("Predict Grade")
        with gr.Column():
            prediction_output = gr.Textbox(label="Prediction Result")

    # Scenario simulation section
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Step 2: Run Scenario Simulation")
            intervention_feature = gr.Dropdown(
                choices=simulation_feature_choices,
                label="Feature to Increase for Simulation",
                value="Total Attendance (%)",
            )
            increase_by = gr.Slider(
                minimum=0, maximum=50, value=0, step=1,
                label="Increase Percentage for Simulation")
            simulation_button = gr.Button("Run Simulation")
        with gr.Column():
            simulation_output = gr.Textbox(label="Simulation Result")

    # Wire each button to its callback.
    predict_button.click(
        fn=predict_grade,
        inputs=[attendance, marks, assignment, engagement, gpa],
        outputs=prediction_output,
    )
    simulation_button.click(
        fn=run_simulation,
        inputs=[intervention_feature, increase_by],
        outputs=simulation_output,
    )

interface.launch()
import joblib | |
import logging | |
# Configure logging | |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') | |
logger = logging.getLogger(__name__) | |
# Write the fitted estimator and the fitted scaler to disk with joblib.
logger.info("Serializing model and scaler for deployment...")
artifacts_to_save = (
    (ppas_pipeline.model, 'linear_regression_model.pkl'),
    (scaler, 'scaler.pkl'),
)
for artifact, target_path in artifacts_to_save:
    joblib.dump(artifact, target_path)
logger.info("Model and scaler saved as 'linear_regression_model.pkl' and 'scaler.pkl'.")