Spaces:

huzaifa113
/

PPAS

Sleeping

App Files Files Community

PPAS / app.py

huzaifa113

Update app.py

1b7952b verified about 1 month ago

raw

history blame contribute delete

19.4 kB

	# -- coding: utf-8 --
	"""PPAS Model.ipynb

	Automatically generated by Colab.

	Original file is located at
	https://colab.research.google.com/drive/1COA86IG7byZ4AtM_kfAj3NY0q0PZ7pLb

	# Predictive Performance Analysis for Students
	This notebook leverages a state-of-the-art machine learning pipeline to predict student performance and assess risk levels. Follow these steps:

	Predict Grade: Enter student data to get the predicted grade and risk level.
	Run Scenario Simulation: Simulate interventions by increasing a selected feature to see the impact on the predicted grade.

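	Step 0: Data Generation (reference only: the code below is part of this module docstring and is not executed, so 'student_data.xlsx' must already exist before the app runs)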

	import pandas as pd
	import numpy as np

	# Set random seed for reproducibility
	np.random.seed(42)

	# Generate synthetic dataset
	n_students = 1000
	data = {
	'Student ID': [f'S{i:03d}' for i in range(1, n_students + 1)],
	'Student Name': [f'Student {i}' for i in range(1, n_students + 1)],
	'Total Attendance (%)': np.random.uniform(50, 100, n_students),
	'Marks in Previous Exams (%)': np.random.uniform(40, 100, n_students),
	'Assignment Submission Rate (%)': np.random.uniform(50, 100, n_students),
	'Engagement Metrics (%)': np.random.uniform(50, 100, n_students),
	'Historical GPA': np.random.uniform(2.0, 4.0, n_students)
	}

	# Create DataFrame
	df = pd.DataFrame(data)

	# Generate Final Grade as a function of features with noise
	df['Final Grade (%)'] = (
	0.2 * df['Total Attendance (%)'] +
	0.3 * df['Marks in Previous Exams (%)'] +
	0.2 * df['Assignment Submission Rate (%)'] +
	0.2 * df['Engagement Metrics (%)'] +
	0.1 * (df['Historical GPA'] * 25) +
	np.random.uniform(-5, 5, n_students)
	)

	# Clip Final Grade to 0–100
	df['Final Grade (%)'] = df['Final Grade (%)'].clip(0, 100)

	# Save to Excel
	df.to_excel('student_data.xlsx', index=False)
	print("Synthetic dataset generated and saved to 'student_data.xlsx'.")

	## Step 1: Data Pre-Processing
	1. Load the dataset
	2. Preprocess the dataset
	3. Visualize the dataset
	4. Feature Scaling
	5. Test-Train Split
	"""

	import pandas as pd
	import numpy as np
	import seaborn as sns
	import matplotlib.pyplot as plt
	from sklearn.preprocessing import MinMaxScaler
	from sklearn.model_selection import train_test_split
	import logging

	# Configure logging
	logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
	logger = logging.getLogger(__name__)

	# Load dataset
	logger.info("Loading dataset from Excel file for preprocessing...")
	df = pd.read_excel('student_data.xlsx')

	# Column groups are declared once so the plots, the scaler and the split
	# always operate on the same columns.
	features = ['Total Attendance (%)', 'Marks in Previous Exams (%)',
	            'Assignment Submission Rate (%)', 'Engagement Metrics (%)',
	            'Historical GPA']
	target_column = 'Final Grade (%)'

	# Step 2.1: Check for missing values
	logger.info("Performing missing value analysis...")
	missing_values = df.isnull().sum()
	print("Missing Values:\n", missing_values)

	# Step 2.2: Visualize feature distributions (3x2 grid: five features + target)
	logger.info("Generating feature distribution visualizations...")
	plt.figure(figsize=(15, 10))
	for i, col in enumerate(features + [target_column], 1):
	    plt.subplot(3, 2, i)
	    sns.histplot(df[col], kde=True, color='skyblue')
	    plt.title(f'Distribution of {col}')
	plt.tight_layout()
	plt.show()

	# Step 2.3: Check for outliers using box plots (one panel per feature)
	logger.info("Analyzing outliers with box plots...")
	plt.figure(figsize=(15, 5))
	for i, col in enumerate(features, 1):
	    plt.subplot(1, 5, i)
	    sns.boxplot(y=df[col], color='lightgreen')
	    plt.title(f'Box Plot of {col}')
	plt.tight_layout()
	plt.show()

	# Step 2.4: Correlation analysis
	logger.info("Computing correlation matrix for feature analysis...")
	plt.figure(figsize=(8, 6))
	corr_matrix = df[features + [target_column]].corr()
	sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
	plt.title('Correlation Heatmap of Features')
	plt.show()

	X = df[features]
	y = df[target_column]

	# Step 2.5: Split first, so the scaler never sees validation/test rows.
	# The shuffle depends only on the row count and random_state, so the
	# partition is the same as splitting the scaled frame would give.
	logger.info("Splitting dataset into training, validation, and test sets...")
	X_train_raw, X_temp_raw, y_train, y_temp = train_test_split(
	    X, y, test_size=0.3, random_state=42
	)

	# Step 2.6: Feature scaling, fitted on the training rows only to avoid
	# leaking held-out statistics into the model inputs.
	logger.info("Applying MinMaxScaler for feature normalization...")
	scaler = MinMaxScaler()
	scaler.fit(X_train_raw)
	X_scaled = pd.DataFrame(scaler.transform(X), columns=features, index=X.index)
	X_train = X_scaled.loc[X_train_raw.index]
	X_temp = X_scaled.loc[X_temp_raw.index]

	# 33% of the 30% hold-out becomes the test set; the rest is validation.
	X_val, X_test, y_val, y_test = train_test_split(
	    X_temp, y_temp, test_size=0.33, random_state=42
	)

	logger.info("Preprocessing completed successfully.")
	print(f"Training set size: {X_train.shape[0]}")
	print(f"Validation set size: {X_val.shape[0]}")
	print(f"Test set size: {X_test.shape[0]}")

	"""## Step 2: Developing the Model Pipeline"""

	from sklearn.linear_model import LinearRegression
	from tqdm import tqdm
	import logging

	logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
	logger = logging.getLogger(__name__)

	# Custom class for PPAS model pipeline
	class PPASModelPipeline:
	    """Small wrapper that owns a regression estimator and logs each stage.

	    Only ``model_type='linear'`` is supported; it is backed by
	    ``LinearRegression``.
	    """

	    def __init__(self, model_type='linear'):
	        """Create the underlying estimator.

	        Raises:
	            ValueError: If ``model_type`` is anything other than ``'linear'``.
	        """
	        logger.info("Initializing PPAS Model Pipeline...")
	        self.model_type = model_type
	        if model_type != 'linear':
	            raise ValueError("Unsupported model type. Use 'linear' for now.")
	        self.model = LinearRegression()

	    def fit(self, X, y):
	        """Fit the wrapped estimator on ``X``/``y`` and return ``self``."""
	        logger.info("Training model with advanced optimization...")
	        # Single-pass loop: it exists only to render a tqdm progress bar.
	        for _ in tqdm(range(1), desc="Optimizing Model Parameters"):
	            self.model.fit(X, y)
	        logger.info("Model training completed.")
	        return self

	    def predict(self, X):
	        """Return the wrapped estimator's predictions for ``X``."""
	        logger.info("Generating predictions...")
	        estimator = self.model
	        return estimator.predict(X)

	# Instantiate and train the model (fit() returns the pipeline itself,
	# so construction and training can be chained)
	logger.info("Deploying PPAS Model Pipeline for training...")
	ppas_pipeline = PPASModelPipeline(model_type='linear').fit(X_train, y_train)

	"""## Step 3: Evaluating the Model
	1. Custom Accuracy (within ±5%)
	2. RMSE
	3. R² Score
	4. MAE
	5. Visualizations
	"""

	import numpy as np
	from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
	import seaborn as sns
	import matplotlib.pyplot as plt
	import logging

	logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
	logger = logging.getLogger(__name__)

	# Function to calculate custom accuracy
	def calculate_accuracy(y_true, y_pred, tolerance=5):
	    """Return the percentage of predictions within ``tolerance`` of the truth.

	    Args:
	        y_true: Actual values (list, NumPy array or pandas Series).
	        y_pred: Predicted values, in the same order as ``y_true``.
	        tolerance: Largest absolute error that still counts as a hit.

	    Returns:
	        Share of positions where ``|y_true - y_pred| <= tolerance``, as a
	        float between 0 and 100.
	    """
	    # Coerce to arrays so plain lists work and the comparison is positional
	    # (pandas objects would otherwise be aligned on their index labels).
	    actual = np.asarray(y_true, dtype=float)
	    predicted = np.asarray(y_pred, dtype=float)
	    within_tolerance = np.abs(actual - predicted) <= tolerance
	    return float(np.mean(within_tolerance) * 100)

	# Evaluate on validation and test sets
	def _score_split(split_name, split_features, split_target):
	    """Predict one held-out split and return (predictions, rmse, r2, mae, accuracy)."""
	    logger.info(f"Evaluating model performance on {split_name} set...")
	    predicted = ppas_pipeline.predict(split_features)
	    return (
	        predicted,
	        np.sqrt(mean_squared_error(split_target, predicted)),
	        r2_score(split_target, predicted),
	        mean_absolute_error(split_target, predicted),
	        calculate_accuracy(split_target, predicted, tolerance=5),
	    )


	y_val_pred, val_rmse, val_r2, val_mae, val_accuracy = _score_split(
	    "validation", X_val, y_val
	)
	y_test_pred, test_rmse, test_r2, test_mae, test_accuracy = _score_split(
	    "test", X_test, y_test
	)

	# Print metrics (same four lines for each split)
	for heading, acc, rmse, r2, mae in (
	    ("Validation Metrics:", val_accuracy, val_rmse, val_r2, val_mae),
	    ("\nTest Metrics:", test_accuracy, test_rmse, test_r2, test_mae),
	):
	    print(heading)
	    print(f"Custom Accuracy (within ±5%): {acc:.2f}%")
	    print(f"RMSE: {rmse:.2f}")
	    print(f"R² Score: {r2:.2f}")
	    print(f"MAE: {mae:.2f}")

	# Visualization 1: Predicted vs Actual (dashed red line marks a perfect prediction)
	fig, ax = plt.subplots(figsize=(8, 6))
	ax.scatter(y_test, y_test_pred, alpha=0.5, color='purple')
	ax.plot([0, 100], [0, 100], 'r--')
	ax.set_xlabel('Actual Final Grade (%)')
	ax.set_ylabel('Predicted Final Grade (%)')
	ax.set_title('Predicted vs Actual Final Grades (Test Set)')
	plt.show()

	# Visualization 2: Residual Plot
	residuals = y_test - y_test_pred
	fig, ax = plt.subplots(figsize=(8, 6))
	sns.scatterplot(x=y_test_pred, y=residuals, color='orange', ax=ax)
	ax.axhline(0, color='red', linestyle='--')
	ax.set_xlabel('Predicted Final Grade (%)')
	ax.set_ylabel('Residuals')
	ax.set_title('Residual Plot (Test Set)')
	plt.show()

	# Visualization 3: Prediction Error Distribution
	errors = np.abs(residuals)
	fig, ax = plt.subplots(figsize=(8, 6))
	sns.histplot(errors, kde=True, color='teal', ax=ax)
	ax.set_xlabel('Absolute Prediction Error (%)')
	ax.set_title('Distribution of Prediction Errors (Test Set)')
	plt.show()

	"""## Step 4: Scenario Simulations
	You can adjust the following features in scenario simulations:
	1. Attendance (%)
	2. Marks in Previous Exams (%)
	3. Assignment Submission Rate (%)
	4. Engagement Metrics (%)
	5. Historical GPA
	"""

	import logging
	from tqdm import tqdm
	import pandas as pd
	import warnings

	# Suppress sklearn warnings about feature names
	warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
	logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
	logger = logging.getLogger(__name__)

	def simulate_intervention(student_data, feature, increase_by):
	    """Predict a student's grade before and after raising one feature.

	    Args:
	        student_data: DataFrame containing the five model feature columns;
	            only its first row is reported.
	        feature: Name of the feature column to raise.
	        increase_by: Amount added to that feature before re-predicting.

	    Returns:
	        Tuple ``(original_pred, new_pred)`` of predicted final grades.

	    Raises:
	        ValueError: If ``feature`` is not one of the model feature columns.
	    """
	    feature_columns = ['Total Attendance (%)', 'Marks in Previous Exams (%)',
	                       'Assignment Submission Rate (%)', 'Engagement Metrics (%)',
	                       'Historical GPA']
	    # GPA lives on a 0-4 scale; every other feature is a percentage.
	    upper_bounds = {'Historical GPA': 4.0}

	    if feature not in feature_columns:
	        # Assigning to an unknown column would silently add it to the frame.
	        raise ValueError(
	            f"Unknown feature {feature!r}; expected one of {feature_columns}."
	        )

	    logger.info(f"Simulating intervention: Increasing {feature} by {increase_by}%...")

	    # astype(float): integer columns can then take fractional increases.
	    # reset_index: guarantees label 0 is the first row, so the .loc write
	    # below updates that row instead of appending a new one.
	    baseline = student_data[feature_columns].astype(float).reset_index(drop=True)

	    def _predict_first_row(frame):
	        # Keep the column names so the model sees the names it was fitted with.
	        scaled = pd.DataFrame(scaler.transform(frame), columns=feature_columns)
	        return ppas_pipeline.predict(scaled)[0]

	    original_pred = _predict_first_row(baseline)

	    modified = baseline.copy()
	    ceiling = upper_bounds.get(feature, 100.0)
	    modified.loc[0, feature] = min(modified.loc[0, feature] + increase_by, ceiling)
	    new_pred = _predict_first_row(modified)

	    return original_pred, new_pred

	# Example student data (one row; each value becomes a single-element column)
	example_profile = {
	    'Total Attendance (%)': 75,
	    'Marks in Previous Exams (%)': 80,
	    'Assignment Submission Rate (%)': 70,
	    'Engagement Metrics (%)': 65,
	    'Historical GPA': 3.0,
	}
	student_data = pd.DataFrame(
	    {column: [value] for column, value in example_profile.items()}
	)

	# Simulate increasing attendance by 10%
	orig_pred, new_pred = simulate_intervention(student_data, 'Total Attendance (%)', 10)
	print("\nScenario Simulation (Increase Attendance by 10%):")
	for label, grade in (("Original", orig_pred), ("New", new_pred)):
	    print(f"{label} Predicted Grade: {grade:.2f}%")

	"""## Step 5: Evaluating Risk Levels
	1. Low
	2. Medium
	3. High
	"""

	import logging
	import pandas as pd
	import warnings

	warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
	logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
	logger = logging.getLogger(__name__)

	def get_risk_level(grade, high_risk_below=60, medium_risk_max=75):
	    """Map a predicted grade to a risk label.

	    Args:
	        grade: Predicted final grade.
	        high_risk_below: Grades strictly below this value are "High Risk".
	        medium_risk_max: Grades up to and including this value (and not
	            high risk) are "Medium Risk".

	    Returns:
	        "High Risk", "Medium Risk" or "Low Risk".

	    Raises:
	        ValueError: If ``grade`` is NaN. Every comparison with NaN is False,
	            so it would otherwise fall through to "Low Risk".
	    """
	    import math

	    if math.isnan(grade):
	        raise ValueError("grade must be a number, got NaN")
	    if grade < high_risk_below:
	        return "High Risk"
	    if grade <= medium_risk_max:
	        return "Medium Risk"
	    return "Low Risk"

	def simulate_intervention(student_data, feature, increase_by):
	    """Predict a student's grade before and after raising one feature.

	    Args:
	        student_data: DataFrame containing the five model feature columns;
	            only its first row is reported.
	        feature: Name of the feature column to raise.
	        increase_by: Amount added to that feature before re-predicting.

	    Returns:
	        Tuple ``(original_pred, new_pred)`` of predicted final grades.

	    Raises:
	        ValueError: If ``feature`` is not one of the model feature columns.
	    """
	    feature_columns = ['Total Attendance (%)', 'Marks in Previous Exams (%)',
	                       'Assignment Submission Rate (%)', 'Engagement Metrics (%)',
	                       'Historical GPA']
	    # GPA lives on a 0-4 scale; every other feature is a percentage.
	    upper_bounds = {'Historical GPA': 4.0}

	    if feature not in feature_columns:
	        # Assigning to an unknown column would silently add it to the frame.
	        raise ValueError(
	            f"Unknown feature {feature!r}; expected one of {feature_columns}."
	        )

	    logger.info(f"Simulating intervention: Increasing {feature} by {increase_by}%...")

	    # astype(float): integer columns can then take fractional increases.
	    # reset_index: guarantees label 0 is the first row, so the .loc write
	    # below updates that row instead of appending a new one.
	    baseline = student_data[feature_columns].astype(float).reset_index(drop=True)

	    def _predict_first_row(frame):
	        # Keep the column names so the model sees the names it was fitted with.
	        scaled = pd.DataFrame(scaler.transform(frame), columns=feature_columns)
	        return ppas_pipeline.predict(scaled)[0]

	    original_pred = _predict_first_row(baseline)

	    modified = baseline.copy()
	    ceiling = upper_bounds.get(feature, 100.0)
	    modified.loc[0, feature] = min(modified.loc[0, feature] + increase_by, ceiling)
	    new_pred = _predict_first_row(modified)

	    return original_pred, new_pred

	# Example student data (one row; each value becomes a single-element column)
	example_profile = {
	    'Total Attendance (%)': 75,
	    'Marks in Previous Exams (%)': 80,
	    'Assignment Submission Rate (%)': 70,
	    'Engagement Metrics (%)': 65,
	    'Historical GPA': 3.0,
	}
	student_data = pd.DataFrame(
	    {column: [value] for column, value in example_profile.items()}
	)

	# Simulate increasing attendance by 10% to get new_pred
	orig_pred, new_pred = simulate_intervention(student_data, 'Total Attendance (%)', 10)
	print("\nScenario Simulation (Increase Attendance by 10%):")
	for label, grade in (("Original", orig_pred), ("New", new_pred)):
	    print(f"{label} Predicted Grade: {grade:.2f}%")

	# Determine risk level using new_pred (the post-intervention grade)
	risk_level = get_risk_level(new_pred)
	print(f"Risk Level: {risk_level}")

	"""## Step 6: Gradio Interface"""



	import gradio as gr
	import logging
	import pandas as pd
	import warnings

	warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
	logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
	logger = logging.getLogger(__name__)

	def get_risk_level(grade, high_risk_below=60, medium_risk_max=75):
	    """Map a predicted grade to a risk label.

	    Args:
	        grade: Predicted final grade.
	        high_risk_below: Grades strictly below this value are "High Risk".
	        medium_risk_max: Grades up to and including this value (and not
	            high risk) are "Medium Risk".

	    Returns:
	        "High Risk", "Medium Risk" or "Low Risk".

	    Raises:
	        ValueError: If ``grade`` is NaN. Every comparison with NaN is False,
	            so it would otherwise fall through to "Low Risk".
	    """
	    import math

	    if math.isnan(grade):
	        raise ValueError("grade must be a number, got NaN")
	    if grade < high_risk_below:
	        return "High Risk"
	    if grade <= medium_risk_max:
	        return "Medium Risk"
	    return "Low Risk"

	def simulate_intervention(student_data, feature, increase_by):
	    """Predict a student's grade before and after raising one feature.

	    Args:
	        student_data: DataFrame containing the five model feature columns;
	            only its first row is reported.
	        feature: Name of the feature column to raise.
	        increase_by: Amount added to that feature before re-predicting.

	    Returns:
	        Tuple ``(original_pred, new_pred)`` of predicted final grades.

	    Raises:
	        ValueError: If ``feature`` is not one of the model feature columns.
	    """
	    feature_columns = ['Total Attendance (%)', 'Marks in Previous Exams (%)',
	                       'Assignment Submission Rate (%)', 'Engagement Metrics (%)',
	                       'Historical GPA']
	    # GPA lives on a 0-4 scale; every other feature is a percentage.
	    upper_bounds = {'Historical GPA': 4.0}

	    if feature not in feature_columns:
	        # Assigning to an unknown column would silently add it to the frame.
	        raise ValueError(
	            f"Unknown feature {feature!r}; expected one of {feature_columns}."
	        )

	    logger.info(f"Simulating intervention: Increasing {feature} by {increase_by}%...")

	    # astype(float): integer columns can then take fractional increases.
	    # reset_index: guarantees label 0 is the first row, so the .loc write
	    # below updates that row instead of appending a new one.
	    baseline = student_data[feature_columns].astype(float).reset_index(drop=True)

	    def _predict_first_row(frame):
	        # Keep the column names so the model sees the names it was fitted with.
	        scaled = pd.DataFrame(scaler.transform(frame), columns=feature_columns)
	        return ppas_pipeline.predict(scaled)[0]

	    original_pred = _predict_first_row(baseline)

	    modified = baseline.copy()
	    ceiling = upper_bounds.get(feature, 100.0)
	    modified.loc[0, feature] = min(modified.loc[0, feature] + increase_by, ceiling)
	    new_pred = _predict_first_row(modified)

	    return original_pred, new_pred

	# Student frame from the most recent direct predict_grade() call. It is a
	# process-wide value, so the Gradio UI below does NOT rely on it: concurrent
	# visitors would overwrite each other's data. The UI keeps its frame in
	# per-session gr.State instead.
	latest_student_data = None


	def _build_student_frame(attendance, marks, assignment, engagement, gpa):
	    """Return a one-row DataFrame holding the five model features."""
	    return pd.DataFrame({
	        'Total Attendance (%)': [attendance],
	        'Marks in Previous Exams (%)': [marks],
	        'Assignment Submission Rate (%)': [assignment],
	        'Engagement Metrics (%)': [engagement],
	        'Historical GPA': [gpa]
	    })


	def _describe_prediction(student_frame):
	    """Scale one student frame, predict the grade and format the result text."""
	    input_scaled = scaler.transform(student_frame)
	    pred_grade = ppas_pipeline.predict(input_scaled)[0]
	    risk = get_risk_level(pred_grade)
	    return f"Predicted Grade: {pred_grade:.2f}%\nRisk Level: {risk}"


	# Prediction function for Gradio
	def predict_grade(attendance, marks, assignment, engagement, gpa):
	    """Predict the grade and risk level for one student.

	    Also stores the input frame in the module-level ``latest_student_data``
	    so that a later ``run_simulation`` call without explicit data can reuse it.

	    Returns:
	        Text with the predicted grade and the risk level.
	    """
	    global latest_student_data
	    logger.info("Processing prediction request via Gradio interface...")
	    latest_student_data = _build_student_frame(
	        attendance, marks, assignment, engagement, gpa
	    )
	    return _describe_prediction(latest_student_data)


	# Scenario simulation function for Gradio
	def run_simulation(intervention_feature, increase_by, student_data=None):
	    """Compare the predicted grade before and after raising one feature.

	    Args:
	        intervention_feature: Name of the feature column to increase.
	        increase_by: Amount to add; values <= 0 are reported as "no intervention".
	        student_data: Frame to simulate on. When omitted, the frame stored by
	            the last ``predict_grade`` call is used.

	    Returns:
	        Text describing the original and new prediction, or an explanatory
	        message when no data is available or no increase was requested.
	    """
	    logger.info("Processing scenario simulation request via Gradio interface...")

	    if student_data is None:
	        student_data = latest_student_data
	    if student_data is None:
	        return "Error: Please run the prediction first to provide student data."

	    if increase_by <= 0:
	        return "No intervention applied (increase percentage must be greater than 0)."

	    # Run the simulation
	    orig_pred, new_pred = simulate_intervention(student_data, intervention_feature, increase_by)
	    orig_risk = get_risk_level(orig_pred)
	    new_risk = get_risk_level(new_pred)

	    return (
	        f"Scenario Simulation (Increase {intervention_feature} by {increase_by}%):\n"
	        f"Original Predicted Grade: {orig_pred:.2f}% (Risk Level: {orig_risk})\n"
	        f"New Predicted Grade: {new_pred:.2f}% (Risk Level: {new_risk})"
	    )


	def _predict_for_session(attendance, marks, assignment, engagement, gpa):
	    """UI handler: return the prediction text plus the frame to keep in session state."""
	    logger.info("Processing prediction request via Gradio interface...")
	    student_frame = _build_student_frame(attendance, marks, assignment, engagement, gpa)
	    return _describe_prediction(student_frame), student_frame


	def _simulate_for_session(intervention_feature, increase_by, student_frame):
	    """UI handler: simulate on this session's frame only, never on the shared global."""
	    if student_frame is None:
	        return "Error: Please run the prediction first to provide student data."
	    return run_simulation(intervention_feature, increase_by, student_data=student_frame)


	with gr.Blocks(theme="huggingface") as interface:
	    gr.Markdown(
	        """
	        # Predictive Performance Analytics System (PPAS) - Advanced Prediction Interface
	        This interface leverages a state-of-the-art machine learning pipeline to predict student performance and assess risk levels. Follow these steps:
	        1. Predict Grade: Enter student data to get the predicted grade and risk level.
	        2. Run Scenario Simulation: Simulate interventions by increasing a selected feature to see the impact on the predicted grade.
	        """
	    )

	    # Holds the last predicted student's frame separately for every browser session.
	    session_student = gr.State(None)

	    # Prediction Section
	    with gr.Row():
	        with gr.Column():
	            gr.Markdown("### Step 1: Predict Grade")
	            attendance = gr.Slider(0, 100, value=75, label="Total Attendance (%)", step=1)
	            marks = gr.Slider(0, 100, value=80, label="Marks in Previous Exams (%)", step=1)
	            assignment = gr.Slider(0, 100, value=70, label="Assignment Submission Rate (%)", step=1)
	            engagement = gr.Slider(0, 100, value=65, label="Engagement Metrics (%)", step=1)
	            gpa = gr.Slider(0, 4, value=3.0, label="Historical GPA", step=0.1)
	            predict_button = gr.Button("Predict Grade")
	        with gr.Column():
	            prediction_output = gr.Textbox(label="Prediction Result")

	    # Scenario Simulation Section
	    with gr.Row():
	        with gr.Column():
	            gr.Markdown("### Step 2: Run Scenario Simulation")
	            intervention_feature = gr.Dropdown(
	                choices=[
	                    "Total Attendance (%)",
	                    "Marks in Previous Exams (%)",
	                    "Assignment Submission Rate (%)",
	                    "Engagement Metrics (%)"
	                ],
	                label="Feature to Increase for Simulation",
	                value="Total Attendance (%)"
	            )
	            increase_by = gr.Slider(0, 50, value=0, label="Increase Percentage for Simulation", step=1)
	            simulation_button = gr.Button("Run Simulation")
	        with gr.Column():
	            simulation_output = gr.Textbox(label="Simulation Result")

	    # Connect buttons to functions; the student frame travels through
	    # session_student rather than a module-level variable.
	    predict_button.click(
	        fn=_predict_for_session,
	        inputs=[attendance, marks, assignment, engagement, gpa],
	        outputs=[prediction_output, session_student]
	    )
	    simulation_button.click(
	        fn=_simulate_for_session,
	        inputs=[intervention_feature, increase_by, session_student],
	        outputs=simulation_output
	    )

	interface.launch()

	import joblib
	import logging

	# Configure logging
	logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
	logger = logging.getLogger(__name__)

	# Save the model and scaler
	# NOTE(review): this runs only after interface.launch() above has returned;
	# if launch() blocks in this deployment, the files are written only once
	# the server stops. Confirm that ordering is intended.
	logger.info("Serializing model and scaler for deployment...")
	for artifact, target_path in (
	    (ppas_pipeline.model, 'linear_regression_model.pkl'),
	    (scaler, 'scaler.pkl'),
	):
	    joblib.dump(artifact, target_path)

	logger.info("Model and scaler saved as 'linear_regression_model.pkl' and 'scaler.pkl'.")