# Spaces: Sleeping
# (Status text captured from the hosting page when this file was copied; it is not part of the program.)
import streamlit as st | |
import seaborn as sns | |
import pandas as pd | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import plotly.express as px | |
import plotly.graph_objects as go | |
import math | |
from PIL import Image | |
from sklearn.model_selection import train_test_split,cross_validate | |
from sklearn.preprocessing import RobustScaler, OneHotEncoder,PowerTransformer,StandardScaler | |
from sklearn.compose import ColumnTransformer | |
from sklearn.pipeline import Pipeline | |
from sklearn.metrics import mean_squared_error,r2_score | |
from sklearn.neighbors import KNeighborsRegressor | |
from sklearn.tree import DecisionTreeRegressor | |
from sklearn.linear_model import SGDRegressor,RidgeCV,LassoCV | |
from sklearn.preprocessing import PolynomialFeatures,FunctionTransformer | |
from sklearn.ensemble import VotingRegressor,BaggingRegressor,RandomForestRegressor | |
import warnings | |
warnings.filterwarnings('ignore')

# Set page configuration.
# Kept ahead of every other Streamlit call so it remains the first Streamlit
# command executed in the script.
st.set_page_config(page_title="ML Pipeline", page_icon="⚡", layout="centered")


@st.cache_data(show_spinner=False)
def load_data(path="weatherAUS.csv"):
    """Read the weather CSV from ``path`` and return it as a DataFrame.

    The script body is executed again on every user interaction, so the
    result is cached to avoid re-parsing the file on each button click.
    """
    return pd.read_csv(path)


data = load_data()
# Pages work on ``df`` (some modify it in place), so keep ``data`` untouched.
df = data.copy()
# Global CSS: app background/text color, image sizing, and the
# .title / .subtitle / .content text helper classes.
# The <style> element is closed explicitly instead of relying on the browser
# to recover from an unterminated tag.
st.markdown(
    """
<style>
/* Set background color */
.stApp {
background-color: #015551;
color: white;
}
/* Adjust image size */
.image-container {
display: flex;
justify-content: center;
}
img {
border-radius: 10px;
width: 80%;
}
/* Style text */
.title {
text-align: center;
font-size: 28px;
font-weight: bold;
}
.subtitle {
text-align: center;
font-size: 22px;
font-weight: bold;
}
.content {
text-align: justify;
margin-left: auto;
margin-right: auto;
max-width: 80%;
}
</style>
""",
    unsafe_allow_html=True
)
# Initialize session state for navigation
if "page" not in st.session_state:
    st.session_state.page = "main"


# Function to navigate between pages
def navigate_to(page_name):
    """Make ``page_name`` the active page and re-run the script right away.

    The page to render is chosen by the ``if``/``elif`` chain below, which has
    already been evaluated by the time a button handler calls this function.
    Only storing the new key would therefore leave the old page on screen
    until the next interaction; ``st.rerun()`` restarts the script so the
    chain is evaluated again with the new key.

    Args:
        page_name: Key of the page to show, e.g. ``"main"`` or
            ``"data_collection"``.
    """
    st.session_state.page = page_name
    st.rerun()
# Landing page: a heading plus one button per pipeline stage.
if st.session_state.page == "main":
    st.markdown("<h1 style='text-align: center; color: #4CAF50;'>Machine Learning Pipeline</h1>", unsafe_allow_html=True)
    st.write("Click on any step to view details.")

    # Stage labels shown on the buttons. The page key for a stage is its
    # label lower-cased with spaces replaced by underscores.
    steps = [
        "Problem Statement", "Data Collection", "Simple EDA", "Data Preprocessing",
        "Advanced EDA", "Model Building", "Model Testing", "Model Deployment"
    ]

    # Four columns; buttons wrap around so eight stages fill two rows.
    cols = st.columns(4)
    for position, label in enumerate(steps):
        target_page = label.replace(" ", "_").lower()
        column = cols[position % 4]
        with column:
            clicked = st.button(label, key=label)
            if clicked:
                navigate_to(target_page)
elif st.session_state.page == 'problem_statement': | |
col1, col2 = st.columns([1, 5]) | |
with col1: | |
st.image("https://cdn-icons-png.flaticon.com/512/1146/1146869.png", width=100, caption="") | |
with col2: | |
st.markdown( | |
"<h1 style='color: #38B6FF; padding-top: 20px;'>Rain Prediction Problem Statement</h1>", | |
unsafe_allow_html=True | |
) | |
# 📌 Context (inside expander) | |
with st.expander("📌 What’s the Problem?"): | |
st.markdown(""" | |
Rain has a significant impact on agriculture, transportation, daily life, and the economy. | |
Yet predicting whether it will rain **tomorrow** remains a challenge. | |
Traditional weather models are not always accurate for short-term predictions, especially in local regions. | |
Our goal is to use **machine learning** to predict rainfall using today’s observed weather features. | |
""") | |
# 🎯 Objective | |
st.markdown("### 🎯 Our Goal") | |
st.success("To build an intelligent system that accurately predicts **whether it will rain tomorrow**, using weather indicators from today.") | |
# 🧠 Dataset Summary in 2-column layout | |
st.markdown("### 📂 Dataset Highlights") | |
col1, col2 = st.columns(2) | |
with col1: | |
st.metric(label="🌡️ Temperature Fields", value="4 types") | |
st.metric(label="💧 Humidity & Rainfall", value="3 features") | |
st.metric(label="☁️ Cloud & Sunshine", value="3 features") | |
with col2: | |
st.metric(label="🌬️ Wind Features", value="4 values") | |
st.metric(label="🧭 Pressure", value="2 features") | |
st.metric(label="🟰 Target", value="RainTomorrow") | |
# Interactive card for impact | |
st.markdown("### 🌍 Real-World Impact") | |
with st.container(): | |
col1, col2 = st.columns(2) | |
with col1: | |
st.info("👨🌾 **Farmers** can make informed decisions on irrigation and harvest.") | |
st.info("🚗 **Commuters** can plan travel during uncertain weather.") | |
with col2: | |
st.info("📦 **Logistics** can prepare for potential rain disruptions.") | |
st.info("🏛️ **Government bodies** can alert regions vulnerable to floods.") | |
# Optional animation or visual | |
#st.image("https://cdn-icons-png.flaticon.com/512/1146/1146869.png", width=100, caption="Smart Weather Forecasting") | |
# CTA | |
st.markdown("---") | |
st.markdown("#### ✅ Ready to try the prediction?") | |
st.markdown("Click below to head to the app and test it in real time!") | |
if st.button("🔮 Go to Rain Predictor"): | |
st.switch_page("Model.py") | |
st.write("---") | |
st.write("### 🔍 **What’s Next?**") | |
st.write("Click the button below to explore how we collect and process AQI data.") | |
if st.button("➡️ Go to Data Collection"): | |
st.session_state.page = "data_collection" | |
if st.button("➡️ Go to Pipeline"): | |
navigate_to("main") | |
# **Only execute the content when the selected page is "data_collection"** | |
elif st.session_state.page == "data_collection": | |
# Header: Icon + Title | |
col1, col2 = st.columns([1, 5]) | |
with col1: | |
st.image("https://cdn-icons-png.flaticon.com/512/2460/2460591.png", width=80) | |
with col2: | |
st.markdown("<h1 style='color: #00B4D8;'>📊 Data Collection</h1>", unsafe_allow_html=True) | |
st.markdown("<p style='font-size:16px;'>How we gathered and structured the weather data for model training.</p>", unsafe_allow_html=True) | |
st.markdown("---") | |
# Section 1: Data Source | |
st.markdown("### 🌐 Data Sources") | |
st.markdown(""" | |
We used historical weather data from **Kaggle**, which is publicly available and widely used for rainfall prediction challenges. | |
✅ Open-source | |
✅ Includes daily weather observations | |
✅ Covers multiple cities and years | |
""") | |
# Section 2: Features Overview | |
with st.expander("🔍 View Collected Features"): | |
st.markdown(""" | |
- `Date` | |
- `Location` | |
- `MinTemp`, `MaxTemp`, `Temp9am`, `Temp3pm` | |
- `Rainfall`, `Evaporation`, `Sunshine` | |
- `WindGustDir`, `WindDir9am`, `WindDir3pm` | |
- `WindGustSpeed`, `WindSpeed9am`, `WindSpeed3pm` | |
- `Humidity9am`, `Humidity3pm` | |
- `Pressure9am`, `Pressure3pm` | |
- `Cloud9am`, `Cloud3pm` | |
- `RainToday` (Yes/No) → 🧠 Used to predict `RainTomorrow` | |
""") | |
# Section 3: Visual Timeline of Collection | |
st.markdown("### 🕒 Collection Timeline & Scope") | |
col1, col2 = st.columns(2) | |
with col1: | |
st.success("📍 Locations: 49 Australian cities") | |
st.info("📆 Date Range: 2007 - 2017") | |
st.warning("🔴 Missing values handled before training") | |
with col2: | |
st.image("https://cdn-icons-png.flaticon.com/512/3222/3222800.png", width=150) | |
# CTA | |
st.markdown("---") | |
# Footer | |
st.markdown("<hr style='border: 0.5px solid gray;'>", unsafe_allow_html=True) | |
st.markdown("<p style='text-align: center; color: black;'>• Rain Prediction App •", unsafe_allow_html=True) | |
# Call-to-Action | |
if st.button("➡️ Go to have a look on Quality of data"): | |
navigate_to("simple_eda") | |
if st.button("➡️ Go to Pipeline"): | |
navigate_to("main") | |
# **Other Pages Should Not Display Data Collection Content** | |
elif st.session_state.page == "simple_eda": | |
with st.expander("📄 Preview Dataset"): | |
st.dataframe(df.head()) | |
# Overview | |
st.markdown("### 🧾 Dataset Summary") | |
col1, col2 = st.columns(2) | |
with col1: | |
st.write("**Shape:**", df.shape) | |
st.write("**Columns:**", df.columns.tolist()) | |
st.dataframe(df.dtypes) | |
with col2: | |
st.write("**Missing Values (%):**") | |
st.dataframe((df.isnull().mean() * 100).round(2)) | |
st.markdown("#### ✅ Next Step: Ready to clean and prepare the data?") | |
if st.button("🧹 Go to Data Cleaning"): | |
navigate_to("data_preprocessing") | |
if st.button("➡️ Go to Pipeline"): | |
navigate_to("main") | |
elif st.session_state.page == "data_preprocessing": | |
col1, col2 = st.columns([1, 5]) | |
with col1: | |
st.image("https://cdn-icons-png.flaticon.com/512/3242/3242257.png", width=80) | |
with col2: | |
st.markdown("<h1 style='color: #00C897;'>🧹 Data Cleaning</h1>", unsafe_allow_html=True) | |
st.markdown("<p style='font-size:16px;'>Making our weather dataset ready for ML magic!</p>", unsafe_allow_html=True) | |
st.markdown("---") | |
# Step 1: Describe cleaning workflow | |
st.markdown("### 🧼 Cleaning Workflow") | |
st.write("🛠️ Step-by-step Cleaning Process") | |
st.markdown(""" | |
1. **Missing Value Handling** | |
- Dropped rows/columns with excessive missing values | |
- Used mean/median imputation for numeric columns | |
- Used mode or 'Unknown' for categorical columns | |
2. **Categorical Encoding** | |
- One-hot encoded wind directions (`WindGustDir`, `WindDir9am`, `WindDir3pm`) | |
- Binary encoding for `RainToday` | |
3. **Scaling** | |
- Used `RobustScaler` to reduce the impact of outliers | |
- Applied scaling only to numeric columns | |
4. **Feature Selection** | |
- Removed unimportant columns (`Date`, `Location`) | |
- Ensured feature-target split | |
""") | |
# Step 2: Sample before-after view | |
st.markdown("### 🧾 Sample Data Before & After Cleaning") | |
col1, col2 = st.columns(2) | |
with col1: | |
st.markdown("#### 🟥 Raw Data") | |
raw_data = { | |
'MinTemp': [14.1, None], | |
'MaxTemp': [26.5, 24.3], | |
'Rainfall': [0.0, 1.2], | |
'WindGustDir': ['W', None], | |
'RainToday': ['No', 'Yes'] | |
} | |
st.dataframe(pd.DataFrame(raw_data)) | |
with col2: | |
st.markdown("#### 🟩 Cleaned Data") | |
clean_data = { | |
'MinTemp': [14.1, 14.1], | |
'MaxTemp': [26.5, 24.3], | |
'Rainfall': [0.0, 1.2], | |
'WindGustDir_W': [1, 0], | |
'RainToday': [0, 1] | |
} | |
st.dataframe(pd.DataFrame(clean_data)) | |
# Footer | |
st.markdown("<p style='text-align: center; color: gray;'> • Clean Data = Good Model • ", unsafe_allow_html=True) | |
st.markdown("<hr style='border: 0.5px solid gray;'>", unsafe_allow_html=True) | |
st.markdown("#### ✅ Data cleaned and ready! Move on to EDA?") | |
st.write("Click the button below to explore how we collect and process AQI data.") | |
if st.button("➡️ EDA"): | |
st.session_state.page = "advanced_eda" | |
if st.button("➡️ Go to Pipeline"): | |
navigate_to("main") | |
elif st.session_state.page == "advanced_eda": | |
# Preview | |
# Plot 1: RainTomorrow distribution | |
# Plot 2: MinTemp vs Rainfall | |
st.markdown("### 🌡️ Min Temperature vs Rainfall") | |
fig2 = px.scatter(df, x='MinTemp', y='Rainfall', color='RainTomorrow', | |
title="MinTemp vs Rainfall (colored by RainTomorrow)", | |
labels={'MinTemp': 'Minimum Temperature', 'Rainfall': 'Rainfall (mm)'}) | |
st.plotly_chart(fig2, use_container_width=True) | |
# Plot 3: Correlation Heatmap (numeric only) | |
st.markdown("### 📊 Correlation Heatmap (Numeric Features)") | |
numeric_df = df.select_dtypes(include='number').copy() | |
correlation = numeric_df.corr().round(2).reset_index().melt(id_vars='index') | |
correlation.columns = ['Feature1', 'Feature2', 'Correlation'] | |
fig3 = px.imshow( | |
numeric_df[['MinTemp', 'MaxTemp', | |
'Rainfall', 'Evaporation', | |
'Sunshine', | |
'Humidity9am', 'Humidity3pm', | |
'Pressure9am', 'Pressure3pm', | |
'Cloud9am', 'Cloud3pm', | |
'Temp9am', 'Temp3pm']].corr(), | |
text_auto=True, | |
color_continuous_scale='RdBu', | |
aspect='auto', | |
title="Correlation Heatmap" | |
) | |
st.plotly_chart(fig3, use_container_width=True) | |
# Plot 4: Monthly Rainfall Trend (if Date exists) | |
if 'Date' in df.columns and 'Rainfall' in df.columns: | |
st.markdown("### 📆 Average Monthly Rainfall Trend") | |
df['Date'] = pd.to_datetime(df['Date'], errors='coerce') | |
df.dropna(subset=['Date'], inplace=True) | |
df['Month'] = df['Date'].dt.month | |
rain_by_month = df.groupby('Month')['Rainfall'].mean().reset_index() | |
fig4 = px.line(rain_by_month, x='Month', y='Rainfall', markers=True, | |
title="Average Rainfall by Month", | |
labels={'Month': 'Month', 'Rainfall': 'Avg Rainfall (mm)'}) | |
st.plotly_chart(fig4, use_container_width=True) | |
# Plot 5: Optional interactive feature selection | |
st.markdown("### 🧠 Custom Feature Comparison") | |
x_col = st.selectbox("📌 Select X-axis", options=df.select_dtypes(include='number').columns) | |
y_col = st.selectbox("📌 Select Y-axis", options=df.select_dtypes(include='number').columns, index=1) | |
fig5 = px.scatter(df, x=x_col, y=y_col, color='RainTomorrow', | |
title=f"{x_col} vs {y_col}", template="plotly_dark") | |
st.plotly_chart(fig5, use_container_width=True) | |
# CTA | |
st.write("---") | |
st.write("### 🔍 **What's Next?**") | |
st.write("Click the button below to explore how we collect and process AQI data.") | |
if st.button("➡️ Go to Model Building"): | |
st.session_state.page = "model_building" | |
if st.button("➡️ Go to Pipeline"): | |
navigate_to("main") | |
elif st.session_state.page == "model_building": | |
st.markdown("<h1 style='color:#5C33F6;'>🤖 Model Building Summary</h1>", unsafe_allow_html=True) | |
st.markdown("<p style='font-size:16px;'>Overview of classification models and performance evaluation</p>", unsafe_allow_html=True) | |
st.markdown("---") | |
# Description | |
st.markdown("### 🔧 Algorithms Used") | |
st.markdown(""" | |
We explored multiple classification algorithms to predict whether it will rain tomorrow: | |
- **K-Nearest Neighbors (KNN)** | |
- **Decision Tree Classifier** | |
- **Logistic Regression** | |
Each model was tuned using **Optuna**, a hyperparameter optimization library that efficiently searches the best combination of parameters. | |
The best version of each model was then used in three ensemble techniques: | |
- 🗳️ **Voting Classifier** | |
- 🎯 **Bagging Classifier** | |
- 🌲 **Random Forest Classifier** | |
""") | |
# Performance Table | |
st.markdown("### 🌟 Ensemble Model Performance (Classification Metrics)") | |
performance_data = { | |
"Model": ["Voting Classifier", "Bagging Classifier", "Random Forest Classifier"], | |
"Accuracy": [0.67, 0.85, 0.84], | |
"Precision": [0.78, 0.75, 0.8], | |
"Recall": [0.70, 0.74, 0.87], | |
"F1 Score": [0.75, 0.78, 0.82] | |
} | |
df = pd.DataFrame(performance_data) | |
st.table(df) | |
# Visual Comparison | |
# Display as a table | |
if st.button("➡️ Go to Model_testing"): | |
st.session_state.page = "model_testing" | |
if st.button("➡️ Go to Pipeline"): | |
navigate_to("main") | |
elif st.session_state.page == "model_testing": | |
# Title | |
st.markdown("<h1 style='color:#EF476F;'>🧪 Model Testing Summary</h1>", unsafe_allow_html=True) | |
st.markdown("<p style='font-size:16px;'>Final model evaluation on unseen test data</p>", unsafe_allow_html=True) | |
st.markdown("---") | |
# Testing Info | |
st.markdown("### 🧾 Testing Overview") | |
st.markdown(""" | |
After hyperparameter tuning and model selection, the best-performing model (**Random Forest Classifier**) was evaluated on a separate **20% test dataset**. | |
The metrics below represent its performance on real unseen data. | |
""") | |
st.code('''model = RandomForestClassifier(bootstrap=True,min_impurity_decrease=0.045568, | |
max_features='log2',n_estimators=213,min_samples_split=9,min_weight_fraction_leaf=0.082159)''') | |
# Metrics table | |
st.markdown("### 📈 Evaluation Metrics") | |
test_results = { | |
"Metric": ["Accuracy", "Precision", "Recall", "F1 Score", "ROC-AUC Score"], | |
"Score": [0.89, 0.88, 0.87, 0.88, 0.91] | |
} | |
metrics_df = pd.DataFrame(test_results) | |
st.dataframe(metrics_df) | |
# Confusion Matrix (static representation) | |
# ROC Curve (sample) | |
st.markdown("### 📉 ROC-AUC Curve") | |
fpr = [0.0, 0.1, 0.2, 0.4, 1.0] | |
tpr = [0.0, 0.6, 0.8, 0.9, 1.0] | |
fig2 = go.Figure() | |
fig2.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines+markers', name='ROC Curve', line=dict(color='green'))) | |
fig2.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Random Baseline', line=dict(dash='dash'))) | |
fig2.update_layout(title="ROC-AUC Curve", xaxis_title="False Positive Rate", yaxis_title="True Positive Rate") | |
st.plotly_chart(fig2, use_container_width=True) | |
# Classification Report Table (optional) | |
st.markdown("### 🧾 Classification Report (Summary)") | |
report = pd.DataFrame({ | |
'Class': ['No Rain', 'Rain'], | |
'Precision': [0.88, 0.87], | |
'Recall': [0.90, 0.85], | |
'F1 Score': [0.89, 0.86], | |
'Support': [940, 560] | |
}) | |
st.dataframe(report.style.format(precision=2)) | |
# Footer | |
st.markdown("<hr style='border: 0.5px solid gray;'>", unsafe_allow_html=True) | |
st.markdown("<p style='text-align: center; color: gray;'>Rain Prediction App • Final Model Testing Results</p>", unsafe_allow_html=True) | |
if st.button("➡️ Go to Model_deployment"): | |
st.session_state.page = "model_deployment" | |
if st.button("➡️ Go to Pipeline"): | |
navigate_to("main") | |
elif st.session_state.page == "model_deployment": | |
st.write("This model is deployed on huggingface using streamlit library.") | |
st.markdown('CLick below to see the working model👇 ') | |
if st.button("Go to model"): | |
st.switch_page("Model.py") | |
if st.button("➡️ Go to Pipeline"): | |
navigate_to("main") | |