# Rain_prediction / pages /ML_pipeline.py
# Clone77's picture
# Update pages/ML_pipeline.py
# 8d53abb verified
import streamlit as st
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import math
from PIL import Image
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.preprocessing import RobustScaler, OneHotEncoder,PowerTransformer,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import SGDRegressor,RidgeCV,LassoCV
from sklearn.preprocessing import PolynomialFeatures,FunctionTransformer
from sklearn.ensemble import VotingRegressor,BaggingRegressor,RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')
# Set page configuration.
# Issued before the cached loader below so that it is always the first
# Streamlit command executed by this page.
st.set_page_config(page_title="ML Pipeline", page_icon="⚡", layout="centered")


@st.cache_data
def _load_weather_data(csv_path="weatherAUS.csv"):
    """Parse the weather CSV at ``csv_path`` into a DataFrame.

    The script body runs again on every widget interaction; wrapping the
    read in ``st.cache_data`` means the file is only parsed when the cache
    has no entry for ``csv_path``.
    """
    return pd.read_csv(csv_path)


data = _load_weather_data()
# Working copy used by the EDA pages, which add columns to it in place.
df = data.copy()

# Global CSS for the page. The markup is injected as raw HTML, so the
# <style> element has to be closed explicitly.
st.markdown(
"""
<style>
/* Set background color */
.stApp {
background-color: #015551;
color: white;
}
/* Adjust image size */
.image-container {
display: flex;
justify-content: center;
}
img {
border-radius: 10px;
width: 80%;
}
/* Style text */
.title {
text-align: center;
font-size: 28px;
font-weight: bold;
}
.subtitle {
text-align: center;
font-size: 22px;
font-weight: bold;
}
.content {
text-align: justify;
margin-left: auto;
margin-right: auto;
max-width: 80%;
}
</style>
""",
    unsafe_allow_html=True
)
# Seed the router state: a session without a stored page starts on "main".
st.session_state.setdefault("page", "main")


# Navigation helper shared by every page's buttons.
def navigate_to(page_name):
    """Store ``page_name`` in session state as the page to display."""
    st.session_state["page"] = page_name
# Main page: overview of the pipeline with one button per step.
if st.session_state.page == "main":
    st.markdown("<h1 style='text-align: center; color: #4CAF50;'>Machine Learning Pipeline</h1>", unsafe_allow_html=True)
    # Instructions
    st.write("Click on any step to view details.")
    # Pipeline steps, in display order. Each label maps to a page key by
    # lower-casing it and replacing spaces with underscores.
    steps = [
        "Problem Statement", "Data Collection", "Simple EDA", "Data Preprocessing",
        "Advanced EDA", "Model Building", "Model Testing", "Model Deployment"
    ]
    # Layout: four columns; the eight buttons wrap into two rows via i % 4.
    cols = st.columns(4)
    for i, step in enumerate(steps):
        target_page = step.replace(" ", "_").lower()
        with cols[i % 4]:
            # Navigate through an on_click callback: the callback runs before
            # the script is re-executed, so the selected page is rendered on
            # the same click instead of on the following interaction.
            st.button(step, key=step, on_click=navigate_to, args=(target_page,))
elif st.session_state.page == 'problem_statement':
col1, col2 = st.columns([1, 5])
with col1:
st.image("https://cdn-icons-png.flaticon.com/512/1146/1146869.png", width=100, caption="")
with col2:
st.markdown(
"<h1 style='color: #38B6FF; padding-top: 20px;'>Rain Prediction Problem Statement</h1>",
unsafe_allow_html=True
)
# 📌 Context (inside expander)
with st.expander("📌 What’s the Problem?"):
st.markdown("""
Rain has a significant impact on agriculture, transportation, daily life, and the economy.
Yet predicting whether it will rain **tomorrow** remains a challenge.
Traditional weather models are not always accurate for short-term predictions, especially in local regions.
Our goal is to use **machine learning** to predict rainfall using today’s observed weather features.
""")
# 🎯 Objective
st.markdown("### 🎯 Our Goal")
st.success("To build an intelligent system that accurately predicts **whether it will rain tomorrow**, using weather indicators from today.")
# 🧠 Dataset Summary in 2-column layout
st.markdown("### 📂 Dataset Highlights")
col1, col2 = st.columns(2)
with col1:
st.metric(label="🌡️ Temperature Fields", value="4 types")
st.metric(label="💧 Humidity & Rainfall", value="3 features")
st.metric(label="☁️ Cloud & Sunshine", value="3 features")
with col2:
st.metric(label="🌬️ Wind Features", value="4 values")
st.metric(label="🧭 Pressure", value="2 features")
st.metric(label="🟰 Target", value="RainTomorrow")
# Interactive card for impact
st.markdown("### 🌍 Real-World Impact")
with st.container():
col1, col2 = st.columns(2)
with col1:
st.info("👨‍🌾 **Farmers** can make informed decisions on irrigation and harvest.")
st.info("🚗 **Commuters** can plan travel during uncertain weather.")
with col2:
st.info("📦 **Logistics** can prepare for potential rain disruptions.")
st.info("🏛️ **Government bodies** can alert regions vulnerable to floods.")
# Optional animation or visual
#st.image("https://cdn-icons-png.flaticon.com/512/1146/1146869.png", width=100, caption="Smart Weather Forecasting")
# CTA
st.markdown("---")
st.markdown("#### ✅ Ready to try the prediction?")
st.markdown("Click below to head to the app and test it in real time!")
if st.button("🔮 Go to Rain Predictor"):
st.switch_page("Model.py")
st.write("---")
st.write("### 🔍 **What’s Next?**")
st.write("Click the button below to explore how we collect and process AQI data.")
if st.button("➡️ Go to Data Collection"):
st.session_state.page = "data_collection"
if st.button("➡️ Go to Pipeline"):
navigate_to("main")
# **Only execute the content when the selected page is "data_collection"**
elif st.session_state.page == "data_collection":
# Header: Icon + Title
col1, col2 = st.columns([1, 5])
with col1:
st.image("https://cdn-icons-png.flaticon.com/512/2460/2460591.png", width=80)
with col2:
st.markdown("<h1 style='color: #00B4D8;'>📊 Data Collection</h1>", unsafe_allow_html=True)
st.markdown("<p style='font-size:16px;'>How we gathered and structured the weather data for model training.</p>", unsafe_allow_html=True)
st.markdown("---")
# Section 1: Data Source
st.markdown("### 🌐 Data Sources")
st.markdown("""
We used historical weather data from **Kaggle**, which is publicly available and widely used for rainfall prediction challenges.
✅ Open-source
✅ Includes daily weather observations
✅ Covers multiple cities and years
""")
# Section 2: Features Overview
with st.expander("🔍 View Collected Features"):
st.markdown("""
- `Date`
- `Location`
- `MinTemp`, `MaxTemp`, `Temp9am`, `Temp3pm`
- `Rainfall`, `Evaporation`, `Sunshine`
- `WindGustDir`, `WindDir9am`, `WindDir3pm`
- `WindGustSpeed`, `WindSpeed9am`, `WindSpeed3pm`
- `Humidity9am`, `Humidity3pm`
- `Pressure9am`, `Pressure3pm`
- `Cloud9am`, `Cloud3pm`
- `RainToday` (Yes/No) → 🧠 Used to predict `RainTomorrow`
""")
# Section 3: Visual Timeline of Collection
st.markdown("### 🕒 Collection Timeline & Scope")
col1, col2 = st.columns(2)
with col1:
st.success("📍 Locations: 49 Australian cities")
st.info("📆 Date Range: 2007 - 2017")
st.warning("🔴 Missing values handled before training")
with col2:
st.image("https://cdn-icons-png.flaticon.com/512/3222/3222800.png", width=150)
# CTA
st.markdown("---")
# Footer
st.markdown("<hr style='border: 0.5px solid gray;'>", unsafe_allow_html=True)
st.markdown("<p style='text-align: center; color: black;'>• Rain Prediction App •", unsafe_allow_html=True)
# Call-to-Action
if st.button("➡️ Go to have a look on Quality of data"):
navigate_to("simple_eda")
if st.button("➡️ Go to Pipeline"):
navigate_to("main")
# **Other Pages Should Not Display Data Collection Content**
elif st.session_state.page == "simple_eda":
with st.expander("📄 Preview Dataset"):
st.dataframe(df.head())
# Overview
st.markdown("### 🧾 Dataset Summary")
col1, col2 = st.columns(2)
with col1:
st.write("**Shape:**", df.shape)
st.write("**Columns:**", df.columns.tolist())
st.dataframe(df.dtypes)
with col2:
st.write("**Missing Values (%):**")
st.dataframe((df.isnull().mean() * 100).round(2))
st.markdown("#### ✅ Next Step: Ready to clean and prepare the data?")
if st.button("🧹 Go to Data Cleaning"):
navigate_to("data_preprocessing")
if st.button("➡️ Go to Pipeline"):
navigate_to("main")
elif st.session_state.page == "data_preprocessing":
col1, col2 = st.columns([1, 5])
with col1:
st.image("https://cdn-icons-png.flaticon.com/512/3242/3242257.png", width=80)
with col2:
st.markdown("<h1 style='color: #00C897;'>🧹 Data Cleaning</h1>", unsafe_allow_html=True)
st.markdown("<p style='font-size:16px;'>Making our weather dataset ready for ML magic!</p>", unsafe_allow_html=True)
st.markdown("---")
# Step 1: Describe cleaning workflow
st.markdown("### 🧼 Cleaning Workflow")
st.write("🛠️ Step-by-step Cleaning Process")
st.markdown("""
1. **Missing Value Handling**
- Dropped rows/columns with excessive missing values
- Used mean/median imputation for numeric columns
- Used mode or 'Unknown' for categorical columns
2. **Categorical Encoding**
- One-hot encoded wind directions (`WindGustDir`, `WindDir9am`, `WindDir3pm`)
- Binary encoding for `RainToday`
3. **Scaling**
- Used `RobustScaler` to reduce the impact of outliers
- Applied scaling only to numeric columns
4. **Feature Selection**
- Removed unimportant columns (`Date`, `Location`)
- Ensured feature-target split
""")
# Step 2: Sample before-after view
st.markdown("### 🧾 Sample Data Before & After Cleaning")
col1, col2 = st.columns(2)
with col1:
st.markdown("#### 🟥 Raw Data")
raw_data = {
'MinTemp': [14.1, None],
'MaxTemp': [26.5, 24.3],
'Rainfall': [0.0, 1.2],
'WindGustDir': ['W', None],
'RainToday': ['No', 'Yes']
}
st.dataframe(pd.DataFrame(raw_data))
with col2:
st.markdown("#### 🟩 Cleaned Data")
clean_data = {
'MinTemp': [14.1, 14.1],
'MaxTemp': [26.5, 24.3],
'Rainfall': [0.0, 1.2],
'WindGustDir_W': [1, 0],
'RainToday': [0, 1]
}
st.dataframe(pd.DataFrame(clean_data))
# Footer
st.markdown("<p style='text-align: center; color: gray;'> • Clean Data = Good Model • ", unsafe_allow_html=True)
st.markdown("<hr style='border: 0.5px solid gray;'>", unsafe_allow_html=True)
st.markdown("#### ✅ Data cleaned and ready! Move on to EDA?")
st.write("Click the button below to explore how we collect and process AQI data.")
if st.button("➡️ EDA"):
st.session_state.page = "advanced_eda"
if st.button("➡️ Go to Pipeline"):
navigate_to("main")
elif st.session_state.page == "advanced_eda":
# Preview
# Plot 1: RainTomorrow distribution
# Plot 2: MinTemp vs Rainfall
st.markdown("### 🌡️ Min Temperature vs Rainfall")
fig2 = px.scatter(df, x='MinTemp', y='Rainfall', color='RainTomorrow',
title="MinTemp vs Rainfall (colored by RainTomorrow)",
labels={'MinTemp': 'Minimum Temperature', 'Rainfall': 'Rainfall (mm)'})
st.plotly_chart(fig2, use_container_width=True)
# Plot 3: Correlation Heatmap (numeric only)
st.markdown("### 📊 Correlation Heatmap (Numeric Features)")
numeric_df = df.select_dtypes(include='number').copy()
correlation = numeric_df.corr().round(2).reset_index().melt(id_vars='index')
correlation.columns = ['Feature1', 'Feature2', 'Correlation']
fig3 = px.imshow(
numeric_df[['MinTemp', 'MaxTemp',
'Rainfall', 'Evaporation',
'Sunshine',
'Humidity9am', 'Humidity3pm',
'Pressure9am', 'Pressure3pm',
'Cloud9am', 'Cloud3pm',
'Temp9am', 'Temp3pm']].corr(),
text_auto=True,
color_continuous_scale='RdBu',
aspect='auto',
title="Correlation Heatmap"
)
st.plotly_chart(fig3, use_container_width=True)
# Plot 4: Monthly Rainfall Trend (if Date exists)
if 'Date' in df.columns and 'Rainfall' in df.columns:
st.markdown("### 📆 Average Monthly Rainfall Trend")
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df.dropna(subset=['Date'], inplace=True)
df['Month'] = df['Date'].dt.month
rain_by_month = df.groupby('Month')['Rainfall'].mean().reset_index()
fig4 = px.line(rain_by_month, x='Month', y='Rainfall', markers=True,
title="Average Rainfall by Month",
labels={'Month': 'Month', 'Rainfall': 'Avg Rainfall (mm)'})
st.plotly_chart(fig4, use_container_width=True)
# Plot 5: Optional interactive feature selection
st.markdown("### 🧠 Custom Feature Comparison")
x_col = st.selectbox("📌 Select X-axis", options=df.select_dtypes(include='number').columns)
y_col = st.selectbox("📌 Select Y-axis", options=df.select_dtypes(include='number').columns, index=1)
fig5 = px.scatter(df, x=x_col, y=y_col, color='RainTomorrow',
title=f"{x_col} vs {y_col}", template="plotly_dark")
st.plotly_chart(fig5, use_container_width=True)
# CTA
st.write("---")
st.write("### 🔍 **What's Next?**")
st.write("Click the button below to explore how we collect and process AQI data.")
if st.button("➡️ Go to Model Building"):
st.session_state.page = "model_building"
if st.button("➡️ Go to Pipeline"):
navigate_to("main")
elif st.session_state.page == "model_building":
st.markdown("<h1 style='color:#5C33F6;'>🤖 Model Building Summary</h1>", unsafe_allow_html=True)
st.markdown("<p style='font-size:16px;'>Overview of classification models and performance evaluation</p>", unsafe_allow_html=True)
st.markdown("---")
# Description
st.markdown("### 🔧 Algorithms Used")
st.markdown("""
We explored multiple classification algorithms to predict whether it will rain tomorrow:
- **K-Nearest Neighbors (KNN)**
- **Decision Tree Classifier**
- **Logistic Regression**
Each model was tuned using **Optuna**, a hyperparameter optimization library that efficiently searches the best combination of parameters.
The best version of each model was then used in three ensemble techniques:
- 🗳️ **Voting Classifier**
- 🎯 **Bagging Classifier**
- 🌲 **Random Forest Classifier**
""")
# Performance Table
st.markdown("### 🌟 Ensemble Model Performance (Classification Metrics)")
performance_data = {
"Model": ["Voting Classifier", "Bagging Classifier", "Random Forest Classifier"],
"Accuracy": [0.67, 0.85, 0.84],
"Precision": [0.78, 0.75, 0.8],
"Recall": [0.70, 0.74, 0.87],
"F1 Score": [0.75, 0.78, 0.82]
}
df = pd.DataFrame(performance_data)
st.table(df)
# Visual Comparison
# Display as a table
if st.button("➡️ Go to Model_testing"):
st.session_state.page = "model_testing"
if st.button("➡️ Go to Pipeline"):
navigate_to("main")
elif st.session_state.page == "model_testing":
# Title
st.markdown("<h1 style='color:#EF476F;'>🧪 Model Testing Summary</h1>", unsafe_allow_html=True)
st.markdown("<p style='font-size:16px;'>Final model evaluation on unseen test data</p>", unsafe_allow_html=True)
st.markdown("---")
# Testing Info
st.markdown("### 🧾 Testing Overview")
st.markdown("""
After hyperparameter tuning and model selection, the best-performing model (**Random Forest Classifier**) was evaluated on a separate **20% test dataset**.
The metrics below represent its performance on real unseen data.
""")
st.code('''model = RandomForestClassifier(bootstrap=True,min_impurity_decrease=0.045568,
max_features='log2',n_estimators=213,min_samples_split=9,min_weight_fraction_leaf=0.082159)''')
# Metrics table
st.markdown("### 📈 Evaluation Metrics")
test_results = {
"Metric": ["Accuracy", "Precision", "Recall", "F1 Score", "ROC-AUC Score"],
"Score": [0.89, 0.88, 0.87, 0.88, 0.91]
}
metrics_df = pd.DataFrame(test_results)
st.dataframe(metrics_df)
# Confusion Matrix (static representation)
# ROC Curve (sample)
st.markdown("### 📉 ROC-AUC Curve")
fpr = [0.0, 0.1, 0.2, 0.4, 1.0]
tpr = [0.0, 0.6, 0.8, 0.9, 1.0]
fig2 = go.Figure()
fig2.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines+markers', name='ROC Curve', line=dict(color='green')))
fig2.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Random Baseline', line=dict(dash='dash')))
fig2.update_layout(title="ROC-AUC Curve", xaxis_title="False Positive Rate", yaxis_title="True Positive Rate")
st.plotly_chart(fig2, use_container_width=True)
# Classification Report Table (optional)
st.markdown("### 🧾 Classification Report (Summary)")
report = pd.DataFrame({
'Class': ['No Rain', 'Rain'],
'Precision': [0.88, 0.87],
'Recall': [0.90, 0.85],
'F1 Score': [0.89, 0.86],
'Support': [940, 560]
})
st.dataframe(report.style.format(precision=2))
# Footer
st.markdown("<hr style='border: 0.5px solid gray;'>", unsafe_allow_html=True)
st.markdown("<p style='text-align: center; color: gray;'>Rain Prediction App • Final Model Testing Results</p>", unsafe_allow_html=True)
if st.button("➡️ Go to Model_deployment"):
st.session_state.page = "model_deployment"
if st.button("➡️ Go to Pipeline"):
navigate_to("main")
elif st.session_state.page == "model_deployment":
st.write("This model is deployed on huggingface using streamlit library.")
st.markdown('CLick below to see the working model👇 ')
if st.button("Go to model"):
st.switch_page("Model.py")
if st.button("➡️ Go to Pipeline"):
navigate_to("main")