updated app.py
56a3735
import os
os.environ["STREAMLIT_WATCHER_TYPE"] = "none"
os.environ["STREAMLIT_WATCH_DISABLE"] = "true"
import streamlit as st
from PIL import Image
import torch
import torchvision.transforms as transforms
import pandas as pd
from utils.preprocessing import get_transforms
from models.resnet_model import ResNet18
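
# NOTE (assumption): models/resnet_model.py and utils/preprocessing.py are part of
# this repo and are not shown here. ResNet18(num_classes) is presumably a torchvision
# resnet18 backbone (ImageNet-pretrained, as described in the About tab) with its
# final fully connected layer replaced to output `num_classes` logits, and
# get_transforms() is presumably the torchvision preprocessing pipeline sketched in
# the Predict tab code below.
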
# Class names in order
class_names = [
    'calling', 'clapping', 'cycling', 'dancing', 'drinking', 'eating', 'fighting',
    'hugging', 'laughing', 'listening_to_music', 'running', 'sitting', 'sleeping',
    'texting', 'using_laptop'
]


@st.cache_resource
def load_model():
    """Load the fine-tuned ResNet18 once and cache it across Streamlit reruns."""
    if not os.path.exists("models/best_model.pth"):
        st.error("Model weights not found. Please ensure 'models/best_model.pth' exists.")
        st.stop()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = ResNet18(num_classes=15)
    model.load_state_dict(torch.load("models/best_model.pth", map_location=device))
    model.to(device)
    model.eval()
    return model, device


def predict(image, model, device):
    """Run a single image through the model and return (label, confidence)."""
    transform = get_transforms()
    image_t = transform(image).unsqueeze(0).to(device)  # add batch dimension
    with torch.no_grad():
        outputs = model(image_t)
        probs = torch.softmax(outputs, dim=1)  # logits -> class probabilities
        conf, predicted = torch.max(probs, dim=1)
    return class_names[predicted.item()], float(conf.item())  # type: ignore


def main():
    st.title("Human Action Recognition App")
    tab1, tab2, tab3 = st.tabs(["About", "Predict", "Metrics & Test Predictions"])

    with tab1:
        st.header("About This App")
        st.markdown("""
### 🧠 Human Action Recognition (HAR)
This application classifies **human actions** from static images using a deep learning model trained on a curated dataset of 15 different activities.
#### 🔍 Purpose
To demonstrate how computer vision and deep learning can be used to **recognize and classify human behaviors** in images — useful for applications such as surveillance, activity monitoring, and human-computer interaction.
#### 🧰 Model
- **Architecture:** ResNet18 (Residual Neural Network with 18 layers)
- **Pretrained:** On ImageNet for general features
- **Fine-tuned:** On a specialized Human Action Recognition dataset for task-specific learning
#### 📚 Dataset
- **Source:** [Bingsu/Human_Action_Recognition](https://huggingface.co/datasets/Bingsu/Human_Action_Recognition)
- **Categories:** 15 action classes
- `calling`, `clapping`, `cycling`, `dancing`, `drinking`, `eating`, `fighting`,
`hugging`, `laughing`, `listening_to_music`, `running`, `sitting`,
`sleeping`, `texting`, `using_laptop`
---
🖼️ Simply upload an image, and the model will analyze and classify the dominant action being performed.
""")
with tab2:
st.header("Predict Human Action from Image")
uploaded_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
if uploaded_file is not None:
try:
image = Image.open(uploaded_file).convert("RGB")
except Exception:
st.error("Error loading image. Please upload a valid JPG or PNG file.")
return
st.image(image, caption="Uploaded Image", use_container_width=True)
model, device = load_model()
prediction, confidence = predict(image, model, device)
pred_label = prediction.replace('_', ' ').title()
st.success(f"Predicted Action: **{pred_label}**")
st.info(f"Confidence: {confidence*100:.2f}%")
# Show transformed input
transform = get_transforms()
transformed_image = transform(image)
st.image(transforms.ToPILImage()(transformed_image), caption="Transformed Input (for model)", use_container_width=True)
# Detailed explanation based on predicted output
st.markdown(f"""
### About the Transformed Input for **{pred_label}**
Before the model makes its prediction, the uploaded image undergoes several preprocessing steps to prepare it for analysis:
- **Resizing and cropping:** The image is resized and cropped to a consistent size (usually 224x224 pixels) so that the model receives uniform input dimensions.
- **Normalization:** Pixel color values are scaled based on mean and standard deviation values (typically from ImageNet dataset statistics). This helps the model generalize better by standardizing the input distribution.
- **Conversion to Tensor:** The image is converted from a PIL image to a PyTorch tensor, which is the required input format for the model.

This processed image is exactly what the model "sees" when it predicts the action **{pred_label}**. Understanding this helps ensure the model's input is consistent and reliable.
""")

    with tab3:
        st.header("Training & Validation Metrics")
        st.markdown("""
**Training Accuracy (96.5%)**

During training, the model correctly identified human actions in 96.5% of the images. This indicates it has effectively learned the patterns and features present in the training data.

**Validation Accuracy (96.6%)**

When evaluated on new, unseen images, the model correctly classified 96.6% of them. This demonstrates its ability to generalize knowledge beyond simply memorizing the training examples.

**Training Loss (0.12)**

The average prediction error during training was low (0.12), meaning the model’s guesses are generally close to the true labels.

**Validation Loss (0.10)**

On unseen data, the prediction error was even lower (0.10), suggesting the model is not overfitting but genuinely learning the task.
""")
st.markdown("---")
st.header("Test Set Predictions Preview")
st.markdown("""
The table below presents a sample of the model’s predictions on the test dataset, which consists of images the model has not encountered during training. The columns typically include:
- **Filename:** The name of the test image file
- **Predicted Label:** The human action predicted by the model
- **Confidence:** The model’s confidence score for each prediction

Reviewing this information aids in evaluating the model’s real-world performance and helps identify potential failure cases.
""")
        csv_path = "test_predictions.csv"
        if os.path.exists(csv_path):
            df = pd.read_csv(csv_path)
            st.dataframe(df.head(20))  # show first 20 rows
            st.success(f"Loaded {len(df)} test predictions.")
        else:
            st.warning(f"Test predictions CSV file not found at: {csv_path}")


if __name__ == "__main__":
    main()