import os

# Disable Streamlit's file watcher before streamlit is imported; the watcher
# is known to error out while scanning torch's lazily-loaded submodules.
os.environ["STREAMLIT_WATCHER_TYPE"] = "none"
os.environ["STREAMLIT_WATCH_DISABLE"] = "true"

import streamlit as st
from PIL import Image
import torch
import torchvision.transforms as transforms
import pandas as pd

from utils.preprocessing import get_transforms
from models.resnet_model import ResNet18

# Class names in label-index order; indices must match the order used during training
class_names = [
    'calling', 'clapping', 'cycling', 'dancing', 'drinking', 'eating', 'fighting',
    'hugging', 'laughing', 'listening_to_music', 'running', 'sitting', 'sleeping',
    'texting', 'using_laptop'
]

# Cache the model across Streamlit reruns so weights are loaded from disk only once
@st.cache_resource
def load_model():
    if not os.path.exists("models/best_model.pth"):
        st.error("Model weights not found. Please ensure 'models/best_model.pth' exists.")
        st.stop()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = ResNet18(num_classes=15)
    model.load_state_dict(torch.load("models/best_model.pth", map_location=device))
    model.to(device)
    model.eval()
    return model, device

def predict(image, model, device):
    transform = get_transforms()
    image_t = transform(image).unsqueeze(0).to(device)
    with torch.no_grad():
        outputs = model(image_t)
        probs = torch.softmax(outputs, dim=1)
        conf, predicted = torch.max(probs, dim=1)
    return class_names[predicted.item()], float(conf.item()) # type: ignore
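
# A hedged extra, not used by the app itself: the same forward pass as
# predict(), but returning the k most likely actions with their softmax
# probabilities, which is handy when inspecting low-confidence cases.
# The name predict_topk and the parameter k are illustrative, not an
# existing project API.
def predict_topk(image, model, device, k=3):
    transform = get_transforms()
    image_t = transform(image).unsqueeze(0).to(device)
    with torch.no_grad():
        probs = torch.softmax(model(image_t), dim=1).squeeze(0)
        top_probs, top_idx = torch.topk(probs, min(k, len(class_names)))
    # Pair each class name with its probability, most likely first
    return [(class_names[int(i)], float(p)) for p, i in zip(top_probs, top_idx)]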


def main():
    st.title("Human Action Recognition App")
    tab1, tab2, tab3 = st.tabs(["About", "Predict", "Metrics & Test Predictions"])

    with tab1:
        st.header("About This App")
        st.markdown("""
        ### 🧠 Human Action Recognition (HAR)

        This application classifies **human actions** from static images using a deep learning model trained on a curated dataset of 15 different activities.

        #### 🔍 Purpose
        To demonstrate how computer vision and deep learning can be used to **recognize and classify human behaviors** in images — useful for applications such as surveillance, activity monitoring, and human-computer interaction.

        #### 🧰 Model
        - **Architecture:** ResNet18 (Residual Neural Network with 18 layers)  
        - **Pretrained:** On ImageNet for general features  
        - **Fine-tuned:** On a specialized Human Action Recognition dataset for task-specific learning

        #### 📚 Dataset
        - **Source:** [Bingsu/Human_Action_Recognition](https://huggingface.co/datasets/Bingsu/Human_Action_Recognition)  
        - **Categories:** 15 action classes  
          - `calling`, `clapping`, `cycling`, `dancing`, `drinking`, `eating`, `fighting`,  
            `hugging`, `laughing`, `listening_to_music`, `running`, `sitting`,  
            `sleeping`, `texting`, `using_laptop`

        ---
        🖼️ Simply upload an image, and the model will analyze and classify the dominant action being performed.
        """)

    with tab2:
        st.header("Predict Human Action from Image")
        uploaded_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])

        if uploaded_file is not None:
            try:
                image = Image.open(uploaded_file).convert("RGB")
            except Exception:
                st.error("Error loading image. Please upload a valid JPG or PNG file.")
                return

            st.image(image, caption="Uploaded Image", use_container_width=True)

            model, device = load_model()
            prediction, confidence = predict(image, model, device)
            pred_label = prediction.replace('_', ' ').title()
            st.success(f"Predicted Action: **{pred_label}**")
            st.info(f"Confidence: {confidence*100:.2f}%")

            # Show the transformed input. This tensor is normalized, so the
            # rendered colors look shifted; a denormalized view follows the
            # explanation below.
            transform = get_transforms()
            transformed_image = transform(image)
            st.image(transforms.ToPILImage()(transformed_image), caption="Transformed Input (normalized, as the model sees it)", use_container_width=True)

            # Detailed explanation based on predicted output
            st.markdown(f"""
            ### About the Transformed Input for **{pred_label}**

            Before the model makes its prediction, the uploaded image undergoes several preprocessing steps to prepare it for analysis:

            - **Resizing and cropping:** The image is resized and cropped to a consistent size (usually 224x224 pixels) so that the model receives uniform input dimensions.
            - **Normalization:** Pixel color values are scaled based on mean and standard deviation values (typically from ImageNet dataset statistics). This helps the model generalize better by standardizing the input distribution.
            - **Conversion to Tensor:** The image is converted from a PIL image to a PyTorch tensor, which is the required input format for the model.

            This processed image is exactly what the model "sees" when it predicts the action **{pred_label}**. Understanding this helps ensure the model's input is consistent and reliable.
            """)

    with tab3:
        st.header("Training & Validation Metrics")

        st.markdown("""
        **Training Accuracy (96.5%)**  
        During training, the model correctly identified human actions in 96.5% of the images. This indicates it has effectively learned the patterns and features present in the training data.

        **Validation Accuracy (96.6%)**  
        When evaluated on new, unseen images, the model correctly classified 96.6% of them. This demonstrates its ability to generalize knowledge beyond simply memorizing the training examples.

        **Training Loss (0.12)**  
        The average prediction error during training was low (0.12), meaning the model’s guesses are generally close to the true labels.

        **Validation Loss (0.10)**  
        On unseen data, the average prediction error was even lower (0.10), indicating the model generalizes rather than overfits to the training examples.
        """)

        st.markdown("---")
        st.header("Test Set Predictions Preview")

        st.markdown("""
        The table below presents a sample of the model’s predictions on the test dataset, which consists of images the model has not encountered during training. The columns typically include:

        - **Filename:** The name of the test image file  
        - **Predicted Label:** The human action predicted by the model  
        - **Confidence:** The model’s confidence score for each prediction  

        Reviewing this information aids in evaluating the model’s real-world performance and helps identify potential failure cases.
        """)

        csv_path = "test_predictions.csv"
        if os.path.exists(csv_path):
            df = pd.read_csv(csv_path)
            st.dataframe(df.head(20))  # show first 20 rows
            st.success(f"Loaded {len(df)} test predictions.")
        else:
            st.warning(f"Test predictions CSV file not found at: {csv_path}")

if __name__ == "__main__":
    main()