import os

# Disable Streamlit's file watcher (set before importing streamlit to avoid
# reload conflicts with torch model loading).
os.environ["STREAMLIT_WATCHER_TYPE"] = "none"
os.environ["STREAMLIT_WATCH_DISABLE"] = "true"

import streamlit as st
from PIL import Image
import torch
import torchvision.transforms as transforms
import pandas as pd

from utils.preprocessing import get_transforms
from models.resnet_model import ResNet18

# Class names in order
class_names = [
    'calling', 'clapping', 'cycling', 'dancing', 'drinking',
    'eating', 'fighting', 'hugging', 'laughing', 'listening_to_music',
    'running', 'sitting', 'sleeping', 'texting', 'using_laptop'
]


@st.cache_resource
def load_model():
    """Load the fine-tuned ResNet18 once and cache it across Streamlit reruns."""
    if not os.path.exists("models/best_model.pth"):
        st.error("Model weights not found. Please ensure 'models/best_model.pth' exists.")
        st.stop()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = ResNet18(num_classes=15)
    model.load_state_dict(torch.load("models/best_model.pth", map_location=device))
    model.to(device)
    model.eval()
    return model, device


def predict(image, model, device):
    """Return the predicted class name and its softmax confidence for a PIL image."""
    transform = get_transforms()
    image_t = transform(image).unsqueeze(0).to(device)
    with torch.no_grad():
        outputs = model(image_t)
        probs = torch.softmax(outputs, dim=1)
        conf, predicted = torch.max(probs, dim=1)
    return class_names[predicted.item()], float(conf.item())  # type: ignore


def main():
    st.title("Human Action Recognition App")

    tab1, tab2, tab3 = st.tabs(["About", "Predict", "Metrics & Test Predictions"])

    with tab1:
        st.header("About This App")
        st.markdown("""
        ### 🧠 Human Action Recognition (HAR)
        This application classifies **human actions** from static images using a deep learning model trained on a curated dataset of 15 different activities.

        #### 🔍 Purpose
        To demonstrate how computer vision and deep learning can be used to **recognize and classify human behaviors** in images, useful for applications such as surveillance, activity monitoring, and human-computer interaction.

        #### 🧰 Model
        - **Architecture:** ResNet18 (Residual Neural Network with 18 layers)
        - **Pretrained:** On ImageNet for general features
        - **Fine-tuned:** On a specialized Human Action Recognition dataset for task-specific learning

        #### 📚 Dataset
        - **Source:** [Bingsu/Human_Action_Recognition](https://huggingface.co/datasets/Bingsu/Human_Action_Recognition)
        - **Categories:** 15 action classes
          - `calling`, `clapping`, `cycling`, `dancing`, `drinking`, `eating`, `fighting`, `hugging`, `laughing`, `listening_to_music`, `running`, `sitting`, `sleeping`, `texting`, `using_laptop`

        ---
        🖼️ Simply upload an image, and the model will analyze and classify the dominant action being performed.
        """)

    with tab2:
        st.header("Predict Human Action from Image")
        uploaded_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])

        if uploaded_file is not None:
            try:
                image = Image.open(uploaded_file).convert("RGB")
            except Exception:
                st.error("Error loading image. Please upload a valid JPG or PNG file.")
                return

            st.image(image, caption="Uploaded Image", use_container_width=True)

            model, device = load_model()
            prediction, confidence = predict(image, model, device)
            pred_label = prediction.replace('_', ' ').title()

            st.success(f"Predicted Action: **{pred_label}**")
            st.info(f"Confidence: {confidence * 100:.2f}%")

            # Show transformed input
            transform = get_transforms()
            transformed_image = transform(image)
            st.image(transforms.ToPILImage()(transformed_image),
                     caption="Transformed Input (for model)", use_container_width=True)

            # Detailed explanation based on predicted output
            st.markdown(f"""
            ### About the Transformed Input for **{pred_label}**

            Before the model makes its prediction, the uploaded image undergoes several preprocessing steps to prepare it for analysis:

            - **Resizing and cropping:** The image is resized and cropped to a consistent size (usually 224x224 pixels) so that the model receives uniform input dimensions.
            - **Normalization:** Pixel values are scaled using mean and standard deviation values (typically ImageNet dataset statistics). This standardizes the input distribution and helps the model generalize.
            - **Conversion to tensor:** The image is converted from a PIL image to a PyTorch tensor, the input format the model requires.

            This processed image is exactly what the model "sees" when it predicts the action **{pred_label}**. Understanding this helps ensure the model's input is consistent and reliable.
            """)

    with tab3:
        st.header("Training & Validation Metrics")
        st.markdown("""
        **Training Accuracy (96.5%)**
        During training, the model correctly identified human actions in 96.5% of the images, indicating it has effectively learned the patterns and features present in the training data.

        **Validation Accuracy (96.6%)**
        When evaluated on new, unseen images, the model correctly classified 96.6% of them, demonstrating that it generalizes beyond simply memorizing the training examples.

        **Training Loss (0.12)**
        The average prediction error during training was low (0.12), meaning the model's outputs are generally close to the true labels.

        **Validation Loss (0.10)**
        On unseen data, the prediction error was even lower (0.10), suggesting the model is not overfitting but genuinely learning the task.
        """)

        st.markdown("---")
        st.header("Test Set Predictions Preview")
        st.markdown("""
        The table below presents a sample of the model's predictions on the test dataset, which consists of images the model did not encounter during training. The columns typically include:

        - **Filename:** The name of the test image file
        - **Predicted Label:** The human action predicted by the model
        - **Confidence:** The model's confidence score for each prediction

        Reviewing this information helps evaluate the model's real-world performance and identify potential failure cases.
        """)

        csv_path = "test_predictions.csv"
        if os.path.exists(csv_path):
            df = pd.read_csv(csv_path)
            st.dataframe(df.head(20))  # show first 20 rows
            st.success(f"Loaded {len(df)} test predictions.")
        else:
            st.warning(f"Test predictions CSV file not found at: {csv_path}")


if __name__ == "__main__":
    main()
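

# ---------------------------------------------------------------------------
# Reference sketch (assumption, not used by the app): the actual contents of
# utils.preprocessing.get_transforms() are defined elsewhere in the repo and
# are not shown here. Based on the preprocessing described in the Predict tab
# (resize/crop to 224x224, ImageNet-statistics normalization, tensor
# conversion), it is assumed to return a torchvision pipeline roughly like
# the one below. The function name `_reference_transforms` is hypothetical
# and exists only as documentation of that assumption.
def _reference_transforms():
    return transforms.Compose([
        transforms.Resize(256),       # shrink the shorter side to 256 px
        transforms.CenterCrop(224),   # keep the central 224x224 region
        transforms.ToTensor(),        # PIL image -> float tensor in [0, 1]
        transforms.Normalize(         # standard ImageNet channel statistics
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ])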