DurgaDeepak committed
Commit b23251f · verified · 1 Parent(s): eb90293

Main Commit

Files from private GitHub repo

.gitattributes CHANGED
@@ -1,35 +1,7 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ # Auto detect text files and perform LF normalization
2
+ * text=auto
3
+ *.pt filter=lfs diff=lfs merge=lfs -text
4
+ .venv/Scripts/python.exe filter=lfs diff=lfs merge=lfs -text
5
+ .venv/Scripts/pythonw.exe filter=lfs diff=lfs merge=lfs -text
6
+ assets/sample_images/Street_in_Japan.jpg filter=lfs diff=lfs merge=lfs -text
7
+ assets/ui/logo.png filter=lfs diff=lfs merge=lfs -text
 
.venv/Scripts/python.exe ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:beefaea165effa6069ba50bdd4d3a5cb7bcd6173629dd879af45985129e9038b
3
+ size 242920
.venv/Scripts/pythonw.exe ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f67a7ae6f44fa2c2892ad83757baaf18b5b3be9f6becac66d6d6fea41c19819
3
+ size 232688
.venv/pyvenv.cfg ADDED
@@ -0,0 +1,3 @@
1
+ home = D:\
2
+ include-system-site-packages = false
3
+ version = 3.10.0
Dockerfile ADDED
@@ -0,0 +1,23 @@
1
+ # Base Python image
2
+ FROM python:3.10-slim
3
+
4
+ # Install OS dependencies
5
+ RUN apt-get update && apt-get install -y \
6
+ libgl1-mesa-glx \
7
+ && rm -rf /var/lib/apt/lists/*
8
+
9
+ # Set working directory
10
+ WORKDIR /app
11
+
12
+ # Copy all files
13
+ COPY . .
14
+
15
+ # Install Python dependencies
16
+ RUN pip install --no-cache-dir -r requirements.txt
17
+
18
+ # Expose port (Streamlit default)
19
+ EXPOSE 8501
20
+
21
+ # Run Streamlit app
22
+ CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
23
+
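For local testing, the image can be built and started with standard Docker commands, e.g. `docker build -t uvis .` followed by `docker run -p 8501:8501 uvis` (the `uvis` tag is only an example). Note that the CMD launches Streamlit on port 8501, while `app.py` in this commit builds a Gradio interface, so the container entry point and the UI framework should be kept consistent.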
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md CHANGED
@@ -1,14 +1,94 @@
1
- ---
2
- title: UVIS
3
- emoji: 📚
4
- colorFrom: green
5
- colorTo: red
6
- sdk: gradio
7
- sdk_version: 5.29.0
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- short_description: Unified Visual Intelligence System
12
- ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+
2
+ # UVIS - Unified Visual Intelligence System
3
+
4
+ ### A Lightweight Web-Based Visual Perception Demo
5
+
6
+ > **Try it online**: [uvis.deecoded.io](https://uvis.deecoded.io)
7
+ > **GitHub**: [github.com/DurgaDeepakValluri/UVIS](https://github.com/DurgaDeepakValluri/UVIS)
8
+
9
+ ---
10
+
11
+ ## Overview
12
+
13
+ **UVIS** (Unified Visual Intelligence System) is a **lightweight, web-based visual perception demo**, originally conceived as a **spin-off while building Percepta**, a larger modular perception framework.
14
+
15
+ The goal of UVIS is to make **scene understanding tools more accessible**, allowing anyone to try object detection, semantic segmentation, and depth estimation through a clean web interface, without requiring local setup.
16
+
17
+ UVIS currently runs on **[Render.com](https://www.render.com)'s Free Tier**, using **lightweight models** to ensure the experience remains stable on limited resources.
18
+
19
+ ---
20
+
21
+ ## Key Features
22
+
23
+ | Capability | Description |
24
+ | ---------------------------- | ----------------------------------------------------------------------------------- |
25
+ | 🟢 **Object Detection** | YOLOv5-Nano & YOLOv5-Small for fast, low-resource detection. |
26
+ | 🟢 **Semantic Segmentation** | SegFormer-B0 and DeepLabV3-ResNet50 for general-purpose scenes. |
27
+ | 🟢 **Depth Estimation** | MiDaS Small & DPT Lite for per-pixel depth estimation. |
28
+ | 🖼️ **Scene Blueprint** | Unified overlay combining all selected tasks. |
29
+ | 📊 **Scene Metrics** | Scene complexity scoring and agent-friendly summaries. |
30
+ | 📦 **Downloadable Results** | JSON, overlay images, and ZIP bundles. |
31
+ | 🌐 **Web-First Design** | No installation needed—hosted live at [uvis.deecoded.io](https://uvis.deecoded.io). |
32
+ | 🛠️ **Open Source** | Contribution-friendly, easy to extend and improve. |
33
+
34
+ ---
35
+
36
+ ### Current Limitations & Roadmap
37
+
38
+ UVIS is designed for **lightweight demos** on **free-tier hosting**, which means:
39
+
40
+ * Models are optimized for speed and minimal compute.
41
+ * Only **image input** is supported at this time.
42
+
43
+ > As the project grows and higher hosting tiers become available, the roadmap includes:
44
+ >
45
+ > * **Video input support**
46
+ > * **Lightweight SLAM**
47
+ > * **Natural language scene descriptions**
48
+ > * **Higher-capacity, more accurate models**
49
+
50
+ ---
51
+
52
+ ## Architecture Highlights
53
+
54
+ * **Modular Python Backend with Model Registry**
55
+ * **Streamlit-Based Interactive Web UI**
56
+ * **HuggingFace Transformers & TorchVision Integration**
57
+ * **Lightweight Model Support (Render-Compatible)**
58
+ * **Structured JSON Output for AI Agents**
59
+ * **Robust Error Handling and Logging**
60
+
61
+ ---
62
+
63
+ ## 🤝 Contributing
64
+
65
+ UVIS is **open-source** and welcomes contributions.
66
+ You can:
67
+
68
+ * Suggest new features
69
+ * Improve the web interface
70
+ * Extend perception tasks
71
+ * Report issues or bugs
72
+
73
+ ### 💻 **Clone and Run Locally**
74
+
75
+ ```bash
76
+ git clone https://github.com/DurgaDeepakValluri/UVIS.git
77
+ cd UVIS
78
+ pip install -r requirements.txt
79
+ ```
80
+
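After installing the requirements, the demo can be started locally with `python app.py`, which runs the Gradio interface launched at the bottom of `app.py` (the provided Dockerfile instead serves the app through Streamlit on port 8501).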
81
+ ---
82
+
83
+ ## 🌐 Live Demo
84
+
85
+ > **Explore it online at [uvis.deecoded.io](https://uvis.deecoded.io)**
86
+ > Upload an image, select your tasks, and view the results—all in your browser.
87
+
88
+ ---
89
+
90
+ ## 📝 License
91
+
92
+ Apache 2.0 License. Free for personal and commercial use with attribution.
93
+ © 2025 Durga Deepak Valluri
94
+
app.py ADDED
@@ -0,0 +1,464 @@
1
+ # UVIS - Gradio App with Upload, URL & Video Support
2
+ """
3
+ This script launches the UVIS (Unified Visual Intelligence System) as a Gradio Web App.
4
+ Supports image, video, and URL-based media inputs for detection, segmentation, and depth estimation.
5
+ Outputs include scene blueprint, structured JSON, and downloadable results.
6
+ """
7
+
8
+ import gradio as gr
9
+ from PIL import Image
10
+ import numpy as np
11
+ import os
12
+ import io
13
+ import zipfile
14
+ import json
15
+ import tempfile
16
+ import logging
17
+ import cv2
18
+ import requests
19
+ from urllib.parse import urlparse
20
+ from registry import get_model
21
+ from core.describe_scene import describe_scene
22
+ import uuid
23
+ import time
24
+ import timeout_decorator
25
+ import socket
26
+ import ipaddress
27
+
28
+ # Setup logging
29
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
30
+ logger = logging.getLogger(__name__)
31
+
32
+ # Model mappings
33
+ DETECTION_MODEL_MAP = {
34
+ "YOLOv5-Nano": "yolov5n-seg",
35
+ "YOLOv5-Small": "yolov5s-seg",
36
+ "YOLOv8-Small": "yolov8s",
37
+ "YOLOv8-Large": "yolov8l",
38
+ "RT-DETR": "rtdetr" # For future support
39
+ }
40
+
41
+ SEGMENTATION_MODEL_MAP = {
42
+ "SegFormer-B0": "nvidia/segformer-b0-finetuned-ade-512-512",
43
+ "SegFormer-B5": "nvidia/segformer-b5-finetuned-ade-512-512",
44
+ "DeepLabV3-ResNet50": "deeplabv3_resnet50"
45
+ }
46
+
47
+ DEPTH_MODEL_MAP = {
48
+ "MiDaS v21 Small 256": "midas_v21_small_256",
49
+ "MiDaS v21 384": "midas_v21_384",
50
+ "DPT Hybrid 384": "dpt_hybrid_384",
51
+ "DPT Swin2 Large 384": "dpt_swin2_large_384",
52
+ "DPT Beit Large 512": "dpt_beit_large_512"
53
+ }
54
+
55
+ # Resource Limits
56
+ MAX_IMAGE_MB = 5
57
+ MAX_IMAGE_RES = (1920, 1080)
58
+ MAX_VIDEO_MB = 50
59
+ MAX_VIDEO_DURATION = 30 # seconds
60
+
61
+ # Utility Functions
62
+ def format_error(message):
63
+ """Formats error messages for consistent user feedback."""
64
+ return {"error": message}
65
+
66
+ def toggle_visibility(show, *components):
67
+ """Toggles visibility for multiple Gradio components."""
68
+ return [gr.update(visible=show) for _ in components]
69
+
70
+ def generate_session_id():
71
+ """Generates a unique session ID for tracking inputs."""
72
+ return str(uuid.uuid4())
73
+
74
+ def log_runtime(start_time):
75
+ """Logs the runtime of a process."""
76
+ elapsed_time = time.time() - start_time
77
+ logger.info(f"Process completed in {elapsed_time:.2f} seconds.")
78
+ return elapsed_time
79
+
80
+ def is_public_ip(url):
81
+ """
82
+ Checks whether the resolved IP address of a URL is public (non-local).
83
+ Prevents SSRF by blocking internal addresses like 127.0.0.1 or 192.168.x.x.
84
+ """
85
+ try:
86
+ hostname = urlparse(url).hostname
87
+ ip = socket.gethostbyname(hostname)
88
+ ip_obj = ipaddress.ip_address(ip)
89
+ return ip_obj.is_global # Only allow globally routable IPs
90
+ except Exception as e:
91
+ logger.warning(f"URL IP validation failed: {e}")
92
+ return False
93
+
94
+
95
+ def fetch_media_from_url(url):
96
+ """
97
+ Downloads media from a URL. Supports images and videos.
98
+ Returns PIL.Image or video file path.
99
+ """
100
+ logger.info(f"Fetching media from URL: {url}")
101
+ if not is_public_ip(url):
102
+ logger.warning("Blocked non-public URL request (possible SSRF).")
103
+ return None
104
+
105
+ try:
106
+ parsed_url = urlparse(url)
107
+ ext = os.path.splitext(parsed_url.path)[-1].lower()
108
+ headers = {"User-Agent": "Mozilla/5.0"}
109
+ r = requests.get(url, headers=headers, timeout=10)
110
+
111
+ if r.status_code != 200 or len(r.content) > 50 * 1024 * 1024:
112
+ logger.warning(f"Download failed or file too large.")
113
+ return None
114
+
115
+ tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=ext)
116
+ tmp_file.write(r.content)
117
+ tmp_file.close()
118
+
119
+ if ext in [".jpg", ".jpeg", ".png"]:
120
+ return Image.open(tmp_file.name).convert("RGB")
121
+ elif ext in [".mp4", ".avi", ".mov"]:
122
+ return tmp_file.name
123
+ else:
124
+ logger.warning("Unsupported file type from URL.")
125
+ return None
126
+ except Exception as e:
127
+ logger.error(f"URL fetch failed: {e}")
128
+ return None
129
+
130
+ # Input Validation Functions
131
+ def validate_image(img):
132
+ """
133
+ Validates the uploaded image based on size and resolution limits.
134
+
135
+ Args:
136
+ img (PIL.Image.Image): Image to validate.
137
+
138
+ Returns:
139
+ Tuple[bool, str or None]: (True, None) if valid; (False, reason) otherwise.
140
+ """
141
+ logger.info("Validating uploaded image.")
142
+ try:
143
+ buffer = io.BytesIO()
144
+ img.save(buffer, format="PNG")
145
+ size_mb = len(buffer.getvalue()) / (1024 * 1024)
146
+
147
+ if size_mb > MAX_IMAGE_MB:
148
+ logger.warning("Image exceeds size limit of 5MB.")
149
+ return False, "Image exceeds 5MB limit."
150
+
151
+ if img.width > MAX_IMAGE_RES[0] or img.height > MAX_IMAGE_RES[1]:
152
+ logger.warning("Image resolution exceeds 1920x1080.")
153
+ return False, "Image resolution exceeds 1920x1080."
154
+
155
+ logger.info("Image validation passed.")
156
+ return True, None
157
+ except Exception as e:
158
+ logger.error(f"Error validating image: {e}")
159
+ return False, str(e)
160
+
161
+ def validate_video(path):
162
+ """
163
+ Validates the uploaded video based on size and duration limits.
164
+
165
+ Args:
166
+ path (str): Path to the video file.
167
+
168
+ Returns:
169
+ Tuple[bool, str or None]: (True, None) if valid; (False, reason) otherwise.
170
+ """
171
+ logger.info(f"Validating video file at: {path}")
172
+ try:
173
+ size_mb = os.path.getsize(path) / (1024 * 1024)
174
+ if size_mb > MAX_VIDEO_MB:
175
+ logger.warning("Video exceeds size limit of 50MB.")
176
+ return False, "Video exceeds 50MB limit."
177
+
178
+ cap = cv2.VideoCapture(path)
179
+ fps = cap.get(cv2.CAP_PROP_FPS)
180
+ frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
181
+ duration = frames / fps if fps else 0
182
+ cap.release()
183
+
184
+ if duration > MAX_VIDEO_DURATION:
185
+ logger.warning("Video exceeds 30 seconds duration limit.")
186
+ return False, "Video exceeds 30 seconds duration limit."
187
+
188
+ logger.info("Video validation passed.")
189
+ return True, None
190
+ except Exception as e:
191
+ logger.error(f"Error validating video: {e}")
192
+ return False, str(e)
193
+
194
+ # Input Resolution
195
+ def resolve_input(mode, uploaded_img, uploaded_imgs, uploaded_vid, url):
196
+ """
197
+ Resolves the input source based on user selection.
198
+ Supports single image, multiple images, video, or URL-based media.
199
+
200
+ Args:
201
+ mode (str): Input mode - 'Upload' or 'URL'.
202
+ uploaded_img (PIL.Image.Image): Single uploaded image.
203
+ uploaded_imgs (List[PIL.Image.Image]): List of uploaded images (batch).
204
+ uploaded_vid (str): Uploaded video file path.
205
+ url (str): URL pointing to media content.
206
+
207
+ Returns:
208
+ List[Union[PIL.Image.Image, str, None]]: A list of media items to process.
209
+ """
210
+ logger.info(f"Resolving input based on mode: {mode}")
211
+ try:
212
+ if mode == "Upload":
213
+ # Prefer batch if provided
214
+ if uploaded_imgs and len(uploaded_imgs) > 0:
215
+ return uploaded_imgs
216
+ elif uploaded_img:
217
+ return [uploaded_img]
218
+ elif uploaded_vid:
219
+ return [uploaded_vid]
220
+ else:
221
+ logger.warning("No valid upload provided.")
222
+ return None
223
+
224
+ elif mode == "URL":
225
+ media_from_url = fetch_media_from_url(url)
226
+ if media_from_url:
227
+ return [media_from_url]
228
+ else:
229
+ logger.warning("Failed to fetch valid media from URL.")
230
+ return None
231
+
232
+ else:
233
+ logger.warning("Invalid input mode selected.")
234
+ return None
235
+
236
+ except Exception as e:
237
+ logger.error(f"Error resolving input: {e}")
238
+ return None
239
+
240
+ @timeout_decorator.timeout(35, use_signals=False) # 35 sec limit per image
241
+ def process_image(
242
+ image: Image.Image,
243
+ run_det: bool,
244
+ det_model: str,
245
+ det_confidence: float,
246
+ run_seg: bool,
247
+ seg_model: str,
248
+ run_depth: bool,
249
+ depth_model: str,
250
+ blend: float
251
+ ):
252
+ """
253
+ Runs selected perception tasks on the input image and packages results.
254
+
255
+ Args:
256
+ image (PIL.Image): Input image.
257
+ run_det (bool): Run object detection.
258
+ det_model (str): Detection model key.
259
+ det_confidence (float): Detection confidence threshold.
260
+ run_seg (bool): Run segmentation.
261
+ seg_model (str): Segmentation model key.
262
+ run_depth (bool): Run depth estimation.
263
+ depth_model (str): Depth model key.
264
+ blend (float): Overlay blend alpha (0.0 - 1.0).
265
+
266
+ Returns:
267
+ Tuple[Image, dict, Tuple[str, bytes]]: Final image, scene JSON, and downloadable ZIP.
268
+ """
269
+ logger.info("Starting image processing pipeline.")
270
+ start_time = time.time()
271
+ outputs, scene = {}, {}
272
+ combined_np = np.array(image)
273
+
274
+ try:
275
+ # Detection
276
+ if run_det:
277
+ logger.info(f"Running detection with model: {det_model}")
278
+ load_start = time.time()
279
+ model = get_model("detection", DETECTION_MODEL_MAP[det_model], device="cpu")
280
+ logger.info(f"{det_model} detection model loaded in {time.time() - load_start:.2f} seconds.")
281
+ boxes = model.predict(image, conf_threshold=det_confidence)
282
+ overlay = model.draw(image, boxes)
283
+ combined_np = np.array(overlay)
284
+ buf = io.BytesIO()
285
+ overlay.save(buf, format="PNG")
286
+ outputs["detection.png"] = buf.getvalue()
287
+ scene["detection"] = boxes
288
+
289
+ # Segmentation
290
+ if run_seg:
291
+ logger.info(f"Running segmentation with model: {seg_model}")
292
+ load_start = time.time()
293
+ model = get_model("segmentation", SEGMENTATION_MODEL_MAP[seg_model], device="cpu")
294
+ logger.info(f"{seg_model} segmentation model loaded in {time.time() - load_start:.2f} seconds.")
295
+ mask = model.predict(image)
296
+ overlay = model.draw(image, mask, alpha=blend)
297
+ combined_np = cv2.addWeighted(combined_np, 1 - blend, np.array(overlay), blend, 0)
298
+ buf = io.BytesIO()
299
+ overlay.save(buf, format="PNG")
300
+ outputs["segmentation.png"] = buf.getvalue()
301
+ scene["segmentation"] = mask.tolist()
302
+
303
+ # Depth Estimation
304
+ if run_depth:
305
+ logger.info(f"Running depth estimation with model: {depth_model}")
306
+ load_start = time.time()
307
+ model = get_model("depth", DEPTH_MODEL_MAP[depth_model], device="cpu")
308
+ logger.info(f"{depth_model} depth model loaded in {time.time() - load_start:.2f} seconds.")
309
+ dmap = model.predict(image)
310
+ norm_dmap = ((dmap - dmap.min()) / (dmap.ptp()) * 255).astype(np.uint8)
311
+ d_pil = Image.fromarray(norm_dmap)
312
+ combined_np = cv2.addWeighted(combined_np, 1 - blend, np.array(d_pil.convert("RGB")), blend, 0)
313
+ buf = io.BytesIO()
314
+ d_pil.save(buf, format="PNG")
315
+ outputs["depth_map.png"] = buf.getvalue()
316
+ scene["depth"] = dmap.tolist()
317
+
318
+ # Final image overlay
319
+ final_img = Image.fromarray(combined_np)
320
+ buf = io.BytesIO()
321
+ final_img.save(buf, format="PNG")
322
+ outputs["scene_blueprint.png"] = buf.getvalue()
323
+
324
+ # Scene description
325
+ try:
326
+ scene_json = describe_scene(**scene)
327
+ except Exception as e:
328
+ logger.warning(f"describe_scene failed: {e}")
329
+ scene_json = {"error": str(e)}
330
+ telemetry = {
331
+ "session_id": generate_session_id(),
332
+ "runtime_sec": round(log_runtime(start_time), 2),
333
+ "used_models": {
334
+ "detection": det_model if run_det else None,
335
+ "segmentation": seg_model if run_seg else None,
336
+ "depth": depth_model if run_depth else None
337
+ }
338
+ }
339
+ scene_json["telemetry"] = telemetry
340
+
341
+ outputs["scene_description.json"] = json.dumps(scene_json, indent=2).encode("utf-8")
342
+
343
+ # ZIP file creation
344
+ zip_buf = io.BytesIO()
345
+ with zipfile.ZipFile(zip_buf, "w") as zipf:
346
+ for name, data in outputs.items():
347
+ zipf.writestr(name, data)
348
+
349
+ elapsed = log_runtime(start_time)
350
+ logger.info(f"Image processing completed in {elapsed:.2f} seconds.")
351
+
352
+ return final_img, scene_json, ("uvis_results.zip", zip_buf.getvalue())
353
+
354
+ except Exception as e:
355
+ logger.error(f"Error in processing pipeline: {e}")
356
+ return None, {"error": str(e)}, None
357
+
358
+ # Main Handler
359
+ def handle(mode, img, imgs, vid, url, run_det, det_model, det_confidence, run_seg, seg_model, run_depth, depth_model, blend):
360
+ """
361
+ Master handler for resolving input and processing.
362
+ Returns outputs for Gradio interface.
363
+ """
364
+ session_id = generate_session_id()
365
+ logger.info(f"Session ID: {session_id} | Handler activated with mode: {mode}")
366
+ start_time = time.time()
367
+
368
+ media = resolve_input(mode, img, imgs, vid, url)
369
+ if not media:
370
+ return None, format_error("No valid input provided. Please check your upload or URL."), None
371
+
372
+ results = []
373
+ for single_media in media:
374
+ if isinstance(single_media, str): # Video file
375
+ valid, err = validate_video(single_media)
376
+ if not valid:
377
+ return None, format_error(err), None
378
+ cap = cv2.VideoCapture(single_media)
379
+ ret, frame = cap.read()
380
+ cap.release()
381
+ if not ret:
382
+ return None, format_error("Failed to read video frame."), None
383
+ single_media = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
384
+
385
+ if isinstance(single_media, Image.Image):
386
+ valid, err = validate_image(single_media)
387
+ if not valid:
388
+ return None, format_error(err), None
389
+ try:
390
+ return process_image(single_media, run_det, det_model, det_confidence, run_seg, seg_model, run_depth, depth_model, blend)
391
+ except timeout_decorator.timeout_decorator.TimeoutError:
392
+ logger.error("Image processing timed out.")
393
+ return None, format_error("Processing timed out. Try a smaller image or simpler model."), None
394
+
395
+ logger.warning("Unsupported media type resolved.")
396
+ log_runtime(start_time)
397
+ return None, format_error("Invalid input. Please check your upload or URL."), None
398
+
399
+ # Gradio Interface
400
+ with gr.Blocks() as demo:
401
+ gr.Markdown("## Unified Visual Intelligence System (UVIS)")
402
+
403
+ # Input Mode Selection
404
+ mode = gr.Radio(["Upload", "URL"], value="Upload", label="Input Mode")
405
+ img = gr.Image(type="pil", label="Upload Image")
406
+ imgs = gr.Gallery(label="Upload Multiple Images (Up to 5)").style(grid=[5], height="auto")
407
+ vid = gr.Video(label="Upload Video (<= 30s)")
408
+ url = gr.Textbox(label="URL (Image/Video)")
409
+
410
+ # Task Selection with parameters
411
+ with gr.Accordion("Object Detection Settings", open=False):
412
+ run_det = gr.Checkbox(label="Enable Object Detection")
413
+ det_model = gr.Dropdown(list(DETECTION_MODEL_MAP), label="Detection Model", visible=False)
414
+ det_confidence = gr.Slider(0.1, 1.0, 0.5, label="Detection Confidence Threshold", visible=False)
415
+
416
+ with gr.Accordion("Semantic Segmentation Settings", open=False):
417
+ run_seg = gr.Checkbox(label="Enable Segmentation")
418
+ seg_model = gr.Dropdown(list(SEGMENTATION_MODEL_MAP), label="Segmentation Model", visible=False)
419
+
420
+ with gr.Accordion("Depth Estimation Settings", open=False):
421
+ run_depth = gr.Checkbox(label="Enable Depth Estimation")
422
+ depth_model = gr.Dropdown(list(DEPTH_MODEL_MAP), label="Depth Model", visible=False)
423
+
424
+ blend = gr.Slider(0.0, 1.0, 0.5, label="Overlay Blend")
425
+
426
+ # Run Button
427
+ run = gr.Button("Run Analysis")
428
+
429
+ # Output Tabs
430
+ with gr.Tab("Scene JSON"):
431
+ json_out = gr.JSON()
432
+ with gr.Tab("Scene Blueprint"):
433
+ img_out = gr.Image()
434
+ with gr.Tab("Download"):
435
+ zip_out = gr.File()
436
+
437
+ # Attach Visibility Logic
438
+ run_det.change(lambda show: toggle_visibility(show, det_model, det_confidence), run_det, [det_model, det_confidence])
439
+ run_seg.change(lambda show: gr.update(visible=show), run_seg, [seg_model])
440
+ run_depth.change(lambda show: gr.update(visible=show), run_depth, [depth_model])
441
+
442
+ # Button Click Event
443
+ run.click(
444
+ handle,
445
+ inputs=[mode, img, imgs, vid, url, run_det, det_model, det_confidence, run_seg, seg_model, run_depth, depth_model, blend],
446
+ outputs=[img_out, json_out, zip_out]
447
+ )
448
+
449
+ # Footer Section
450
+ gr.Markdown("---")
451
+ gr.Markdown(
452
+ """
453
+ <div style='text-align: center; font-size: 14px;'>
454
+ Built by <b>Durga Deepak Valluri</b><br>
455
+ <a href="https://github.com/DurgaDeepakValluri/UVIS" target="_blank">GitHub</a> |
456
+ <a href="https://deecoded.io" target="_blank">Website</a> |
457
+ <a href="https://www.linkedin.com/in/durga-deepak-valluri" target="_blank">LinkedIn</a>
458
+ </div>
459
+ """,
460
+ unsafe_allow_html=True
461
+ )
462
+
463
+ # Launch the Gradio App
464
+ demo.launch()
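For reference, the approximate shape of the `scene_description.json` written into the results ZIP, assembled from `describe_scene()` plus the telemetry block above; only the keys for tasks that were enabled appear, and all values below are illustrative rather than real output:

```python
# Illustrative structure only; keys follow describe_scene() and the telemetry dict in process_image().
scene_json = {
    "scene_summary": {
        "objects": [
            {"class_name": "person", "confidence": 0.87, "bbox": [12.0, 34.0, 200.0, 380.0]}
        ],
        "detection_metrics": {"objects_detected": 1, "average_confidence": 0.87},
        "segmentation_summary": [{"class_id": 12, "coverage": "41.27%"}],
        "dominant_class": {"class_id": 12, "coverage": "41.27%"},
        "depth_summary": {"mean_depth": 8.4, "min_depth": 0.3, "max_depth": 27.1, "std_depth": 5.2},
    },
    "telemetry": {
        "session_id": "a-uuid4-string",
        "runtime_sec": 4.21,
        "used_models": {"detection": "YOLOv5-Nano", "segmentation": None, "depth": "MiDaS v21 Small 256"},
    },
}
```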
assets/sample_images/Man_in_office.jpg ADDED
assets/sample_images/Street_in_Japan.jpg ADDED

Git LFS Details

  • SHA256: 4f0d7c53300b806c8ea726b72780766b4bea1226ba1bf6719d106c0e26547b65
  • Pointer size: 131 Bytes
  • Size of remote file: 281 kB
assets/ui/logo.png ADDED

Git LFS Details

  • SHA256: ff54ef43a6828cf3c8590d1327b47c24b67adf0e0f49425ef8c81950980745bc
  • Pointer size: 132 Bytes
  • Size of remote file: 1.01 MB
core/describe_scene.py ADDED
@@ -0,0 +1,59 @@
1
+ import numpy as np
2
+ import logging
3
+
4
+ logger = logging.getLogger(__name__)
5
+
6
+ def describe_scene(detection=None, segmentation=None, depth=None):
7
+ """
8
+ Generates a structured scene summary with metrics for detection, segmentation, and depth.
9
+
10
+ Args:
11
+ detection (list): List of detected objects with class names and bounding boxes.
12
+ segmentation (numpy.ndarray): Segmentation mask as a 2D numpy array.
13
+ depth (numpy.ndarray): Depth map as a 2D numpy array.
14
+
15
+ Returns:
16
+ dict: Structured scene description with metrics.
17
+ """
18
+ logger.info("Generating scene summary...")
19
+ description = {"scene_summary": {}}
20
+
21
+ # Detection Summary with Metrics
22
+ if detection:
23
+ logger.info("Adding detection results to scene summary.")
24
+ description["scene_summary"]["objects"] = detection
25
+ confidences = [obj.get("confidence", 0) for obj in detection]
26
+ description["scene_summary"]["detection_metrics"] = {
27
+ "objects_detected": len(detection),
28
+ "average_confidence": float(np.mean(confidences)) if confidences else 0.0
29
+ }
30
+
31
+ # Segmentation Summary with Coverage Metrics
32
+ if segmentation is not None:
33
+ logger.info("Summarizing segmentation coverage.")
34
+ unique, counts = np.unique(segmentation, return_counts=True)
35
+ total = np.size(segmentation)  # np.size also handles plain Python lists (app.py passes mask.tolist())
36
+ coverage = [
37
+ {"class_id": int(class_id), "coverage": f"{(count / total) * 100:.2f}%"}
38
+ for class_id, count in zip(unique, counts)
39
+ ]
40
+ dominant_class = max(coverage, key=lambda x: float(x["coverage"].strip('%')))
41
+ description["scene_summary"]["segmentation_summary"] = coverage
42
+ description["scene_summary"]["dominant_class"] = dominant_class
43
+
44
+ # Depth Summary with Metrics
45
+ if depth is not None:
46
+ logger.info("Summarizing depth information.")
47
+ mean_depth = float(np.mean(depth))
48
+ min_depth = float(np.min(depth))
49
+ max_depth = float(np.max(depth))
50
+ std_depth = float(np.std(depth))
51
+ description["scene_summary"]["depth_summary"] = {
52
+ "mean_depth": mean_depth,
53
+ "min_depth": min_depth,
54
+ "max_depth": max_depth,
55
+ "std_depth": std_depth
56
+ }
57
+
58
+ logger.info("Scene summary generation complete.")
59
+ return description
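A minimal, self-contained sketch of calling `describe_scene` with toy inputs (the detection dicts mirror what `ObjectDetector.predict` returns; the arrays are synthetic):

```python
import numpy as np
from core.describe_scene import describe_scene

detections = [
    {"class_name": "car", "confidence": 0.91, "bbox": [5, 10, 120, 80]},
    {"class_name": "person", "confidence": 0.74, "bbox": [130, 20, 180, 150]},
]
seg_mask = np.zeros((64, 64), dtype=np.int64)
seg_mask[:, 32:] = 12                      # right half of the image belongs to class 12
depth_map = np.random.rand(64, 64) * 10.0  # synthetic relative depth

summary = describe_scene(detection=detections, segmentation=seg_mask, depth=depth_map)
print(summary["scene_summary"]["detection_metrics"])  # objects_detected=2, average_confidence=0.825
print(summary["scene_summary"]["depth_summary"])      # mean/min/max/std of the depth map
```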
models/__init__.py ADDED
File without changes
models/depth/depth_estimator.py ADDED
@@ -0,0 +1,85 @@
1
+ import os
2
+ import torch
3
+ import numpy as np
4
+ from PIL import Image
5
+ import logging
6
+ from utils.model_downloader import download_model_if_needed
7
+
8
+ # Configure Logger
9
+ logger = logging.getLogger(__name__)
10
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
11
+
12
+
13
+ class DepthEstimator:
14
+ """
15
+ Generalized Depth Estimation Model Wrapper for MiDaS and DPT models.
16
+ Supports: MiDaS v2.1 Small, MiDaS v2.1 Large, DPT Hybrid, DPT Large.
17
+ """
18
+
19
+ def __init__(self, model_key="midas_v21_small_256", weights_dir="models/depth/weights", device="cpu"):
20
+ """
21
+ Initialize the Depth Estimation model.
22
+
23
+ Args:
24
+ model_key (str): Model identifier as defined in model_downloader.py.
25
+ weights_dir (str): Directory to store/download model weights.
26
+ device (str): Inference device ("cpu" or "cuda").
27
+ """
28
+ weights_path = os.path.join(weights_dir, f"{model_key}.pt")
29
+ download_model_if_needed(model_key, weights_path)
30
+
31
+ logger.info(f"Loading Depth model '{model_key}' from MiDaS hub")
32
+ self.device = device
33
+ self.model_type = self._resolve_model_type(model_key)
34
+ self.midas = torch.hub.load("intel-isl/MiDaS", self.model_type).to(self.device).eval()
35
+ self.transform = self._resolve_transform()
36
+
37
+ def _resolve_model_type(self, model_key):
38
+ """
39
+ Maps model_key to MiDaS hub model type.
40
+ """
41
+ mapping = {
42
+ "midas_v21_small_256": "MiDaS_small",
43
+ "midas_v21_384": "MiDaS",
44
+ "dpt_hybrid_384": "DPT_Hybrid",
45
+ "dpt_large_384": "DPT_Large",
46
+ "dpt_swin2_large_384": "DPT_Large", # fallback to DPT_Large if not explicitly supported
47
+ "dpt_beit_large_512": "DPT_Large", # fallback to DPT_Large if not explicitly supported
48
+ }
49
+ return mapping.get(model_key, "MiDaS_small")
50
+
51
+ def _resolve_transform(self):
52
+ """
53
+ Returns the correct transformation pipeline based on model type.
54
+ """
55
+ transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
56
+ if self.model_type == "MiDaS_small":
57
+ return transforms.small_transform
58
+ else:
59
+ return transforms.default_transform
60
+
61
+ def predict(self, image: Image.Image):
62
+ """
63
+ Generates a depth map for the given image.
64
+
65
+ Args:
66
+ image (PIL.Image.Image): Input image.
67
+
68
+ Returns:
69
+ np.ndarray: Depth map as a 2D numpy array.
70
+ """
71
+ logger.info("Running depth estimation")
72
+ input_tensor = self.transform(np.array(image)).to(self.device)  # MiDaS hub transforms expect an RGB numpy array, not a PIL image
73
+
74
+ with torch.no_grad():
75
+ prediction = self.midas(input_tensor)
76
+ prediction = torch.nn.functional.interpolate(
77
+ prediction.unsqueeze(1),
78
+ size=image.size[::-1],
79
+ mode="bicubic",
80
+ align_corners=False,
81
+ ).squeeze()
82
+
83
+ depth_map = prediction.cpu().numpy()
84
+ logger.info("Depth estimation completed successfully")
85
+ return depth_map
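A usage sketch, assuming internet access for the `torch.hub` MiDaS download and that the sample image from `assets/` is present locally:

```python
import numpy as np
from PIL import Image
from models.depth.depth_estimator import DepthEstimator

estimator = DepthEstimator(model_key="midas_v21_small_256", device="cpu")
image = Image.open("assets/sample_images/Man_in_office.jpg").convert("RGB")

depth = estimator.predict(image)  # 2-D float array at the image resolution

# Rescale to 0-255 for a quick visual check, mirroring what app.py does for the blueprint overlay.
vis = ((depth - depth.min()) / (depth.max() - depth.min() + 1e-8) * 255).astype(np.uint8)
Image.fromarray(vis).save("depth_preview.png")
```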
models/detection/detector.py ADDED
@@ -0,0 +1,73 @@
1
+ import os
2
+ import numpy as np
3
+ from PIL import Image, ImageDraw
4
+ import logging
5
+ from ultralytics import YOLO
6
+ from utils.model_downloader import download_model_if_needed
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+ class ObjectDetector:
11
+ """
12
+ Generalized Object Detection Wrapper for YOLOv5, YOLOv8, and future variants.
13
+ """
14
+
15
+ def __init__(self, model_key="yolov5n-seg", weights_dir="models/detection/weights", device="cpu"):
16
+ """
17
+ Initialize the Object Detection model.
18
+
19
+ Args:
20
+ model_key (str): Model identifier as defined in model_downloader.py.
21
+ weights_dir (str): Directory to store/download model weights.
22
+ device (str): Inference device ("cpu" or "cuda").
23
+ """
24
+ weights_path = os.path.join(weights_dir, f"{model_key}.pt")
25
+ download_model_if_needed(model_key, weights_path)
26
+
27
+ logger.info(f"Loading Object Detection model '{model_key}' from {weights_path}")
28
+ self.device = device
29
+ self.model = YOLO(weights_path)
30
+
31
+ def predict(self, image: Image.Image, conf_threshold: float = 0.25):
32
+ """
33
+ Run object detection.
34
+
35
+ Args:
36
+ image (PIL.Image.Image): Input image.
37
+
38
+ Returns:
39
+ List[Dict]: List of detected objects with class name, confidence, and bbox.
40
+ """
41
+ logger.info("Running object detection")
42
+ results = self.model(image, conf=conf_threshold)
43
+ detections = []
44
+ for r in results:
45
+ for box in r.boxes:
46
+ detections.append({
47
+ "class_name": r.names[int(box.cls)],
48
+ "confidence": float(box.conf),
49
+ "bbox": box.xyxy[0].tolist()
50
+ })
51
+ logger.info(f"Detected {len(detections)} objects")
52
+ return detections
53
+
54
+ def draw(self, image: Image.Image, detections, alpha=0.5):
55
+ """
56
+ Draw bounding boxes on image.
57
+
58
+ Args:
59
+ image (PIL.Image.Image): Input image.
60
+ detections (List[Dict]): Detection results.
61
+ alpha (float): Blend strength.
62
+
63
+ Returns:
64
+ PIL.Image.Image: Image with bounding boxes drawn.
65
+ """
66
+ overlay = image.copy()
67
+ draw = ImageDraw.Draw(overlay)
68
+ for det in detections:
69
+ bbox = det["bbox"]
70
+ label = f'{det["class_name"]} {det["confidence"]:.2f}'
71
+ draw.rectangle(bbox, outline="red", width=2)
72
+ draw.text((bbox[0], bbox[1]), label, fill="red")
73
+ return Image.blend(image, overlay, alpha)
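A usage sketch, assuming the `yolov5n-seg` weights can be downloaded on first run and that the sample image is available:

```python
from PIL import Image
from models.detection.detector import ObjectDetector

detector = ObjectDetector(model_key="yolov5n-seg", device="cpu")
image = Image.open("assets/sample_images/Street_in_Japan.jpg").convert("RGB")

boxes = detector.predict(image, conf_threshold=0.5)   # [{"class_name", "confidence", "bbox"}, ...]
annotated = detector.draw(image, boxes, alpha=0.7)
annotated.save("detections.png")
```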
models/segmentation/segmenter.py ADDED
@@ -0,0 +1,89 @@
1
+ import logging
2
+ import torch
3
+ from PIL import Image
4
+ import numpy as np
5
+ from torchvision import transforms
6
+ from torchvision.models.segmentation import deeplabv3_resnet50
7
+ from transformers import SegformerForSemanticSegmentation, SegformerFeatureExtractor
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+ class Segmenter:
12
+ """
13
+ Generalized Semantic Segmentation Wrapper for SegFormer and DeepLabV3.
14
+ """
15
+
16
+ def __init__(self, model_key="nvidia/segformer-b0-finetuned-ade-512-512", device="cpu"):
17
+ """
18
+ Initialize the segmentation model.
19
+
20
+ Args:
21
+ model_key (str): Model identifier, e.g., Hugging Face model id or 'deeplabv3_resnet50'.
22
+ device (str): Inference device ("cpu" or "cuda").
23
+ """
24
+ logger.info(f"Initializing segmenter with model: {model_key}")
25
+ self.device = device
26
+ self.model_key = model_key
27
+ self.model, self.processor = self._load_model()
28
+
29
+ def _load_model(self):
30
+ """
31
+ Load the segmentation model and processor.
32
+
33
+ Returns:
34
+ Tuple[torch.nn.Module, Optional[Processor]]
35
+ """
36
+ if "segformer" in self.model_key:
37
+ model = SegformerForSemanticSegmentation.from_pretrained(self.model_key).to(self.device)
38
+ processor = SegformerFeatureExtractor.from_pretrained(self.model_key)
39
+ return model, processor
40
+ elif self.model_key == "deeplabv3_resnet50":
41
+ model = deeplabv3_resnet50(pretrained=True).to(self.device).eval()
42
+ return model, None
43
+ else:
44
+ raise ValueError(f"Unsupported model key: {self.model_key}")
45
+
46
+ def predict(self, image: Image.Image):
47
+ """
48
+ Perform segmentation on the input image.
49
+
50
+ Args:
51
+ image (PIL.Image.Image): Input image.
52
+
53
+ Returns:
54
+ np.ndarray: Segmentation mask.
55
+ """
56
+ logger.info("Running segmentation")
57
+
58
+ if "segformer" in self.model_key:
59
+ inputs = self.processor(images=image, return_tensors="pt").to(self.device)
60
+ outputs = self.model(**inputs)
61
+ mask = outputs.logits.argmax(dim=1).squeeze().cpu().numpy()
62
+ return mask
63
+
64
+ elif self.model_key == "deeplabv3_resnet50":
65
+ transform = transforms.Compose([
66
+ transforms.ToTensor(),
67
+ ])
68
+ inputs = transform(image).unsqueeze(0).to(self.device)
69
+ with torch.no_grad():
70
+ outputs = self.model(inputs)["out"]
71
+ mask = outputs.argmax(1).squeeze().cpu().numpy()
72
+ return mask
73
+
74
+ def draw(self, image: Image.Image, mask: np.ndarray, alpha=0.5):
75
+ """
76
+ Overlay the segmentation mask on the input image.
77
+
78
+ Args:
79
+ image (PIL.Image.Image): Original image.
80
+ mask (np.ndarray): Segmentation mask.
81
+ alpha (float): Blend strength.
82
+
83
+ Returns:
84
+ PIL.Image.Image: Image with mask overlay.
85
+ """
86
+ logger.info("Drawing segmentation overlay")
87
+ mask_img = Image.fromarray((mask * 255 / mask.max()).astype(np.uint8)).convert("L").resize(image.size)
88
+ mask_colored = Image.merge("RGB", (mask_img, mask_img, mask_img))
89
+ return Image.blend(image, mask_colored, alpha)
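A usage sketch covering the two supported backends; the SegFormer checkpoint is fetched from the Hugging Face Hub and DeepLabV3 from torchvision on first use, and the sample image path is an assumption:

```python
from PIL import Image
from models.segmentation.segmenter import Segmenter

image = Image.open("assets/sample_images/Man_in_office.jpg").convert("RGB")

segformer = Segmenter(model_key="nvidia/segformer-b0-finetuned-ade-512-512", device="cpu")
mask = segformer.predict(image)                       # 2-D array of class ids (model-resolution grid)
segformer.draw(image, mask, alpha=0.5).save("segformer_overlay.png")

deeplab = Segmenter(model_key="deeplabv3_resnet50", device="cpu")
deeplab.draw(image, deeplab.predict(image), alpha=0.5).save("deeplab_overlay.png")
```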
registry.py ADDED
@@ -0,0 +1,43 @@
1
+ import logging
2
+
3
+ # Configure Logger
4
+ logger = logging.getLogger(__name__)
5
+
6
+ def get_model(task: str, model_key: str, device="cpu"):
7
+ """
8
+ Dynamically retrieves the model instance based on the task and model_key.
9
+
10
+ Args:
11
+ task (str): One of "detection", "segmentation", or "depth".
12
+ model_key (str): Model identifier or variant.
13
+ device (str): Device to run inference on ("cpu" or "cuda").
14
+
15
+ Returns:
16
+ object: Initialized model ready for inference.
17
+
18
+ Raises:
19
+ ValueError: If task is unsupported or model loading fails.
20
+ """
21
+ logger.info(f"Request received to load model '{model_key}' for task '{task}' on device '{device}'")
22
+
23
+ try:
24
+ if task == "detection":
25
+ from models.detection.detector import ObjectDetector
26
+ return ObjectDetector(model_key=model_key, device=device)
27
+
28
+ elif task == "segmentation":
29
+ from models.segmentation.segmenter import Segmenter
30
+ return Segmenter(model_key=model_key, device=device)
31
+
32
+ elif task == "depth":
33
+ from models.depth.depth_estimator import DepthEstimator
34
+ return DepthEstimator(model_key=model_key, device=device)
35
+
36
+ else:
37
+ error_msg = f"Unsupported task '{task}'. Valid options are: 'detection', 'segmentation', 'depth'."
38
+ logger.error(error_msg)
39
+ raise ValueError(error_msg)
40
+
41
+ except Exception as e:
42
+ logger.error(f"Error while loading model '{model_key}' for task '{task}': {e}")
43
+ raise
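A short sketch of the registry dispatch; constructing a wrapper triggers the corresponding weight download, and an unknown task raises `ValueError`:

```python
from registry import get_model

detector  = get_model("detection", "yolov5n-seg", device="cpu")
segmenter = get_model("segmentation", "deeplabv3_resnet50", device="cpu")
depther   = get_model("depth", "midas_v21_small_256", device="cpu")
# Each wrapper exposes .predict(pil_image); the detector and segmenter also provide .draw(...).

try:
    get_model("pose", "anything")
except ValueError as err:
    print(err)  # Unsupported task 'pose'. Valid options are: 'detection', 'segmentation', 'depth'.
```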
requirements.txt ADDED
@@ -0,0 +1,22 @@
1
+ # Core Libraries
2
+ gradio>=3.50 # Web interface for the application
3
+ torch>=2.0 # PyTorch for deep learning models
4
+ torchvision>=0.15 # TorchVision for pre-trained models and utilities
5
+ ultralytics>=8.0 # YOLO models for object detection
6
+ opencv-python>=4.7 # OpenCV for video and image processing
7
+
8
+ # Utility Libraries
9
+ numpy>=1.21 # Numerical computations
10
+ Pillow>=9.0 # Image processing
11
+ requests>=2.28 # HTTP requests for fetching media
12
+ timeout-decorator>=0.5.0 # Timeout handling for long-running tasks
13
+ tqdm>=4.64 # Progress bars for iterative tasks
14
+
15
+ # Hugging Face Support
16
+ transformers>=4.30 # Hugging Face Transformers for SegFormer models
17
+ sentencepiece # Tokenization for Hugging Face models
18
+ huggingface-hub>=0.15 # Model hub integration for Hugging Face
19
+
20
+ # Data Handling
21
+ pandas>=1.3 # Data manipulation and structured data handling
22
+ scipy>=1.7 # Scientific computing for advanced numerical routines
utils/file_utils.py ADDED
@@ -0,0 +1,17 @@
1
+ import os
2
+ import logging
3
+
4
+ logger = logging.getLogger(__name__)
5
+
6
+ def ensure_dir(directory):
7
+ """
8
+ Ensures the given directory exists. Creates it if it does not.
9
+
10
+ Args:
11
+ directory (str): The directory path to check or create.
12
+ """
13
+ if not os.path.exists(directory):
14
+ logger.info(f"Creating directory: {directory}")
15
+ os.makedirs(directory)
16
+ else:
17
+ logger.info(f"Directory already exists: {directory}")
utils/math_utils.py ADDED
@@ -0,0 +1,17 @@
1
+ import numpy as np
2
+ import logging
3
+
4
+ logger = logging.getLogger(__name__)
5
+
6
+ def normalize_array(arr):
7
+ """
8
+ Normalizes a numpy array to the range [0, 1].
9
+
10
+ Args:
11
+ arr (numpy.ndarray): The array to normalize.
12
+
13
+ Returns:
14
+ numpy.ndarray: The normalized array.
15
+ """
16
+ logger.info("Normalizing array to range [0, 1].")
17
+ return (arr - np.min(arr)) / (np.max(arr) - np.min(arr))
utils/model_downloader.py ADDED
@@ -0,0 +1,46 @@
1
+ import os
2
+ import urllib.request
3
+ import logging
4
+
5
+ # Configure Logger
6
+ logger = logging.getLogger(__name__)
7
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
8
+
9
+ # Model URLs for downloading if not present locally
10
+ MODEL_URLS = {
11
+ "dpt_hybrid_384": "https://github.com/isl-org/MiDaS/releases/download/v3/dpt_hybrid_384.pt",
12
+ "midas_v21_small_256": "https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt",
13
+ "yolov5n-seg": "https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5n-seg.pt",
14
+ "yolov5s-seg": "https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5s-seg.pt",
15
+ }
16
+
17
+
18
+ def download_model_if_needed(model_key: str, save_path: str):
19
+ """
20
+ Downloads a model file if it does not already exist.
21
+
22
+ Args:
23
+ model_key (str): The key representing the model in MODEL_URLS.
24
+ save_path (str): The local path where the model should be saved.
25
+
26
+ Raises:
27
+ ValueError: If the model_key does not exist in MODEL_URLS.
28
+ """
29
+ url = MODEL_URLS.get(model_key)
30
+
31
+ if not url:
32
+ logger.error(f"Model key '{model_key}' is not defined in MODEL_URLS.")
33
+ raise ValueError(f"No URL configured for model key: {model_key}")
34
+
35
+ if os.path.exists(save_path):
36
+ logger.info(f"Model '{model_key}' already exists at '{save_path}'. Skipping download.")
37
+ return
38
+
39
+ try:
40
+ os.makedirs(os.path.dirname(save_path), exist_ok=True)
41
+ logger.info(f"Downloading '{model_key}' from '{url}' to '{save_path}'")
42
+ urllib.request.urlretrieve(url, save_path)
43
+ logger.info(f"Successfully downloaded '{model_key}' to '{save_path}'")
44
+ except Exception as e:
45
+ logger.error(f"Failed to download '{model_key}': {e}")
46
+ raise
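A minimal sketch; `midas_v21_small_256` is one of the keys defined in `MODEL_URLS`, and the second call is skipped because the file already exists:

```python
from utils.model_downloader import download_model_if_needed

path = "models/depth/weights/midas_v21_small_256.pt"
download_model_if_needed("midas_v21_small_256", path)  # downloads on first run
download_model_if_needed("midas_v21_small_256", path)  # logs "already exists" and returns
```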
utils/video_utils.py ADDED
@@ -0,0 +1,34 @@
1
+ import cv2
2
+ import os
3
+ import tempfile
4
+ import logging
5
+ from typing import List
6
+
7
+ logging.basicConfig(level=logging.INFO)
8
+ logger = logging.getLogger(__name__)
9
+
10
+ def extract_frames(video_path: str, skip: int = 1) -> List:
11
+ """
12
+ Extract frames from a video.
13
+
14
+ Args:
15
+ video_path (str): Path to the video file.
16
+ skip (int): Number of frames to skip between extractions.
17
+
18
+ Returns:
19
+ List of BGR frames as numpy arrays.
20
+ """
21
+ logger.info(f"Extracting frames from video: {video_path}")
22
+ frames = []
23
+ cap = cv2.VideoCapture(video_path)
24
+ frame_count = 0
25
+ while True:
26
+ ret, frame = cap.read()
27
+ if not ret:
28
+ break
29
+ if frame_count % skip == 0:
30
+ frames.append(frame)
31
+ frame_count += 1
32
+ cap.release()
33
+ logger.info(f"Extracted {len(frames)} frames")
34
+ return frames
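A usage sketch (the input path `clip.mp4` is only an example); frames come back as numpy arrays in OpenCV's BGR channel order:

```python
import cv2
from utils.video_utils import extract_frames

frames = extract_frames("clip.mp4", skip=10)  # keep every 10th frame
print(f"Sampled {len(frames)} frames")
if frames:
    cv2.imwrite("first_sample.png", frames[0])  # BGR is what imwrite expects
```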