DurgaDeepak committed
Commit b23251f · verified · 1 Parent(s): eb90293

Main Commit

Files from private GitHub repo

.gitattributes CHANGED
@@ -1,35 +1,7 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ # Auto detect text files and perform LF normalization
2
+ * text=auto
3
+ *.pt filter=lfs diff=lfs merge=lfs -text
4
+ .venv/Scripts/python.exe filter=lfs diff=lfs merge=lfs -text
5
+ .venv/Scripts/pythonw.exe filter=lfs diff=lfs merge=lfs -text
6
+ assets/sample_images/Street_in_Japan.jpg filter=lfs diff=lfs merge=lfs -text
7
+ assets/ui/logo.png filter=lfs diff=lfs merge=lfs -text
 
.venv/Scripts/python.exe ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:beefaea165effa6069ba50bdd4d3a5cb7bcd6173629dd879af45985129e9038b
3
+ size 242920
.venv/Scripts/pythonw.exe ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f67a7ae6f44fa2c2892ad83757baaf18b5b3be9f6becac66d6d6fea41c19819
3
+ size 232688
.venv/pyvenv.cfg ADDED
@@ -0,0 +1,3 @@
1
+ home = D:\
2
+ include-system-site-packages = false
3
+ version = 3.10.0
Dockerfile ADDED
@@ -0,0 +1,23 @@
1
+ # Base Python image
2
+ FROM python:3.10-slim
3
+
4
+ # Install OS dependencies
5
+ RUN apt-get update && apt-get install -y \
6
+ libgl1-mesa-glx \
7
+ && rm -rf /var/lib/apt/lists/*
8
+
9
+ # Set working directory
10
+ WORKDIR /app
11
+
12
+ # Copy all files
13
+ COPY . .
14
+
15
+ # Install Python dependencies
16
+ RUN pip install --no-cache-dir -r requirements.txt
17
+
18
+ # Expose port (Streamlit default)
19
+ EXPOSE 8501
20
+
21
+ # Run Streamlit app
22
+ CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
23
+
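For local testing, the image can be built and started with standard Docker commands, e.g. `docker build -t uvis .` followed by `docker run -p 8501:8501 uvis` (the `uvis` tag is only an example). Note that the CMD launches Streamlit on port 8501, while `app.py` in this commit builds a Gradio interface, so the container entry point and the UI framework should be kept consistent.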
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md CHANGED
@@ -1,14 +1,94 @@
1
- ---
2
- title: UVIS
3
- emoji: 📚
4
- colorFrom: green
5
- colorTo: red
6
- sdk: gradio
7
- sdk_version: 5.29.0
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- short_description: Unified Visual Intelligence System
12
- ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+
2
+ # UVIS - Unified Visual Intelligence System
3
+
4
+ ### A Lightweight Web-Based Visual Perception Demo
5
+
6
+ > **Try it online**: [uvis.deecoded.io](https://uvis.deecoded.io)
7
+ > **GitHub**: [github.com/DurgaDeepakValluri/UVIS](https://github.com/DurgaDeepakValluri/UVIS)
8
+
9
+ ---
10
+
11
+ ## Overview
12
+
13
+ **UVIS** (Unified Visual Intelligence System) is a **lightweight, web-based visual perception demo**, originally conceived as a **spin-off while building Percepta**, a larger modular perception framework.
14
+
15
+ The goal of UVIS is to make **scene understanding tools more accessible**, allowing anyone to try object detection, semantic segmentation, and depth estimation through a clean web interface, without requiring local setup.
16
+
17
+ UVIS currently runs on **[Render.com](https://www.render.com)'s Free Tier**, using **lightweight models** to ensure the experience remains stable on limited resources.
18
+
19
+ ---
20
+
21
+ ## Key Features
22
+
23
+ | Capability | Description |
24
+ | ---------------------------- | ----------------------------------------------------------------------------------- |
25
+ | 🟢 **Object Detection** | YOLOv5-Nano & YOLOv5-Small for fast, low-resource detection. |
26
+ | 🟢 **Semantic Segmentation** | SegFormer-B0 and DeepLabV3-ResNet50 for general-purpose scenes. |
27
+ | 🟢 **Depth Estimation** | MiDaS Small & DPT Lite for per-pixel depth estimation. |
28
+ | 🖼️ **Scene Blueprint** | Unified overlay combining all selected tasks. |
29
+ | 📊 **Scene Metrics** | Scene complexity scoring and agent-friendly summaries. |
30
+ | 📦 **Downloadable Results** | JSON, overlay images, and ZIP bundles. |
31
+ | 🌐 **Web-First Design** | No installation needed—hosted live at [uvis.deecoded.io](https://uvis.deecoded.io). |
32
+ | 🛠️ **Open Source** | Contribution-friendly, easy to extend and improve. |
33
+
34
+ ---
35
+
36
+ ### Current Limitations & Roadmap
37
+
38
+ UVIS is designed for **lightweight demos** on **free-tier hosting**, which means:
39
+
40
+ * Models are optimized for speed and minimal compute.
41
+ * Only **image input** is supported at this time.
42
+
43
+ > As the project grows and higher hosting tiers become available, the roadmap includes:
44
+ >
45
+ > * **Video input support**
46
+ > * **Lightweight SLAM**
47
+ > * **Natural language scene descriptions**
48
+ > * **Higher-capacity, more accurate models**
49
+
50
+ ---
51
+
52
+ ## Architecture Highlights
53
+
54
+ * **Modular Python Backend with Model Registry**
55
+ * **Streamlit-Based Interactive Web UI**
56
+ * **HuggingFace Transformers & TorchVision Integration**
57
+ * **Lightweight Model Support (Render-Compatible)**
58
+ * **Structured JSON Output for AI Agents**
59
+ * **Robust Error Handling and Logging**
60
+
61
+ ---
62
+
63
+ ## 🤝 Contributing
64
+
65
+ UVIS is **open-source** and welcomes contributions.
66
+ You can:
67
+
68
+ * Suggest new features
69
+ * Improve the web interface
70
+ * Extend perception tasks
71
+ * Report issues or bugs
72
+
73
+ ### 💻 **Clone and Run Locally**
74
+
75
+ ```bash
76
+ git clone https://github.com/DurgaDeepakValluri/UVIS.git
77
+ cd UVIS
78
+ pip install -r requirements.txt
79
+ ```
80
+
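After installing the requirements, the demo can be started locally with `python app.py`, which runs the Gradio interface launched at the bottom of `app.py` (the provided Dockerfile instead serves the app through Streamlit on port 8501).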
81
+ ---
82
+
83
+ ## 🌐 Live Demo
84
+
85
+ > **Explore it online at [uvis.deecoded.io](https://uvis.deecoded.io)**
86
+ > Upload an image, select your tasks, and view the results—all in your browser.
87
+
88
+ ---
89
+
90
+ ## 📝 License
91
+
92
+ Apache 2.0 License. Free for personal and commercial use with attribution.
93
+ © 2025 Durga Deepak Valluri
94
+
app.py ADDED
@@ -0,0 +1,464 @@
1
+ # UVIS - Gradio App with Upload, URL & Video Support
2
+ """
3
+ This script launches the UVIS (Unified Visual Intelligence System) as a Gradio Web App.
4
+ Supports image, video, and URL-based media inputs for detection, segmentation, and depth estimation.
5
+ Outputs include scene blueprint, structured JSON, and downloadable results.
6
+ """
7
+
8
+ import gradio as gr
9
+ from PIL import Image
10
+ import numpy as np
11
+ import os
12
+ import io
13
+ import zipfile
14
+ import json
15
+ import tempfile
16
+ import logging
17
+ import cv2
18
+ import requests
19
+ from urllib.parse import urlparse
20
+ from registry import get_model
21
+ from core.describe_scene import describe_scene
22
+ import uuid
23
+ import time
24
+ import timeout_decorator
25
+ import socket
26
+ import ipaddress
27
+
28
+ # Setup logging
29
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
30
+ logger = logging.getLogger(__name__)
31
+
32
+ # Model mappings
33
+ DETECTION_MODEL_MAP = {
34
+ "YOLOv5-Nano": "yolov5n-seg",
35
+ "YOLOv5-Small": "yolov5s-seg",
36
+ "YOLOv8-Small": "yolov8s",
37
+ "YOLOv8-Large": "yolov8l",
38
+ "RT-DETR": "rtdetr" # For future support
39
+ }
40
+
41
+ SEGMENTATION_MODEL_MAP = {
42
+ "SegFormer-B0": "nvidia/segformer-b0-finetuned-ade-512-512",
43
+ "SegFormer-B5": "nvidia/segformer-b5-finetuned-ade-512-512",
44
+ "DeepLabV3-ResNet50": "deeplabv3_resnet50"
45
+ }
46
+
47
+ DEPTH_MODEL_MAP = {
48
+ "MiDaS v21 Small 256": "midas_v21_small_256",
49
+ "MiDaS v21 384": "midas_v21_384",
50
+ "DPT Hybrid 384": "dpt_hybrid_384",
51
+ "DPT Swin2 Large 384": "dpt_swin2_large_384",
52
+ "DPT Beit Large 512": "dpt_beit_large_512"
53
+ }
54
+
55
+ # Resource Limits
56
+ MAX_IMAGE_MB = 5
57
+ MAX_IMAGE_RES = (1920, 1080)
58
+ MAX_VIDEO_MB = 50
59
+ MAX_VIDEO_DURATION = 30 # seconds
60
+
61
+ # Utility Functions
62
+ def format_error(message):
63
+ """Formats error messages for consistent user feedback."""
64
+ return {"error": message}
65
+
66
+ def toggle_visibility(show, *components):
67
+ """Toggles visibility for multiple Gradio components."""
68
+ return [gr.update(visible=show) for _ in components]
69
+
70
+ def generate_session_id():
71
+ """Generates a unique session ID for tracking inputs."""
72
+ return str(uuid.uuid4())
73
+
74
+ def log_runtime(start_time):
75
+ """Logs the runtime of a process."""
76
+ elapsed_time = time.time() - start_time
77
+ logger.info(f"Process completed in {elapsed_time:.2f} seconds.")
78
+ return elapsed_time
79
+
80
+ def is_public_ip(url):
81
+ """
82
+ Checks whether the resolved IP address of a URL is public (non-local).
83
+ Prevents SSRF by blocking internal addresses like 127.0.0.1 or 192.168.x.x.
84
+ """
85
+ try:
86
+ hostname = urlparse(url).hostname
87
+ ip = socket.gethostbyname(hostname)
88
+ ip_obj = ipaddress.ip_address(ip)
89
+ return ip_obj.is_global # Only allow globally routable IPs
90
+ except Exception as e:
91
+ logger.warning(f"URL IP validation failed: {e}")
92
+ return False
93
+
94
+
95
+ def fetch_media_from_url(url):
96
+ """
97
+ Downloads media from a URL. Supports images and videos.
98
+ Returns PIL.Image or video file path.
99
+ """
100
+ logger.info(f"Fetching media from URL: {url}")
101
+ if not is_public_ip(url):
102
+ logger.warning("Blocked non-public URL request (possible SSRF).")
103
+ return None
104
+
105
+ try:
106
+ parsed_url = urlparse(url)
107
+ ext = os.path.splitext(parsed_url.path)[-1].lower()
108
+ headers = {"User-Agent": "Mozilla/5.0"}
109
+ r = requests.get(url, headers=headers, timeout=10)
110
+
111
+ if r.status_code != 200 or len(r.content) > 50 * 1024 * 1024:
112
+ logger.warning(f"Download failed or file too large.")
113
+ return None
114
+
115
+ tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=ext)
116
+ tmp_file.write(r.content)
117
+ tmp_file.close()
118
+
119
+ if ext in [".jpg", ".jpeg", ".png"]:
120
+ return Image.open(tmp_file.name).convert("RGB")
121
+ elif ext in [".mp4", ".avi", ".mov"]:
122
+ return tmp_file.name
123
+ else:
124
+ logger.warning("Unsupported file type from URL.")
125
+ return None
126
+ except Exception as e:
127
+ logger.error(f"URL fetch failed: {e}")
128
+ return None
129
+
130
+ # Input Validation Functions
131
+ def validate_image(img):
132
+ """
133
+ Validates the uploaded image based on size and resolution limits.
134
+
135
+ Args:
136
+ img (PIL.Image.Image): Image to validate.
137
+
138
+ Returns:
139
+ Tuple[bool, str or None]: (True, None) if valid; (False, reason) otherwise.
140
+ """
141
+ logger.info("Validating uploaded image.")
142
+ try:
143
+ buffer = io.BytesIO()
144
+ img.save(buffer, format="PNG")
145
+ size_mb = len(buffer.getvalue()) / (1024 * 1024)
146
+
147
+ if size_mb > MAX_IMAGE_MB:
148
+ logger.warning("Image exceeds size limit of 5MB.")
149
+ return False, "Image exceeds 5MB limit."
150
+
151
+ if img.width > MAX_IMAGE_RES[0] or img.height > MAX_IMAGE_RES[1]:
152
+ logger.warning("Image resolution exceeds 1920x1080.")
153
+ return False, "Image resolution exceeds 1920x1080."
154
+
155
+ logger.info("Image validation passed.")
156
+ return True, None
157
+ except Exception as e:
158
+ logger.error(f"Error validating image: {e}")
159
+ return False, str(e)
160
+
161
+ def validate_video(path):
162
+ """
163
+ Validates the uploaded video based on size and duration limits.
164
+
165
+ Args:
166
+ path (str): Path to the video file.
167
+
168
+ Returns:
169
+ Tuple[bool, str or None]: (True, None) if valid; (False, reason) otherwise.
170
+ """
171
+ logger.info(f"Validating video file at: {path}")
172
+ try:
173
+ size_mb = os.path.getsize(path) / (1024 * 1024)
174
+ if size_mb > MAX_VIDEO_MB:
175
+ logger.warning("Video exceeds size limit of 50MB.")
176
+ return False, "Video exceeds 50MB limit."
177
+
178
+ cap = cv2.VideoCapture(path)
179
+ fps = cap.get(cv2.CAP_PROP_FPS)
180
+ frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
181
+ duration = frames / fps if fps else 0
182
+ cap.release()
183
+
184
+ if duration > MAX_VIDEO_DURATION:
185
+ logger.warning("Video exceeds 30 seconds duration limit.")
186
+ return False, "Video exceeds 30 seconds duration limit."
187
+
188
+ logger.info("Video validation passed.")
189
+ return True, None
190
+ except Exception as e:
191
+ logger.error(f"Error validating video: {e}")
192
+ return False, str(e)
193
+
194
+ # Input Resolution
195
+ def resolve_input(mode, uploaded_img, uploaded_imgs, uploaded_vid, url):
196
+ """
197
+ Resolves the input source based on user selection.
198
+ Supports single image, multiple images, video, or URL-based media.
199
+
200
+ Args:
201
+ mode (str): Input mode - 'Upload' or 'URL'.
202
+ uploaded_img (PIL.Image.Image): Single uploaded image.
203
+ uploaded_imgs (List[PIL.Image.Image]): List of uploaded images (batch).
204
+ uploaded_vid (str): Uploaded video file path.
205
+ url (str): URL pointing to media content.
206
+
207
+ Returns:
208
+ List[Union[PIL.Image.Image, str, None]]: A list of media items to process.
209
+ """
210
+ logger.info(f"Resolving input based on mode: {mode}")
211
+ try:
212
+ if mode == "Upload":
213
+ # Prefer batch if provided
214
+ if uploaded_imgs and len(uploaded_imgs) > 0:
215
+ return uploaded_imgs
216
+ elif uploaded_img:
217
+ return [uploaded_img]
218
+ elif uploaded_vid:
219
+ return [uploaded_vid]
220
+ else:
221
+ logger.warning("No valid upload provided.")
222
+ return None
223
+
224
+ elif mode == "URL":
225
+ media_from_url = fetch_media_from_url(url)
226
+ if media_from_url:
227
+ return [media_from_url]
228
+ else:
229
+ logger.warning("Failed to fetch valid media from URL.")
230
+ return None
231
+
232
+ else:
233
+ logger.warning("Invalid input mode selected.")
234
+ return None
235
+
236
+ except Exception as e:
237
+ logger.error(f"Error resolving input: {e}")
238
+ return None
239
+
240
+ @timeout_decorator.timeout(35, use_signals=False) # 35 sec limit per image
241
+ def process_image(
242
+ image: Image.Image,
243
+ run_det: bool,
244
+ det_model: str,
245
+ det_confidence: float,
246
+ run_seg: bool,
247
+ seg_model: str,
248
+ run_depth: bool,
249
+ depth_model: str,
250
+ blend: float
251
+ ):
252
+ """
253
+ Runs selected perception tasks on the input image and packages results.
254
+
255
+ Args:
256
+ image (PIL.Image): Input image.
257
+ run_det (bool): Run object detection.
258
+ det_model (str): Detection model key.
259
+ det_confidence (float): Detection confidence threshold.
260
+ run_seg (bool): Run segmentation.
261
+ seg_model (str): Segmentation model key.
262
+ run_depth (bool): Run depth estimation.
263
+ depth_model (str): Depth model key.
264
+ blend (float): Overlay blend alpha (0.0 - 1.0).
265
+
266
+ Returns:
267
+ Tuple[Image, dict, Tuple[str, bytes]]: Final image, scene JSON, and downloadable ZIP.
268
+ """
269
+ logger.info("Starting image processing pipeline.")
270
+ start_time = time.time()
271
+ outputs, scene = {}, {}
272
+ combined_np = np.array(image)
273
+
274
+ try:
275
+ # Detection
276
+ if run_det:
277
+ logger.info(f"Running detection with model: {det_model}")
278
+ load_start = time.time()
279
+ model = get_model("detection", DETECTION_MODEL_MAP[det_model], device="cpu")
280
+ logger.info(f"{det_model} detection model loaded in {time.time() - load_start:.2f} seconds.")
281
+ boxes = model.predict(image, conf_threshold=det_confidence)
282
+ overlay = model.draw(image, boxes)
283
+ combined_np = np.array(overlay)
284
+ buf = io.BytesIO()
285
+ overlay.save(buf, format="PNG")
286
+ outputs["detection.png"] = buf.getvalue()
287
+ scene["detection"] = boxes
288
+
289
+ # Segmentation
290
+ if run_seg:
291
+ logger.info(f"Running segmentation with model: {seg_model}")
292
+ load_start = time.time()
293
+ model = get_model("segmentation", SEGMENTATION_MODEL_MAP[seg_model], device="cpu")
294
+ logger.info(f"{seg_model} segmentation model loaded in {time.time() - load_start:.2f} seconds.")
295
+ mask = model.predict(image)
296
+ overlay = model.draw(image, mask, alpha=blend)
297
+ combined_np = cv2.addWeighted(combined_np, 1 - blend, np.array(overlay), blend, 0)
298
+ buf = io.BytesIO()
299
+ overlay.save(buf, format="PNG")
300
+ outputs["segmentation.png"] = buf.getvalue()
301
+ scene["segmentation"] = mask.tolist()
302
+
303
+ # Depth Estimation
304
+ if run_depth:
305
+ logger.info(f"Running depth estimation with model: {depth_model}")
306
+ load_start = time.time()
307
+ model = get_model("depth", DEPTH_MODEL_MAP[depth_model], device="cpu")
308
+ logger.info(f"{depth_model} depth model loaded in {time.time() - load_start:.2f} seconds.")
309
+ dmap = model.predict(image)
310
+ norm_dmap = ((dmap - dmap.min()) / (dmap.ptp()) * 255).astype(np.uint8)
311
+ d_pil = Image.fromarray(norm_dmap)
312
+ combined_np = cv2.addWeighted(combined_np, 1 - blend, np.array(d_pil.convert("RGB")), blend, 0)
313
+ buf = io.BytesIO()
314
+ d_pil.save(buf, format="PNG")
315
+ outputs["depth_map.png"] = buf.getvalue()
316
+ scene["depth"] = dmap.tolist()
317
+
318
+ # Final image overlay
319
+ final_img = Image.fromarray(combined_np)
320
+ buf = io.BytesIO()
321
+ final_img.save(buf, format="PNG")
322
+ outputs["scene_blueprint.png"] = buf.getvalue()
323
+
324
+ # Scene description
325
+ try:
326
+ scene_json = describe_scene(**scene)
327
+ except Exception as e:
328
+ logger.warning(f"describe_scene failed: {e}")
329
+ scene_json = {"error": str(e)}
330
+ telemetry = {
331
+ "session_id": generate_session_id(),
332
+ "runtime_sec": round(log_runtime(start_time), 2),
333
+ "used_models": {
334
+ "detection": det_model if run_det else None,
335
+ "segmentation": seg_model if run_seg else None,
336
+ "depth": depth_model if run_depth else None
337
+ }
338
+ }
339
+ scene_json["telemetry"] = telemetry
340
+
341
+ outputs["scene_description.json"] = json.dumps(scene_json, indent=2).encode("utf-8")
342
+
343
+ # ZIP file creation
344
+ zip_buf = io.BytesIO()
345
+ with zipfile.ZipFile(zip_buf, "w") as zipf:
346
+ for name, data in outputs.items():
347
+ zipf.writestr(name, data)
348
+
349
+ elapsed = log_runtime(start_time)
350
+ logger.info(f"Image processing completed in {elapsed:.2f} seconds.")
351
+
352
+ return final_img, scene_json, ("uvis_results.zip", zip_buf.getvalue())
353
+
354
+ except Exception as e:
355
+ logger.error(f"Error in processing pipeline: {e}")
356
+ return None, {"error": str(e)}, None
357
+
358
+ # Main Handler
359
+ def handle(mode, img, imgs, vid, url, run_det, det_model, det_confidence, run_seg, seg_model, run_depth, depth_model, blend):
360
+ """
361
+ Master handler for resolving input and processing.
362
+ Returns outputs for Gradio interface.
363
+ """
364
+ session_id = generate_session_id()
365
+ logger.info(f"Session ID: {session_id} | Handler activated with mode: {mode}")
366
+ start_time = time.time()
367
+
368
+ media = resolve_input(mode, img, imgs, vid, url)
369
+ if not media:
370
+ return None, format_error("No valid input provided. Please check your upload or URL."), None
371
+
372
+ results = []
373
+ for single_media in media:
374
+ if isinstance(single_media, str): # Video file
375
+ valid, err = validate_video(single_media)
376
+ if not valid:
377
+ return None, format_error(err), None
378
+ cap = cv2.VideoCapture(single_media)
379
+ ret, frame = cap.read()
380
+ cap.release()
381
+ if not ret:
382
+ return None, format_error("Failed to read video frame."), None
383
+ single_media = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
384
+
385
+ if isinstance(single_media, Image.Image):
386
+ valid, err = validate_image(single_media)
387
+ if not valid:
388
+ return None, format_error(err), None
389
+ try:
390
+ return process_image(single_media, run_det, det_model, det_confidence, run_seg, seg_model, run_depth, depth_model, blend)
391
+ except timeout_decorator.timeout_decorator.TimeoutError:
392
+ logger.error("Image processing timed out.")
393
+ return None, format_error("Processing timed out. Try a smaller image or simpler model."), None
394
+
395
+ logger.warning("Unsupported media type resolved.")
396
+ log_runtime(start_time)
397
+ return None, format_error("Invalid input. Please check your upload or URL."), None
398
+
399
+ # Gradio Interface
400
+ with gr.Blocks() as demo:
401
+ gr.Markdown("## Unified Visual Intelligence System (UVIS)")
402
+
403
+ # Input Mode Selection
404
+ mode = gr.Radio(["Upload", "URL"], value="Upload", label="Input Mode")
405
+ img = gr.Image(type="pil", label="Upload Image")
406
+ imgs = gr.Gallery(label="Upload Multiple Images (Up to 5)").style(grid=[5], height="auto")
407
+ vid = gr.Video(label="Upload Video (<= 30s)")
408
+ url = gr.Textbox(label="URL (Image/Video)")
409
+
410
+ # Task Selection with parameters
411
+ with gr.Accordion("Object Detection Settings", open=False):
412
+ run_det = gr.Checkbox(label="Enable Object Detection")
413
+ det_model = gr.Dropdown(list(DETECTION_MODEL_MAP), label="Detection Model", visible=False)
414
+ det_confidence = gr.Slider(0.1, 1.0, 0.5, label="Detection Confidence Threshold", visible=False)
415
+
416
+ with gr.Accordion("Semantic Segmentation Settings", open=False):
417
+ run_seg = gr.Checkbox(label="Enable Segmentation")
418
+ seg_model = gr.Dropdown(list(SEGMENTATION_MODEL_MAP), label="Segmentation Model", visible=False)
419
+
420
+ with gr.Accordion("Depth Estimation Settings", open=False):
421
+ run_depth = gr.Checkbox(label="Enable Depth Estimation")
422
+ depth_model = gr.Dropdown(list(DEPTH_MODEL_MAP), label="Depth Model", visible=False)
423
+
424
+ blend = gr.Slider(0.0, 1.0, 0.5, label="Overlay Blend")
425
+
426
+ # Run Button
427
+ run = gr.Button("Run Analysis")
428
+
429
+ # Output Tabs
430
+ with gr.Tab("Scene JSON"):
431
+ json_out = gr.JSON()
432
+ with gr.Tab("Scene Blueprint"):
433
+ img_out = gr.Image()
434
+ with gr.Tab("Download"):
435
+ zip_out = gr.File()
436
+
437
+ # Attach Visibility Logic
438
+ run_det.change(lambda show: toggle_visibility(show, det_model, det_confidence), run_det, [det_model, det_confidence])
439
+ run_seg.change(lambda show: gr.update(visible=show), run_seg, [seg_model])
440
+ run_depth.change(lambda show: gr.update(visible=show), run_depth, [depth_model])
441
+
442
+ # Button Click Event
443
+ run.click(
444
+ handle,
445
+ inputs=[mode, img, imgs, vid, url, run_det, det_model, det_confidence, run_seg, seg_model, run_depth, depth_model, blend],
446
+ outputs=[img_out, json_out, zip_out]
447
+ )
448
+
449
+ # Footer Section
450
+ gr.Markdown("---")
451
+ gr.Markdown(
452
+ """
453
+ <div style='text-align: center; font-size: 14px;'>
454
+ Built by <b>Durga Deepak Valluri</b><br>
455
+ <a href="https://github.com/DurgaDeepakValluri/UVIS" target="_blank">GitHub</a> |
456
+ <a href="https://deecoded.io" target="_blank">Website</a> |
457
+ <a href="https://www.linkedin.com/in/durga-deepak-valluri" target="_blank">LinkedIn</a>
458
+ </div>
459
+ """,
460
+ unsafe_allow_html=True
461
+ )
462
+
463
+ # Launch the Gradio App
464
+ demo.launch()
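For reference, the approximate shape of the `scene_description.json` written into the results ZIP, assembled from `describe_scene()` plus the telemetry block above; only the keys for tasks that were enabled appear, and all values below are illustrative rather than real output:

```python
# Illustrative structure only; keys follow describe_scene() and the telemetry dict in process_image().
scene_json = {
    "scene_summary": {
        "objects": [
            {"class_name": "person", "confidence": 0.87, "bbox": [12.0, 34.0, 200.0, 380.0]}
        ],
        "detection_metrics": {"objects_detected": 1, "average_confidence": 0.87},
        "segmentation_summary": [{"class_id": 12, "coverage": "41.27%"}],
        "dominant_class": {"class_id": 12, "coverage": "41.27%"},
        "depth_summary": {"mean_depth": 8.4, "min_depth": 0.3, "max_depth": 27.1, "std_depth": 5.2},
    },
    "telemetry": {
        "session_id": "a-uuid4-string",
        "runtime_sec": 4.21,
        "used_models": {"detection": "YOLOv5-Nano", "segmentation": None, "depth": "MiDaS v21 Small 256"},
    },
}
```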
assets/sample_images/Man_in_office.jpg ADDED
assets/sample_images/Street_in_Japan.jpg ADDED

Git LFS Details

  • SHA256: 4f0d7c53300b806c8ea726b72780766b4bea1226ba1bf6719d106c0e26547b65
  • Pointer size: 131 Bytes
  • Size of remote file: 281 kB
assets/ui/logo.png ADDED

Git LFS Details

  • SHA256: ff54ef43a6828cf3c8590d1327b47c24b67adf0e0f49425ef8c81950980745bc
  • Pointer size: 132 Bytes
  • Size of remote file: 1.01 MB
core/describe_scene.py ADDED
@@ -0,0 +1,59 @@
1
+ import numpy as np
2
+ import logging
3
+
4
+ logger = logging.getLogger(__name__)
5
+
6
+ def describe_scene(detection=None, segmentation=None, depth=None):
7
+ """
8
+ Generates a structured scene summary with metrics for detection, segmentation, and depth.
9
+
10
+ Args:
11
+ detection (list): List of detected objects with class names and bounding boxes.
12
+ segmentation (numpy.ndarray): Segmentation mask as a 2D numpy array.
13
+ depth (numpy.ndarray): Depth map as a 2D numpy array.
14
+
15
+ Returns:
16
+ dict: Structured scene description with metrics.
17
+ """
18
+ logger.info("Generating scene summary...")
19
+ description = {"scene_summary": {}}
20
+
21
+ # Detection Summary with Metrics
22
+ if detection:
23
+ logger.info("Adding detection results to scene summary.")
24
+ description["scene_summary"]["objects"] = detection
25
+ confidences = [obj.get("confidence", 0) for obj in detection]
26
+ description["scene_summary"]["detection_metrics"] = {
27
+ "objects_detected": len(detection),
28
+ "average_confidence": float(np.mean(confidences)) if confidences else 0.0
29
+ }
30
+
31
+ # Segmentation Summary with Coverage Metrics
32
+ if segmentation is not None:
33
+ logger.info("Summarizing segmentation coverage.")
34
+ unique, counts = np.unique(segmentation, return_counts=True)
35
+ total = np.size(segmentation)  # np.size also handles plain Python lists (app.py passes mask.tolist())
36
+ coverage = [
37
+ {"class_id": int(class_id), "coverage": f"{(count / total) * 100:.2f}%"}
38
+ for class_id, count in zip(unique, counts)
39
+ ]
40
+ dominant_class = max(coverage, key=lambda x: float(x["coverage"].strip('%')))
41
+ description["scene_summary"]["segmentation_summary"] = coverage
42
+ description["scene_summary"]["dominant_class"] = dominant_class
43
+
44
+ # Depth Summary with Metrics
45
+ if depth is not None:
46
+ logger.info("Summarizing depth information.")
47
+ mean_depth = float(np.mean(depth))
48
+ min_depth = float(np.min(depth))
49
+ max_depth = float(np.max(depth))
50
+ std_depth = float(np.std(depth))
51
+ description["scene_summary"]["depth_summary"] = {
52
+ "mean_depth": mean_depth,
53
+ "min_depth": min_depth,
54
+ "max_depth": max_depth,
55
+ "std_depth": std_depth
56
+ }
57
+
58
+ logger.info("Scene summary generation complete.")
59
+ return description
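A minimal, self-contained sketch of calling `describe_scene` with toy inputs (the detection dicts mirror what `ObjectDetector.predict` returns; the arrays are synthetic):

```python
import numpy as np
from core.describe_scene import describe_scene

detections = [
    {"class_name": "car", "confidence": 0.91, "bbox": [5, 10, 120, 80]},
    {"class_name": "person", "confidence": 0.74, "bbox": [130, 20, 180, 150]},
]
seg_mask = np.zeros((64, 64), dtype=np.int64)
seg_mask[:, 32:] = 12                      # right half of the image belongs to class 12
depth_map = np.random.rand(64, 64) * 10.0  # synthetic relative depth

summary = describe_scene(detection=detections, segmentation=seg_mask, depth=depth_map)
print(summary["scene_summary"]["detection_metrics"])  # objects_detected=2, average_confidence=0.825
print(summary["scene_summary"]["depth_summary"])      # mean/min/max/std of the depth map
```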
models/__init__.py ADDED
File without changes
models/depth/depth_estimator.py ADDED
@@ -0,0 +1,85 @@
1
+ import os
2
+ import torch
3
+ import numpy as np
4
+ from PIL import Image
5
+ import logging
6
+ from utils.model_downloader import download_model_if_needed
7
+
8
+ # Configure Logger
9
+ logger = logging.getLogger(__name__)
10
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
11
+
12
+
13
+ class DepthEstimator:
14
+ """
15
+ Generalized Depth Estimation Model Wrapper for MiDaS and DPT models.
16
+ Supports: MiDaS v2.1 Small, MiDaS v2.1 Large, DPT Hybrid, DPT Large.
17
+ """
18
+
19
+ def __init__(self, model_key="midas_v21_small_256", weights_dir="models/depth/weights", device="cpu"):
20
+ """
21
+ Initialize the Depth Estimation model.
22
+
23
+ Args:
24
+ model_key (str): Model identifier as defined in model_downloader.py.
25
+ weights_dir (str): Directory to store/download model weights.
26
+ device (str): Inference device ("cpu" or "cuda").
27
+ """
28
+ weights_path = os.path.join(weights_dir, f"{model_key}.pt")
29
+ download_model_if_needed(model_key, weights_path)
30
+
31
+ logger.info(f"Loading Depth model '{model_key}' from MiDaS hub")
32
+ self.device = device
33
+ self.model_type = self._resolve_model_type(model_key)
34
+ self.midas = torch.hub.load("intel-isl/MiDaS", self.model_type).to(self.device).eval()
35
+ self.transform = self._resolve_transform()
36
+
37
+ def _resolve_model_type(self, model_key):
38
+ """
39
+ Maps model_key to MiDaS hub model type.
40
+ """
41
+ mapping = {
42
+ "midas_v21_small_256": "MiDaS_small",
43
+ "midas_v21_384": "MiDaS",
44
+ "dpt_hybrid_384": "DPT_Hybrid",
45
+ "dpt_large_384": "DPT_Large",
46
+ "dpt_swin2_large_384": "DPT_Large", # fallback to DPT_Large if not explicitly supported
47
+ "dpt_beit_large_512": "DPT_Large", # fallback to DPT_Large if not explicitly supported
48
+ }
49
+ return mapping.get(model_key, "MiDaS_small")
50
+
51
+ def _resolve_transform(self):
52
+ """
53
+ Returns the correct transformation pipeline based on model type.
54
+ """
55
+ transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
56
+ if self.model_type == "MiDaS_small":
57
+ return transforms.small_transform
58
+ else:
59
+ return transforms.default_transform
60
+
61
+ def predict(self, image: Image.Image):
62
+ """
63
+ Generates a depth map for the given image.
64
+
65
+ Args:
66
+ image (PIL.Image.Image): Input image.
67
+
68
+ Returns:
69
+ np.ndarray: Depth map as a 2D numpy array.
70
+ """
71
+ logger.info("Running depth estimation")
72
+ input_tensor = self.transform(np.array(image)).to(self.device)  # MiDaS hub transforms expect an RGB numpy array, not a PIL image
73
+
74
+ with torch.no_grad():
75
+ prediction = self.midas(input_tensor)
76
+ prediction = torch.nn.functional.interpolate(
77
+ prediction.unsqueeze(1),
78
+ size=image.size[::-1],
79
+ mode="bicubic",
80
+ align_corners=False,
81
+ ).squeeze()
82
+
83
+ depth_map = prediction.cpu().numpy()
84
+ logger.info("Depth estimation completed successfully")
85
+ return depth_map
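A usage sketch, assuming internet access for the `torch.hub` MiDaS download and that the sample image from `assets/` is present locally:

```python
import numpy as np
from PIL import Image
from models.depth.depth_estimator import DepthEstimator

estimator = DepthEstimator(model_key="midas_v21_small_256", device="cpu")
image = Image.open("assets/sample_images/Man_in_office.jpg").convert("RGB")

depth = estimator.predict(image)  # 2-D float array at the image resolution

# Rescale to 0-255 for a quick visual check, mirroring what app.py does for the blueprint overlay.
vis = ((depth - depth.min()) / (depth.max() - depth.min() + 1e-8) * 255).astype(np.uint8)
Image.fromarray(vis).save("depth_preview.png")
```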
models/detection/detector.py ADDED
@@ -0,0 +1,73 @@
1
+ import os
2
+ import numpy as np
3
+ from PIL import Image, ImageDraw
4
+ import logging
5
+ from ultralytics import YOLO
6
+ from utils.model_downloader import download_model_if_needed
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+ class ObjectDetector:
11
+ """
12
+ Generalized Object Detection Wrapper for YOLOv5, YOLOv8, and future variants.
13
+ """
14
+
15
+ def __init__(self, model_key="yolov5n-seg", weights_dir="models/detection/weights", device="cpu"):
16
+ """
17
+ Initialize the Object Detection model.
18
+
19
+ Args:
20
+ model_key (str): Model identifier as defined in model_downloader.py.
21
+ weights_dir (str): Directory to store/download model weights.
22
+ device (str): Inference device ("cpu" or "cuda").
23
+ """
24
+ weights_path = os.path.join(weights_dir, f"{model_key}.pt")
25
+ download_model_if_needed(model_key, weights_path)
26
+
27
+ logger.info(f"Loading Object Detection model '{model_key}' from {weights_path}")
28
+ self.device = device
29
+ self.model = YOLO(weights_path)
30
+
31
+ def predict(self, image: Image.Image, conf_threshold: float = 0.25):
32
+ """
33
+ Run object detection.
34
+
35
+ Args:
36
+ image (PIL.Image.Image): Input image.
37
+
38
+ Returns:
39
+ List[Dict]: List of detected objects with class name, confidence, and bbox.
40
+ """
41
+ logger.info("Running object detection")
42
+ results = self.model(image, conf=conf_threshold)
43
+ detections = []
44
+ for r in results:
45
+ for box in r.boxes:
46
+ detections.append({
47
+ "class_name": r.names[int(box.cls)],
48
+ "confidence": float(box.conf),
49
+ "bbox": box.xyxy[0].tolist()
50
+ })
51
+ logger.info(f"Detected {len(detections)} objects")
52
+ return detections
53
+
54
+ def draw(self, image: Image.Image, detections, alpha=0.5):
55
+ """
56
+ Draw bounding boxes on image.
57
+
58
+ Args:
59
+ image (PIL.Image.Image): Input image.
60
+ detections (List[Dict]): Detection results.
61
+ alpha (float): Blend strength.
62
+
63
+ Returns:
64
+ PIL.Image.Image: Image with bounding boxes drawn.
65
+ """
66
+ overlay = image.copy()
67
+ draw = ImageDraw.Draw(overlay)
68
+ for det in detections:
69
+ bbox = det["bbox"]
70
+ label = f'{det["class_name"]} {det["confidence"]:.2f}'
71
+ draw.rectangle(bbox, outline="red", width=2)
72
+ draw.text((bbox[0], bbox[1]), label, fill="red")
73
+ return Image.blend(image, overlay, alpha)
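A usage sketch, assuming the `yolov5n-seg` weights can be downloaded on first run and that the sample image is available:

```python
from PIL import Image
from models.detection.detector import ObjectDetector

detector = ObjectDetector(model_key="yolov5n-seg", device="cpu")
image = Image.open("assets/sample_images/Street_in_Japan.jpg").convert("RGB")

boxes = detector.predict(image, conf_threshold=0.5)   # [{"class_name", "confidence", "bbox"}, ...]
annotated = detector.draw(image, boxes, alpha=0.7)
annotated.save("detections.png")
```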
models/segmentation/segmenter.py ADDED
@@ -0,0 +1,89 @@
1
+ import logging
2
+ import torch
3
+ from PIL import Image
4
+ import numpy as np
5
+ from torchvision import transforms
6
+ from torchvision.models.segmentation import deeplabv3_resnet50
7
+ from transformers import SegformerForSemanticSegmentation, SegformerFeatureExtractor
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+ class Segmenter:
12
+ """
13
+ Generalized Semantic Segmentation Wrapper for SegFormer and DeepLabV3.
14
+ """
15
+
16
+ def __init__(self, model_key="nvidia/segformer-b0-finetuned-ade-512-512", device="cpu"):
17
+ """
18
+ Initialize the segmentation model.
19
+
20
+ Args:
21
+ model_key (str): Model identifier, e.g., Hugging Face model id or 'deeplabv3_resnet50'.
22
+ device (str): Inference device ("cpu" or "cuda").
23
+ """
24
+ logger.info(f"Initializing segmenter with model: {model_key}")
25
+ self.device = device
26
+ self.model_key = model_key
27
+ self.model, self.processor = self._load_model()
28
+
29
+ def _load_model(self):
30
+ """
31
+ Load the segmentation model and processor.
32
+
33
+ Returns:
34
+ Tuple[torch.nn.Module, Optional[Processor]]
35
+ """
36
+ if "segformer" in self.model_key:
37
+ model = SegformerForSemanticSegmentation.from_pretrained(self.model_key).to(self.device)
38
+ processor = SegformerFeatureExtractor.from_pretrained(self.model_key)
39
+ return model, processor
40
+ elif self.model_key == "deeplabv3_resnet50":
41
+ model = deeplabv3_resnet50(pretrained=True).to(self.device).eval()
42
+ return model, None
43
+ else:
44
+ raise ValueError(f"Unsupported model key: {self.model_key}")
45
+
46
+ def predict(self, image: Image.Image):
47
+ """
48
+ Perform segmentation on the input image.
49
+
50
+ Args:
51
+ image (PIL.Image.Image): Input image.
52
+
53
+ Returns:
54
+ np.ndarray: Segmentation mask.
55
+ """
56
+ logger.info("Running segmentation")
57
+
58
+ if "segformer" in self.model_key:
59
+ inputs = self.processor(images=image, return_tensors="pt").to(self.device)
60
+ outputs = self.model(**inputs)
61
+ mask = outputs.logits.argmax(dim=1).squeeze().cpu().numpy()
62
+ return mask
63
+
64
+ elif self.model_key == "deeplabv3_resnet50":
65
+ transform = transforms.Compose([
66
+ transforms.ToTensor(),
67
+ ])
68
+ inputs = transform(image).unsqueeze(0).to(self.device)
69
+ with torch.no_grad():
70
+ outputs = self.model(inputs)["out"]
71
+ mask = outputs.argmax(1).squeeze().cpu().numpy()
72
+ return mask
73
+
74
+ def draw(self, image: Image.Image, mask: np.ndarray, alpha=0.5):
75
+ """
76
+ Overlay the segmentation mask on the input image.
77
+
78
+ Args:
79
+ image (PIL.Image.Image): Original image.
80
+ mask (np.ndarray): Segmentation mask.
81
+ alpha (float): Blend strength.
82
+
83
+ Returns:
84
+ PIL.Image.Image: Image with mask overlay.
85
+ """
86
+ logger.info("Drawing segmentation overlay")
87
+ mask_img = Image.fromarray((mask * 255 / mask.max()).astype(np.uint8)).convert("L").resize(image.size)
88
+ mask_colored = Image.merge("RGB", (mask_img, mask_img, mask_img))
89
+ return Image.blend(image, mask_colored, alpha)
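A usage sketch covering the two supported backends; the SegFormer checkpoint is fetched from the Hugging Face Hub and DeepLabV3 from torchvision on first use, and the sample image path is an assumption:

```python
from PIL import Image
from models.segmentation.segmenter import Segmenter

image = Image.open("assets/sample_images/Man_in_office.jpg").convert("RGB")

segformer = Segmenter(model_key="nvidia/segformer-b0-finetuned-ade-512-512", device="cpu")
mask = segformer.predict(image)                       # 2-D array of class ids (model-resolution grid)
segformer.draw(image, mask, alpha=0.5).save("segformer_overlay.png")

deeplab = Segmenter(model_key="deeplabv3_resnet50", device="cpu")
deeplab.draw(image, deeplab.predict(image), alpha=0.5).save("deeplab_overlay.png")
```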
registry.py ADDED
@@ -0,0 +1,43 @@
1
+ import logging
2
+
3
+ # Configure Logger
4
+ logger = logging.getLogger(__name__)
5
+
6
+ def get_model(task: str, model_key: str, device="cpu"):
7
+ """
8
+ Dynamically retrieves the model instance based on the task and model_key.
9
+
10
+ Args:
11
+ task (str): One of "detection", "segmentation", or "depth".
12
+ model_key (str): Model identifier or variant.
13
+ device (str): Device to run inference on ("cpu" or "cuda").
14
+
15
+ Returns:
16
+ object: Initialized model ready for inference.
17
+
18
+ Raises:
19
+ ValueError: If task is unsupported or model loading fails.
20
+ """
21
+ logger.info(f"Request received to load model '{model_key}' for task '{task}' on device '{device}'")
22
+
23
+ try:
24
+ if task == "detection":
25
+ from models.detection.detector import ObjectDetector
26
+ return ObjectDetector(model_key=model_key, device=device)
27
+
28
+ elif task == "segmentation":
29
+ from models.segmentation.segmenter import Segmenter
30
+ return Segmenter(model_key=model_key, device=device)
31
+
32
+ elif task == "depth":
33
+ from models.depth.depth_estimator import DepthEstimator
34
+ return DepthEstimator(model_key=model_key, device=device)
35
+
36
+ else:
37
+ error_msg = f"Unsupported task '{task}'. Valid options are: 'detection', 'segmentation', 'depth'."
38
+ logger.error(error_msg)
39
+ raise ValueError(error_msg)
40
+
41
+ except Exception as e:
42
+ logger.error(f"Error while loading model '{model_key}' for task '{task}': {e}")
43
+ raise
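A short sketch of the registry dispatch; constructing a wrapper triggers the corresponding weight download, and an unknown task raises `ValueError`:

```python
from registry import get_model

detector  = get_model("detection", "yolov5n-seg", device="cpu")
segmenter = get_model("segmentation", "deeplabv3_resnet50", device="cpu")
depther   = get_model("depth", "midas_v21_small_256", device="cpu")
# Each wrapper exposes .predict(pil_image); the detector and segmenter also provide .draw(...).

try:
    get_model("pose", "anything")
except ValueError as err:
    print(err)  # Unsupported task 'pose'. Valid options are: 'detection', 'segmentation', 'depth'.
```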
requirements.txt ADDED
@@ -0,0 +1,22 @@
1
+ # Core Libraries
2
+ gradio>=3.50 # Web interface for the application
3
+ torch>=2.0 # PyTorch for deep learning models
4
+ torchvision>=0.15 # TorchVision for pre-trained models and utilities
5
+ ultralytics>=8.0 # YOLO models for object detection
6
+ opencv-python>=4.7 # OpenCV for video and image processing
7
+
8
+ # Utility Libraries
9
+ numpy>=1.21 # Numerical computations
10
+ Pillow>=9.0 # Image processing
11
+ requests>=2.28 # HTTP requests for fetching media
12
+ timeout-decorator>=0.5.0 # Timeout handling for long-running tasks
13
+ tqdm>=4.64 # Progress bars for iterative tasks
14
+
15
+ # Hugging Face Support
16
+ transformers>=4.30 # Hugging Face Transformers for SegFormer models
17
+ sentencepiece # Tokenization for Hugging Face models
18
+ huggingface-hub>=0.15 # Model hub integration for Hugging Face
19
+
20
+ # Data Handling
21
+ pandas>=1.3 # Data manipulation and structured data handling
22
+ scipy>=1.7 # Scientific computing for advanced numerical routines
utils/file_utils.py ADDED
@@ -0,0 +1,17 @@
1
+ import os
2
+ import logging
3
+
4
+ logger = logging.getLogger(__name__)
5
+
6
+ def ensure_dir(directory):
7
+ """
8
+ Ensures the given directory exists. Creates it if it does not.
9
+
10
+ Args:
11
+ directory (str): The directory path to check or create.
12
+ """
13
+ if not os.path.exists(directory):
14
+ logger.info(f"Creating directory: {directory}")
15
+ os.makedirs(directory)
16
+ else:
17
+ logger.info(f"Directory already exists: {directory}")
utils/math_utils.py ADDED
@@ -0,0 +1,17 @@
1
+ import numpy as np
2
+ import logging
3
+
4
+ logger = logging.getLogger(__name__)
5
+
6
+ def normalize_array(arr):
7
+ """
8
+ Normalizes a numpy array to the range [0, 1].
9
+
10
+ Args:
11
+ arr (numpy.ndarray): The array to normalize.
12
+
13
+ Returns:
14
+ numpy.ndarray: The normalized array.
15
+ """
16
+ logger.info("Normalizing array to range [0, 1].")
17
+ return (arr - np.min(arr)) / (np.max(arr) - np.min(arr))
utils/model_downloader.py ADDED
@@ -0,0 +1,46 @@
1
+ import os
2
+ import urllib.request
3
+ import logging
4
+
5
+ # Configure Logger
6
+ logger = logging.getLogger(__name__)
7
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
8
+
9
+ # Model URLs for downloading if not present locally
10
+ MODEL_URLS = {
11
+ "dpt_hybrid_384": "https://github.com/isl-org/MiDaS/releases/download/v3/dpt_hybrid_384.pt",
12
+ "midas_v21_small_256": "https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt",
13
+ "yolov5n-seg": "https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5n-seg.pt",
14
+ "yolov5s-seg": "https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5s-seg.pt",
15
+ }
16
+
17
+
18
+ def download_model_if_needed(model_key: str, save_path: str):
19
+ """
20
+ Downloads a model file if it does not already exist.
21
+
22
+ Args:
23
+ model_key (str): The key representing the model in MODEL_URLS.
24
+ save_path (str): The local path where the model should be saved.
25
+
26
+ Raises:
27
+ ValueError: If the model_key does not exist in MODEL_URLS.
28
+ """
29
+ url = MODEL_URLS.get(model_key)
30
+
31
+ if not url:
32
+ logger.error(f"Model key '{model_key}' is not defined in MODEL_URLS.")
33
+ raise ValueError(f"No URL configured for model key: {model_key}")
34
+
35
+ if os.path.exists(save_path):
36
+ logger.info(f"Model '{model_key}' already exists at '{save_path}'. Skipping download.")
37
+ return
38
+
39
+ try:
40
+ os.makedirs(os.path.dirname(save_path), exist_ok=True)
41
+ logger.info(f"Downloading '{model_key}' from '{url}' to '{save_path}'")
42
+ urllib.request.urlretrieve(url, save_path)
43
+ logger.info(f"Successfully downloaded '{model_key}' to '{save_path}'")
44
+ except Exception as e:
45
+ logger.error(f"Failed to download '{model_key}': {e}")
46
+ raise
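A minimal sketch; `midas_v21_small_256` is one of the keys defined in `MODEL_URLS`, and the second call is skipped because the file already exists:

```python
from utils.model_downloader import download_model_if_needed

path = "models/depth/weights/midas_v21_small_256.pt"
download_model_if_needed("midas_v21_small_256", path)  # downloads on first run
download_model_if_needed("midas_v21_small_256", path)  # logs "already exists" and returns
```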
utils/video_utils.py ADDED
@@ -0,0 +1,34 @@
1
+ import cv2
2
+ import os
3
+ import tempfile
4
+ import logging
5
+ from typing import List
6
+
7
+ logging.basicConfig(level=logging.INFO)
8
+ logger = logging.getLogger(__name__)
9
+
10
+ def extract_frames(video_path: str, skip: int = 1) -> List:
11
+ """
12
+ Extract frames from a video.
13
+
14
+ Args:
15
+ video_path (str): Path to the video file.
16
+ skip (int): Number of frames to skip between extractions.
17
+
18
+ Returns:
19
+ List of BGR frames as numpy arrays.
20
+ """
21
+ logger.info(f"Extracting frames from video: {video_path}")
22
+ frames = []
23
+ cap = cv2.VideoCapture(video_path)
24
+ frame_count = 0
25
+ while True:
26
+ ret, frame = cap.read()
27
+ if not ret:
28
+ break
29
+ if frame_count % skip == 0:
30
+ frames.append(frame)
31
+ frame_count += 1
32
+ cap.release()
33
+ logger.info(f"Extracted {len(frames)} frames")
34
+ return frames
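A usage sketch (the input path `clip.mp4` is only an example); frames come back as numpy arrays in OpenCV's BGR channel order:

```python
import cv2
from utils.video_utils import extract_frames

frames = extract_frames("clip.mp4", skip=10)  # keep every 10th frame
print(f"Sampled {len(frames)} frames")
if frames:
    cv2.imwrite("first_sample.png", frames[0])  # BGR is what imwrite expects
```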