Spaces: Running on Zero

Commit (main): Files from private github repo
- .gitattributes +7 -35
- .venv/Scripts/python.exe +3 -0
- .venv/Scripts/pythonw.exe +3 -0
- .venv/pyvenv.cfg +3 -0
- Dockerfile +23 -0
- LICENSE +201 -0
- README.md +94 -14
- app.py +464 -0
- assets/sample_images/Man_in_office.jpg +0 -0
- assets/sample_images/Street_in_Japan.jpg +3 -0
- assets/ui/logo.png +3 -0
- core/describe_scene.py +59 -0
- models/__init__.py +0 -0
- models/depth/depth_estimator.py +85 -0
- models/detection/detector.py +73 -0
- models/segmentation/segmenter.py +89 -0
- registry.py +43 -0
- requirements.txt +22 -0
- utils/file_utils.py +17 -0
- utils/math_utils.py +17 -0
- utils/model_downloader.py +46 -0
- utils/video_utils.py +34 -0
.gitattributes
CHANGED
@@ -1,35 +1,7 @@
(removed lines 1-7: earlier LFS patterns, not recoverable from this render)
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+# Auto detect text files and perform LF normalization
+* text=auto
+*.pt filter=lfs diff=lfs merge=lfs -text
+.venv/Scripts/python.exe filter=lfs diff=lfs merge=lfs -text
+.venv/Scripts/pythonw.exe filter=lfs diff=lfs merge=lfs -text
+assets/sample_images/Street_in_Japan.jpg filter=lfs diff=lfs merge=lfs -text
+assets/ui/logo.png filter=lfs diff=lfs merge=lfs -text
.venv/Scripts/python.exe
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:beefaea165effa6069ba50bdd4d3a5cb7bcd6173629dd879af45985129e9038b
size 242920
.venv/Scripts/pythonw.exe
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8f67a7ae6f44fa2c2892ad83757baaf18b5b3be9f6becac66d6d6fea41c19819
size 232688
.venv/pyvenv.cfg
ADDED
@@ -0,0 +1,3 @@
home = D:\
include-system-site-packages = false
version = 3.10.0
Dockerfile
ADDED
@@ -0,0 +1,23 @@
# Base Python image
FROM python:3.10-slim

# Install OS dependencies
RUN apt-get update && apt-get install -y \
    libgl1-mesa-glx \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

# Copy all files
COPY . .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Expose port (Streamlit default)
EXPOSE 8501

# Run Streamlit app
CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
LICENSE
ADDED
@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
README.md
CHANGED
@@ -1,14 +1,94 @@
(the previous 14-line README is removed; its contents are not visible in this render)

# UVIS - Unified Visual Intelligence System

### A Lightweight Web-Based Visual Perception Demo

> **Try it online**: [uvis.deecoded.io](https://uvis.deecoded.io)
> **GitHub**: [github.com/DurgaDeepakValluri/UVIS](https://github.com/DurgaDeepakValluri/UVIS)

---

## Overview

**UVIS** (Unified Visual Intelligence System) is a **lightweight, web-based visual perception demo**, originally conceptualized as a **spin-off while building Percepta**—a larger modular perception framework.

The goal of UVIS is to make **scene understanding tools more accessible**, allowing anyone to try object detection, semantic segmentation, and depth estimation through a clean web interface, without requiring local setup.

UVIS currently runs on **[Render.com](https://www.render.com)'s Free Tier**, using **lightweight models** to ensure the experience remains stable on limited resources.

---

## Key Features

| Capability | Description |
| ---------------------------- | ----------------------------------------------------------------------------------- |
| 🟢 **Object Detection** | YOLOv5-Nano & YOLOv5-Small for fast, low-resource detection. |
| 🟢 **Semantic Segmentation** | SegFormer-B0 and DeepLabV3-ResNet50 for general-purpose scenes. |
| 🟢 **Depth Estimation** | MiDaS Small & DPT Lite for per-pixel depth estimation. |
| 🖼️ **Scene Blueprint** | Unified overlay combining all selected tasks. |
| 📊 **Scene Metrics** | Scene complexity scoring and agent-friendly summaries. |
| 📦 **Downloadable Results** | JSON, overlay images, and ZIP bundles. |
| 🌐 **Web-First Design** | No installation needed—hosted live at [uvis.deecoded.io](https://uvis.deecoded.io). |
| 🛠️ **Open Source** | Contribution-friendly, easy to extend and improve. |

---

### Current Limitations & Roadmap

UVIS is designed for **lightweight demos** on **free-tier hosting**, which means:

* Models are optimized for speed and minimal compute.
* Only **image input** is supported at this time.

> As the project grows and higher hosting tiers become available, the roadmap includes:
>
> * **Video input support**
> * **Lightweight SLAM**
> * **Natural language scene descriptions**
> * **Higher-capacity, more accurate models**

---

## Architecture Highlights

* **Modular Python Backend with Model Registry**
* **Streamlit-Based Interactive Web UI**
* **HuggingFace Transformers & TorchVision Integration**
* **Lightweight Model Support (Render-Compatible)**
* **Structured JSON Output for AI Agents**
* **Robust Error Handling and Logging**

---

## 🤝 Contributing

UVIS is **open-source** and welcomes contributions.
You can:

* Suggest new features
* Improve the web interface
* Extend perception tasks
* Report issues or bugs

### 💻 **Clone and Run Locally**

```bash
git clone https://github.com/DurgaDeepakValluri/UVIS.git
cd UVIS
pip install -r requirements.txt
```

---

## 🌐 Live Demo

> **Explore it online at [uvis.deecoded.io](https://uvis.deecoded.io)**
> Upload an image, select your tasks, and view the results—all in your browser.

---

## 📝 License

Apache 2.0 License. Free for personal and commercial use with attribution.
© 2025 Durga Deepak Valluri
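For readers skimming the diff, the "Structured JSON Output for AI Agents" mentioned in the new README corresponds to the `scene_description.json` assembled by `core/describe_scene.py` and the telemetry block in `app.py` later in this commit. A minimal sketch of its shape is below; the field names follow those files, while every value shown is a placeholder, not real output.

```python
# Illustrative shape of scene_description.json (placeholder values only).
example_scene_description = {
    "scene_summary": {
        "objects": [
            {"class_name": "person", "confidence": 0.87, "bbox": [34.0, 50.0, 210.0, 480.0]},
        ],
        "detection_metrics": {"objects_detected": 1, "average_confidence": 0.87},
        "segmentation_summary": [{"class_id": 12, "coverage": "41.25%"}],
        "dominant_class": {"class_id": 12, "coverage": "41.25%"},
        "depth_summary": {"mean_depth": 9.4, "min_depth": 0.2, "max_depth": 31.0, "std_depth": 5.1},
    },
    "telemetry": {
        "session_id": "uuid4-string",
        "runtime_sec": 4.21,
        "used_models": {"detection": "YOLOv5-Nano", "segmentation": None, "depth": None},
    },
}
```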
app.py
ADDED
@@ -0,0 +1,464 @@
# UVIS - Gradio App with Upload, URL & Video Support
"""
This script launches the UVIS (Unified Visual Intelligence System) as a Gradio Web App.
Supports image, video, and URL-based media inputs for detection, segmentation, and depth estimation.
Outputs include scene blueprint, structured JSON, and downloadable results.
"""

import gradio as gr
from PIL import Image
import numpy as np
import os
import io
import zipfile
import json
import tempfile
import logging
import cv2
import requests
from urllib.parse import urlparse
from registry import get_model
from core.describe_scene import describe_scene
import uuid
import time
import timeout_decorator
import socket
import ipaddress

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Model mappings
DETECTION_MODEL_MAP = {
    "YOLOv5-Nano": "yolov5n-seg",
    "YOLOv5-Small": "yolov5s-seg",
    "YOLOv8-Small": "yolov8s",
    "YOLOv8-Large": "yolov8l",
    "RT-DETR": "rtdetr"  # For future support
}

SEGMENTATION_MODEL_MAP = {
    "SegFormer-B0": "nvidia/segformer-b0-finetuned-ade-512-512",
    "SegFormer-B5": "nvidia/segformer-b5-finetuned-ade-512-512",
    "DeepLabV3-ResNet50": "deeplabv3_resnet50"
}

DEPTH_MODEL_MAP = {
    "MiDaS v21 Small 256": "midas_v21_small_256",
    "MiDaS v21 384": "midas_v21_384",
    "DPT Hybrid 384": "dpt_hybrid_384",
    "DPT Swin2 Large 384": "dpt_swin2_large_384",
    "DPT Beit Large 512": "dpt_beit_large_512"
}

# Resource Limits
MAX_IMAGE_MB = 5
MAX_IMAGE_RES = (1920, 1080)
MAX_VIDEO_MB = 50
MAX_VIDEO_DURATION = 30  # seconds

# Utility Functions
def format_error(message):
    """Formats error messages for consistent user feedback."""
    return {"error": message}

def toggle_visibility(show, *components):
    """Toggles visibility for multiple Gradio components."""
    return [gr.update(visible=show) for _ in components]

def generate_session_id():
    """Generates a unique session ID for tracking inputs."""
    return str(uuid.uuid4())

def log_runtime(start_time):
    """Logs the runtime of a process."""
    elapsed_time = time.time() - start_time
    logger.info(f"Process completed in {elapsed_time:.2f} seconds.")
    return elapsed_time

def is_public_ip(url):
    """
    Checks whether the resolved IP address of a URL is public (non-local).
    Prevents SSRF by blocking internal addresses like 127.0.0.1 or 192.168.x.x.
    """
    try:
        hostname = urlparse(url).hostname
        ip = socket.gethostbyname(hostname)
        ip_obj = ipaddress.ip_address(ip)
        return ip_obj.is_global  # Only allow globally routable IPs
    except Exception as e:
        logger.warning(f"URL IP validation failed: {e}")
        return False


def fetch_media_from_url(url):
    """
    Downloads media from a URL. Supports images and videos.
    Returns PIL.Image or video file path.
    """
    logger.info(f"Fetching media from URL: {url}")
    if not is_public_ip(url):
        logger.warning("Blocked non-public URL request (possible SSRF).")
        return None

    try:
        parsed_url = urlparse(url)
        ext = os.path.splitext(parsed_url.path)[-1].lower()
        headers = {"User-Agent": "Mozilla/5.0"}
        r = requests.get(url, headers=headers, timeout=10)

        if r.status_code != 200 or len(r.content) > 50 * 1024 * 1024:
            logger.warning(f"Download failed or file too large.")
            return None

        tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=ext)
        tmp_file.write(r.content)
        tmp_file.close()

        if ext in [".jpg", ".jpeg", ".png"]:
            return Image.open(tmp_file.name).convert("RGB")
        elif ext in [".mp4", ".avi", ".mov"]:
            return tmp_file.name
        else:
            logger.warning("Unsupported file type from URL.")
            return None
    except Exception as e:
        logger.error(f"URL fetch failed: {e}")
        return None

# Input Validation Functions
def validate_image(img):
    """
    Validates the uploaded image based on size and resolution limits.

    Args:
        img (PIL.Image.Image): Image to validate.

    Returns:
        Tuple[bool, str or None]: (True, None) if valid; (False, reason) otherwise.
    """
    logger.info("Validating uploaded image.")
    try:
        buffer = io.BytesIO()
        img.save(buffer, format="PNG")
        size_mb = len(buffer.getvalue()) / (1024 * 1024)

        if size_mb > MAX_IMAGE_MB:
            logger.warning("Image exceeds size limit of 5MB.")
            return False, "Image exceeds 5MB limit."

        if img.width > MAX_IMAGE_RES[0] or img.height > MAX_IMAGE_RES[1]:
            logger.warning("Image resolution exceeds 1920x1080.")
            return False, "Image resolution exceeds 1920x1080."

        logger.info("Image validation passed.")
        return True, None
    except Exception as e:
        logger.error(f"Error validating image: {e}")
        return False, str(e)

def validate_video(path):
    """
    Validates the uploaded video based on size and duration limits.

    Args:
        path (str): Path to the video file.

    Returns:
        Tuple[bool, str or None]: (True, None) if valid; (False, reason) otherwise.
    """
    logger.info(f"Validating video file at: {path}")
    try:
        size_mb = os.path.getsize(path) / (1024 * 1024)
        if size_mb > MAX_VIDEO_MB:
            logger.warning("Video exceeds size limit of 50MB.")
            return False, "Video exceeds 50MB limit."

        cap = cv2.VideoCapture(path)
        fps = cap.get(cv2.CAP_PROP_FPS)
        frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
        duration = frames / fps if fps else 0
        cap.release()

        if duration > MAX_VIDEO_DURATION:
            logger.warning("Video exceeds 30 seconds duration limit.")
            return False, "Video exceeds 30 seconds duration limit."

        logger.info("Video validation passed.")
        return True, None
    except Exception as e:
        logger.error(f"Error validating video: {e}")
        return False, str(e)

# Input Resolution
def resolve_input(mode, uploaded_img, uploaded_imgs, uploaded_vid, url):
    """
    Resolves the input source based on user selection.
    Supports single image, multiple images, video, or URL-based media.

    Args:
        mode (str): Input mode - 'Upload' or 'URL'.
        uploaded_img (PIL.Image.Image): Single uploaded image.
        uploaded_imgs (List[PIL.Image.Image]): List of uploaded images (batch).
        uploaded_vid (str): Uploaded video file path.
        url (str): URL pointing to media content.

    Returns:
        List[Union[PIL.Image.Image, str, None]]: A list of media items to process.
    """
    logger.info(f"Resolving input based on mode: {mode}")
    try:
        if mode == "Upload":
            # Prefer batch if provided
            if uploaded_imgs and len(uploaded_imgs) > 0:
                return uploaded_imgs
            elif uploaded_img:
                return [uploaded_img]
            elif uploaded_vid:
                return [uploaded_vid]
            else:
                logger.warning("No valid upload provided.")
                return None

        elif mode == "URL":
            media_from_url = fetch_media_from_url(url)
            if media_from_url:
                return [media_from_url]
            else:
                logger.warning("Failed to fetch valid media from URL.")
                return None

        else:
            logger.warning("Invalid input mode selected.")
            return None

    except Exception as e:
        logger.error(f"Error resolving input: {e}")
        return None

@timeout_decorator.timeout(35, use_signals=False)  # 35 sec limit per image
def process_image(
    image: Image.Image,
    run_det: bool,
    det_model: str,
    det_confidence: float,
    run_seg: bool,
    seg_model: str,
    run_depth: bool,
    depth_model: str,
    blend: float
):
    """
    Runs selected perception tasks on the input image and packages results.

    Args:
        image (PIL.Image): Input image.
        run_det (bool): Run object detection.
        det_model (str): Detection model key.
        det_confidence (float): Detection confidence threshold.
        run_seg (bool): Run segmentation.
        seg_model (str): Segmentation model key.
        run_depth (bool): Run depth estimation.
        depth_model (str): Depth model key.
        blend (float): Overlay blend alpha (0.0 - 1.0).

    Returns:
        Tuple[Image, dict, Tuple[str, bytes]]: Final image, scene JSON, and downloadable ZIP.
    """
    logger.info("Starting image processing pipeline.")
    start_time = time.time()
    outputs, scene = {}, {}
    combined_np = np.array(image)

    try:
        # Detection
        if run_det:
            logger.info(f"Running detection with model: {det_model}")
            load_start = time.time()
            model = get_model("detection", DETECTION_MODEL_MAP[det_model], device="cpu")
            logger.info(f"{det_model} detection model loaded in {time.time() - load_start:.2f} seconds.")
            boxes = model.predict(image, conf_threshold=det_confidence)
            overlay = model.draw(image, boxes)
            combined_np = np.array(overlay)
            buf = io.BytesIO()
            overlay.save(buf, format="PNG")
            outputs["detection.png"] = buf.getvalue()
            scene["detection"] = boxes

        # Segmentation
        if run_seg:
            logger.info(f"Running segmentation with model: {seg_model}")
            load_start = time.time()
            model = get_model("segmentation", SEGMENTATION_MODEL_MAP[seg_model], device="cpu")
            logger.info(f"{seg_model} segmentation model loaded in {time.time() - load_start:.2f} seconds.")
            mask = model.predict(image)
            overlay = model.draw(image, mask, alpha=blend)
            combined_np = cv2.addWeighted(combined_np, 1 - blend, np.array(overlay), blend, 0)
            buf = io.BytesIO()
            overlay.save(buf, format="PNG")
            outputs["segmentation.png"] = buf.getvalue()
            scene["segmentation"] = mask.tolist()

        # Depth Estimation
        if run_depth:
            logger.info(f"Running depth estimation with model: {depth_model}")
            load_start = time.time()
            model = get_model("depth", DEPTH_MODEL_MAP[depth_model], device="cpu")
            logger.info(f"{depth_model} depth model loaded in {time.time() - load_start:.2f} seconds.")
            dmap = model.predict(image)
            norm_dmap = ((dmap - dmap.min()) / (dmap.ptp()) * 255).astype(np.uint8)
            d_pil = Image.fromarray(norm_dmap)
            combined_np = cv2.addWeighted(combined_np, 1 - blend, np.array(d_pil.convert("RGB")), blend, 0)
            buf = io.BytesIO()
            d_pil.save(buf, format="PNG")
            outputs["depth_map.png"] = buf.getvalue()
            scene["depth"] = dmap.tolist()

        # Final image overlay
        final_img = Image.fromarray(combined_np)
        buf = io.BytesIO()
        final_img.save(buf, format="PNG")
        outputs["scene_blueprint.png"] = buf.getvalue()

        # Scene description
        try:
            scene_json = describe_scene(**scene)
        except Exception as e:
            logger.warning(f"describe_scene failed: {e}")
            scene_json = {"error": str(e)}
        telemetry = {
            "session_id": generate_session_id(),
            "runtime_sec": round(log_runtime(start_time), 2),
            "used_models": {
                "detection": det_model if run_det else None,
                "segmentation": seg_model if run_seg else None,
                "depth": depth_model if run_depth else None
            }
        }
        scene_json["telemetry"] = telemetry

        outputs["scene_description.json"] = json.dumps(scene_json, indent=2).encode("utf-8")

        # ZIP file creation
        zip_buf = io.BytesIO()
        with zipfile.ZipFile(zip_buf, "w") as zipf:
            for name, data in outputs.items():
                zipf.writestr(name, data)

        elapsed = log_runtime(start_time)
        logger.info(f"Image processing completed in {elapsed:.2f} seconds.")

        return final_img, scene_json, ("uvis_results.zip", zip_buf.getvalue())

    except Exception as e:
        logger.error(f"Error in processing pipeline: {e}")
        return None, {"error": str(e)}, None

# Main Handler
def handle(mode, img, imgs, vid, url, run_det, det_model, det_confidence, run_seg, seg_model, run_depth, depth_model, blend):
    """
    Master handler for resolving input and processing.
    Returns outputs for Gradio interface.
    """
    session_id = generate_session_id()
    logger.info(f"Session ID: {session_id} | Handler activated with mode: {mode}")
    start_time = time.time()

    media = resolve_input(mode, img, imgs, vid, url)
    if not media:
        return None, format_error("No valid input provided. Please check your upload or URL."), None

    results = []
    for single_media in media:
        if isinstance(single_media, str):  # Video file
            valid, err = validate_video(single_media)
            if not valid:
                return None, format_error(err), None
            cap = cv2.VideoCapture(single_media)
            ret, frame = cap.read()
            cap.release()
            if not ret:
                return None, format_error("Failed to read video frame."), None
            single_media = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        if isinstance(single_media, Image.Image):
            valid, err = validate_image(single_media)
            if not valid:
                return None, format_error(err), None
            try:
                return process_image(single_media, run_det, det_model, det_confidence, run_seg, seg_model, run_depth, depth_model, blend)
            except timeout_decorator.timeout_decorator.TimeoutError:
                logger.error("Image processing timed out.")
                return None, format_error("Processing timed out. Try a smaller image or simpler model."), None

    logger.warning("Unsupported media type resolved.")
    log_runtime(start_time)
    return None, format_error("Invalid input. Please check your upload or URL."), None

# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("## Unified Visual Intelligence System (UVIS)")

    # Input Mode Selection
    mode = gr.Radio(["Upload", "URL"], value="Upload", label="Input Mode")
    img = gr.Image(type="pil", label="Upload Image")
    imgs = gr.Gallery(label="Upload Multiple Images (Up to 5)").style(grid=[5], height="auto")
    vid = gr.Video(label="Upload Video (<= 30s)")
    url = gr.Textbox(label="URL (Image/Video)")

    # Task Selection with parameters
    with gr.Accordion("Object Detection Settings", open=False):
        run_det = gr.Checkbox(label="Enable Object Detection")
        det_model = gr.Dropdown(list(DETECTION_MODEL_MAP), label="Detection Model", visible=False)
        det_confidence = gr.Slider(0.1, 1.0, 0.5, label="Detection Confidence Threshold", visible=False)

    with gr.Accordion("Semantic Segmentation Settings", open=False):
        run_seg = gr.Checkbox(label="Enable Segmentation")
        seg_model = gr.Dropdown(list(SEGMENTATION_MODEL_MAP), label="Segmentation Model", visible=False)

    with gr.Accordion("Depth Estimation Settings", open=False):
        run_depth = gr.Checkbox(label="Enable Depth Estimation")
        depth_model = gr.Dropdown(list(DEPTH_MODEL_MAP), label="Depth Model", visible=False)

    blend = gr.Slider(0.0, 1.0, 0.5, label="Overlay Blend")

    # Run Button
    run = gr.Button("Run Analysis")

    # Output Tabs
    with gr.Tab("Scene JSON"):
        json_out = gr.JSON()
    with gr.Tab("Scene Blueprint"):
        img_out = gr.Image()
    with gr.Tab("Download"):
        zip_out = gr.File()

    # Attach Visibility Logic
    run_det.change(toggle_visibility, run_det, [det_model, det_confidence])
    run_seg.change(toggle_visibility, run_seg, [seg_model])
    run_depth.change(toggle_visibility, run_depth, [depth_model])

    # Button Click Event
    run.click(
        handle,
        inputs=[mode, img, imgs, vid, url, run_det, det_model, det_confidence, run_seg, seg_model, run_depth, depth_model, blend],
        outputs=[img_out, json_out, zip_out]
    )

    # Footer Section
    gr.Markdown("---")
    gr.Markdown(
        """
        <div style='text-align: center; font-size: 14px;'>
        Built by <b>Durga Deepak Valluri</b><br>
        <a href="https://github.com/DurgaDeepakValluri/UVIS" target="_blank">GitHub</a> |
        <a href="https://deecoded.io" target="_blank">Website</a> |
        <a href="https://www.linkedin.com/in/durga-deepak-valluri" target="_blank">LinkedIn</a>
        </div>
        """,
        unsafe_allow_html=True
    )

# Launch the Gradio App
demo.launch()
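Two notes on this entry point. First, although the Dockerfile's CMD and the README's architecture list mention Streamlit, `app.py` builds and launches a Gradio Blocks app. Second, `toggle_visibility(show, *components)` derives its updates from `*components`, but the `.change(...)` wiring passes only the checkbox value, so the function receives no components and returns an empty list where Gradio expects one update per output. The sketch below is one possible variant of that intent, not the committed code.

```python
import gradio as gr

def toggle_visibility_fixed(show: bool, n_outputs: int = 1):
    """Return one visibility update per wired output component."""
    updates = [gr.update(visible=show) for _ in range(n_outputs)]
    return updates if n_outputs > 1 else updates[0]

# Hypothetical rewiring (two outputs for detection, one each for the others):
# run_det.change(lambda s: toggle_visibility_fixed(s, 2), run_det, [det_model, det_confidence])
# run_seg.change(lambda s: toggle_visibility_fixed(s, 1), run_seg, [seg_model])
# run_depth.change(lambda s: toggle_visibility_fixed(s, 1), run_depth, [depth_model])
```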
assets/sample_images/Man_in_office.jpg
ADDED
(binary image; preview omitted)
assets/sample_images/Street_in_Japan.jpg
ADDED
(binary image stored via Git LFS; preview omitted)
assets/ui/logo.png
ADDED
(binary image stored via Git LFS; preview omitted)
core/describe_scene.py
ADDED
@@ -0,0 +1,59 @@
import numpy as np
import logging

logger = logging.getLogger(__name__)

def describe_scene(detection=None, segmentation=None, depth=None):
    """
    Generates a structured scene summary with metrics for detection, segmentation, and depth.

    Args:
        detection (list): List of detected objects with class names and bounding boxes.
        segmentation (numpy.ndarray): Segmentation mask as a 2D numpy array.
        depth (numpy.ndarray): Depth map as a 2D numpy array.

    Returns:
        dict: Structured scene description with metrics.
    """
    logger.info("Generating scene summary...")
    description = {"scene_summary": {}}

    # Detection Summary with Metrics
    if detection:
        logger.info("Adding detection results to scene summary.")
        description["scene_summary"]["objects"] = detection
        confidences = [obj.get("confidence", 0) for obj in detection]
        description["scene_summary"]["detection_metrics"] = {
            "objects_detected": len(detection),
            "average_confidence": float(np.mean(confidences)) if confidences else 0.0
        }

    # Segmentation Summary with Coverage Metrics
    if segmentation is not None:
        logger.info("Summarizing segmentation coverage.")
        unique, counts = np.unique(segmentation, return_counts=True)
        total = segmentation.size
        coverage = [
            {"class_id": int(class_id), "coverage": f"{(count / total) * 100:.2f}%"}
            for class_id, count in zip(unique, counts)
        ]
        dominant_class = max(coverage, key=lambda x: float(x["coverage"].strip('%')))
        description["scene_summary"]["segmentation_summary"] = coverage
        description["scene_summary"]["dominant_class"] = dominant_class

    # Depth Summary with Metrics
    if depth is not None:
        logger.info("Summarizing depth information.")
        mean_depth = float(np.mean(depth))
        min_depth = float(np.min(depth))
        max_depth = float(np.max(depth))
        std_depth = float(np.std(depth))
        description["scene_summary"]["depth_summary"] = {
            "mean_depth": mean_depth,
            "min_depth": min_depth,
            "max_depth": max_depth,
            "std_depth": std_depth
        }

    logger.info("Scene summary generation complete.")
    return description
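A small usage sketch for `describe_scene`, passing numpy arrays as the docstring specifies. (Note that `app.py` hands in `mask.tolist()` / `dmap.tolist()`, on which `segmentation.size` would fail, so plain lists are avoided here.) The toy values below are illustrative only.

```python
import numpy as np
from core.describe_scene import describe_scene

detections = [
    {"class_name": "person", "confidence": 0.91, "bbox": [10, 20, 120, 300]},
    {"class_name": "chair", "confidence": 0.64, "bbox": [200, 150, 320, 340]},
]
seg_mask = np.zeros((4, 4), dtype=np.int64)
seg_mask[:2, :] = 12                      # pretend class 12 covers the top half
depth_map = np.random.rand(4, 4) * 10.0   # toy depth values

summary = describe_scene(detection=detections, segmentation=seg_mask, depth=depth_map)
print(summary["scene_summary"]["detection_metrics"])
# ~ {'objects_detected': 2, 'average_confidence': 0.775}
```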
models/__init__.py
ADDED
File without changes
models/depth/depth_estimator.py
ADDED
@@ -0,0 +1,85 @@
import os
import torch
import numpy as np
from PIL import Image
import logging
from utils.model_downloader import download_model_if_needed

# Configure Logger
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")


class DepthEstimator:
    """
    Generalized Depth Estimation Model Wrapper for MiDaS and DPT models.
    Supports: MiDaS v2.1 Small, MiDaS v2.1 Large, DPT Hybrid, DPT Large.
    """

    def __init__(self, model_key="midas_v21_small_256", weights_dir="models/depth/weights", device="cpu"):
        """
        Initialize the Depth Estimation model.

        Args:
            model_key (str): Model identifier as defined in model_downloader.py.
            weights_dir (str): Directory to store/download model weights.
            device (str): Inference device ("cpu" or "cuda").
        """
        weights_path = os.path.join(weights_dir, f"{model_key}.pt")
        download_model_if_needed(model_key, weights_path)

        logger.info(f"Loading Depth model '{model_key}' from MiDaS hub")
        self.device = device
        self.model_type = self._resolve_model_type(model_key)
        self.midas = torch.hub.load("intel-isl/MiDaS", self.model_type).to(self.device).eval()
        self.transform = self._resolve_transform()

    def _resolve_model_type(self, model_key):
        """
        Maps model_key to MiDaS hub model type.
        """
        mapping = {
            "midas_v21_small_256": "MiDaS_small",
            "midas_v21_384": "MiDaS",
            "dpt_hybrid_384": "DPT_Hybrid",
            "dpt_large_384": "DPT_Large",
            "dpt_swin2_large_384": "DPT_Large",  # fallback to DPT_Large if not explicitly supported
            "dpt_beit_large_512": "DPT_Large",   # fallback to DPT_Large if not explicitly supported
        }
        return mapping.get(model_key, "MiDaS_small")

    def _resolve_transform(self):
        """
        Returns the correct transformation pipeline based on model type.
        """
        transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
        if self.model_type == "MiDaS_small":
            return transforms.small_transform
        else:
            return transforms.default_transform

    def predict(self, image: Image.Image):
        """
        Generates a depth map for the given image.

        Args:
            image (PIL.Image.Image): Input image.

        Returns:
            np.ndarray: Depth map as a 2D numpy array.
        """
        logger.info("Running depth estimation")
        input_tensor = self.transform(image).to(self.device)

        with torch.no_grad():
            prediction = self.midas(input_tensor)
            prediction = torch.nn.functional.interpolate(
                prediction.unsqueeze(1),
                size=image.size[::-1],
                mode="bicubic",
                align_corners=False,
            ).squeeze()

        depth_map = prediction.cpu().numpy()
        logger.info("Depth estimation completed successfully")
        return depth_map
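A usage sketch for this wrapper. One caveat, stated as an assumption rather than committed behavior: the transforms returned by `torch.hub.load("intel-isl/MiDaS", "transforms")` are generally written for numpy RGB arrays, while `predict()` passes the PIL image straight through, so the adapter below mirrors `predict()` but feeds the transform `np.array(image)`.

```python
import numpy as np
import torch
from PIL import Image
from models.depth.depth_estimator import DepthEstimator

estimator = DepthEstimator(model_key="midas_v21_small_256", device="cpu")
image = Image.open("assets/sample_images/Street_in_Japan.jpg").convert("RGB")

# Adapter: give the MiDaS transform a numpy array, then mirror predict()'s interpolation.
input_tensor = estimator.transform(np.array(image)).to(estimator.device)
with torch.no_grad():
    pred = estimator.midas(input_tensor)
    pred = torch.nn.functional.interpolate(
        pred.unsqueeze(1), size=image.size[::-1], mode="bicubic", align_corners=False
    ).squeeze()
depth_map = pred.cpu().numpy()
print(depth_map.shape)  # (height, width)
```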
models/detection/detector.py
ADDED
@@ -0,0 +1,73 @@
import os
import numpy as np
from PIL import Image, ImageDraw
import logging
from ultralytics import YOLO
from utils.model_downloader import download_model_if_needed

logger = logging.getLogger(__name__)

class ObjectDetector:
    """
    Generalized Object Detection Wrapper for YOLOv5, YOLOv8, and future variants.
    """

    def __init__(self, model_key="yolov5n-seg", weights_dir="models/detection/weights", device="cpu"):
        """
        Initialize the Object Detection model.

        Args:
            model_key (str): Model identifier as defined in model_downloader.py.
            weights_dir (str): Directory to store/download model weights.
            device (str): Inference device ("cpu" or "cuda").
        """
        weights_path = os.path.join(weights_dir, f"{model_key}.pt")
        download_model_if_needed(model_key, weights_path)

        logger.info(f"Loading Object Detection model '{model_key}' from {weights_path}")
        self.device = device
        self.model = YOLO(weights_path)

    def predict(self, image: Image.Image):
        """
        Run object detection.

        Args:
            image (PIL.Image.Image): Input image.

        Returns:
            List[Dict]: List of detected objects with class name, confidence, and bbox.
        """
        logger.info("Running object detection")
        results = self.model(image)
        detections = []
        for r in results:
            for box in r.boxes:
                detections.append({
                    "class_name": r.names[int(box.cls)],
                    "confidence": float(box.conf),
                    "bbox": box.xyxy[0].tolist()
                })
        logger.info(f"Detected {len(detections)} objects")
        return detections

    def draw(self, image: Image.Image, detections, alpha=0.5):
        """
        Draw bounding boxes on image.

        Args:
            image (PIL.Image.Image): Input image.
            detections (List[Dict]): Detection results.
            alpha (float): Blend strength.

        Returns:
            PIL.Image.Image: Image with bounding boxes drawn.
        """
        overlay = image.copy()
        draw = ImageDraw.Draw(overlay)
        for det in detections:
            bbox = det["bbox"]
            label = f'{det["class_name"]} {det["confidence"]:.2f}'
            draw.rectangle(bbox, outline="red", width=2)
            draw.text((bbox[0], bbox[1]), label, fill="red")
        return Image.blend(image, overlay, alpha)
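A standalone usage sketch for `ObjectDetector`. Note that `app.py` calls `model.predict(image, conf_threshold=det_confidence)` while `predict()` here takes only the image, so this sketch applies the confidence threshold by filtering afterwards (an assumption about intent, not the committed call path).

```python
from PIL import Image
from models.detection.detector import ObjectDetector

detector = ObjectDetector(model_key="yolov5n-seg", device="cpu")  # weights downloaded on first run
image = Image.open("assets/sample_images/Man_in_office.jpg").convert("RGB")

detections = detector.predict(image)
confident = [d for d in detections if d["confidence"] >= 0.5]  # manual threshold
annotated = detector.draw(image, confident, alpha=0.7)
annotated.save("detection_preview.png")
```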
models/segmentation/segmenter.py
ADDED
@@ -0,0 +1,89 @@
import logging
import torch
from PIL import Image
import numpy as np
from torchvision import transforms
from torchvision.models.segmentation import deeplabv3_resnet50
from transformers import SegformerForSemanticSegmentation, SegformerFeatureExtractor

logger = logging.getLogger(__name__)

class Segmenter:
    """
    Generalized Semantic Segmentation Wrapper for SegFormer and DeepLabV3.
    """

    def __init__(self, model_key="nvidia/segformer-b0-finetuned-ade-512-512", device="cpu"):
        """
        Initialize the segmentation model.

        Args:
            model_key (str): Model identifier, e.g., Hugging Face model id or 'deeplabv3_resnet50'.
            device (str): Inference device ("cpu" or "cuda").
        """
        logger.info(f"Initializing segmenter with model: {model_key}")
        self.device = device
        self.model_key = model_key
        self.model, self.processor = self._load_model()

    def _load_model(self):
        """
        Load the segmentation model and processor.

        Returns:
            Tuple[torch.nn.Module, Optional[Processor]]
        """
        if "segformer" in self.model_key:
            model = SegformerForSemanticSegmentation.from_pretrained(self.model_key).to(self.device)
            processor = SegformerFeatureExtractor.from_pretrained(self.model_key)
            return model, processor
        elif self.model_key == "deeplabv3_resnet50":
            model = deeplabv3_resnet50(pretrained=True).to(self.device).eval()
            return model, None
        else:
            raise ValueError(f"Unsupported model key: {self.model_key}")

    def predict(self, image: Image.Image):
        """
        Perform segmentation on the input image.

        Args:
            image (PIL.Image.Image): Input image.

        Returns:
            np.ndarray: Segmentation mask.
        """
        logger.info("Running segmentation")

        if "segformer" in self.model_key:
            inputs = self.processor(images=image, return_tensors="pt").to(self.device)
            outputs = self.model(**inputs)
            mask = outputs.logits.argmax(dim=1).squeeze().cpu().numpy()
            return mask

        elif self.model_key == "deeplabv3_resnet50":
            transform = transforms.Compose([
                transforms.ToTensor(),
            ])
            inputs = transform(image).unsqueeze(0).to(self.device)
            with torch.no_grad():
                outputs = self.model(inputs)["out"]
            mask = outputs.argmax(1).squeeze().cpu().numpy()
            return mask

    def draw(self, image: Image.Image, mask: np.ndarray, alpha=0.5):
        """
        Overlay the segmentation mask on the input image.

        Args:
            image (PIL.Image.Image): Original image.
            mask (np.ndarray): Segmentation mask.
            alpha (float): Blend strength.

        Returns:
            PIL.Image.Image: Image with mask overlay.
        """
        logger.info("Drawing segmentation overlay")
        mask_img = Image.fromarray((mask * 255 / mask.max()).astype(np.uint8)).convert("L").resize(image.size)
        mask_colored = Image.merge("RGB", (mask_img, mask_img, mask_img))
        return Image.blend(image, mask_colored, alpha)
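A usage sketch for `Segmenter`, assuming the SegFormer-B0 checkpoint can be pulled from the Hugging Face Hub. One caveat visible above: `draw()` scales by `mask.max()`, which produces invalid values when the mask is entirely class 0, so the sketch guards that case.

```python
from PIL import Image
from models.segmentation.segmenter import Segmenter

segmenter = Segmenter(model_key="nvidia/segformer-b0-finetuned-ade-512-512", device="cpu")
image = Image.open("assets/sample_images/Street_in_Japan.jpg").convert("RGB")

mask = segmenter.predict(image)
print("classes present:", sorted(set(mask.flatten().tolist())))

if mask.max() > 0:  # draw() divides by mask.max()
    overlay = segmenter.draw(image, mask, alpha=0.5)
    overlay.save("segmentation_preview.png")
```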
registry.py
ADDED
@@ -0,0 +1,43 @@
import logging

# Configure Logger
logger = logging.getLogger(__name__)

def get_model(task: str, model_key: str, device="cpu"):
    """
    Dynamically retrieves the model instance based on the task and model_key.

    Args:
        task (str): One of "detection", "segmentation", or "depth".
        model_key (str): Model identifier or variant.
        device (str): Device to run inference on ("cpu" or "cuda").

    Returns:
        object: Initialized model ready for inference.

    Raises:
        ValueError: If task is unsupported or model loading fails.
    """
    logger.info(f"Request received to load model '{model_key}' for task '{task}' on device '{device}'")

    try:
        if task == "detection":
            from models.detection.detector import ObjectDetector
            return ObjectDetector(model_key=model_key, device=device)

        elif task == "segmentation":
            from models.segmentation.segmenter import Segmenter
            return Segmenter(model_key=model_key, device=device)

        elif task == "depth":
            from models.depth.depth_estimator import DepthEstimator
            return DepthEstimator(model_key=model_key, device=device)

        else:
            error_msg = f"Unsupported task '{task}'. Valid options are: 'detection', 'segmentation', 'depth'."
            logger.error(error_msg)
            raise ValueError(error_msg)

    except Exception as e:
        logger.error(f"Error while loading model '{model_key}' for task '{task}': {e}")
        raise
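The registry is the single entry point `app.py` goes through; a minimal sketch of picking models by task, assuming the corresponding dependencies and weights are available locally:

```python
from PIL import Image
from registry import get_model

image = Image.open("assets/sample_images/Man_in_office.jpg").convert("RGB")

# Keys mirror the *_MODEL_MAP values wired up in app.py.
detector = get_model("detection", "yolov5n-seg", device="cpu")
boxes = detector.predict(image)

depth_estimator = get_model("depth", "midas_v21_small_256", device="cpu")

try:
    get_model("pose", "anything")  # unsupported task name
except ValueError as err:
    print(err)
```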
requirements.txt
ADDED
@@ -0,0 +1,22 @@
# Core Libraries
gradio>=3.50              # Web interface for the application
torch>=2.0                # PyTorch for deep learning models
torchvision>=0.15         # TorchVision for pre-trained models and utilities
ultralytics>=8.0          # YOLO models for object detection
opencv-python>=4.7        # OpenCV for video and image processing

# Utility Libraries
numpy>=1.21               # Numerical computations
Pillow>=9.0               # Image processing
requests>=2.28            # HTTP requests for fetching media
timeout-decorator>=0.5.0  # Timeout handling for long-running tasks
tqdm>=4.64                # Progress bars for iterative tasks

# Hugging Face Support
transformers>=4.30        # Hugging Face Transformers for SegFormer models
sentencepiece             # Tokenization for Hugging Face models
huggingface-hub>=0.15     # Model hub integration for Hugging Face

# Data Handling
pandas>=1.3               # Data manipulation and structured data handling
scipy>=1.7                # Scientific computing for advanced numerical
utils/file_utils.py
ADDED
@@ -0,0 +1,17 @@
import os
import logging

logger = logging.getLogger(__name__)

def ensure_dir(directory):
    """
    Ensures the given directory exists. Creates it if it does not.

    Args:
        directory (str): The directory path to check or create.
    """
    if not os.path.exists(directory):
        logger.info(f"Creating directory: {directory}")
        os.makedirs(directory)
    else:
        logger.info(f"Directory already exists: {directory}")
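A one-line usage sketch, matching the weights directory convention used by the model wrappers:

```python
from utils.file_utils import ensure_dir

ensure_dir("models/detection/weights")  # created if missing, logged either way
```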
utils/math_utils.py
ADDED
@@ -0,0 +1,17 @@
import numpy as np
import logging

logger = logging.getLogger(__name__)

def normalize_array(arr):
    """
    Normalizes a numpy array to the range [0, 1].

    Args:
        arr (numpy.ndarray): The array to normalize.

    Returns:
        numpy.ndarray: The normalized array.
    """
    logger.info("Normalizing array to range [0, 1].")
    return (arr - np.min(arr)) / (np.max(arr) - np.min(arr))
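`normalize_array` divides by `max - min`, so a constant array triggers a division by zero. The guarded variant below is a sketch of one possible hardening, not the committed helper.

```python
import numpy as np

def normalize_array_safe(arr: np.ndarray) -> np.ndarray:
    """Normalize to [0, 1], returning zeros when the array is constant."""
    span = np.max(arr) - np.min(arr)
    if span == 0:
        return np.zeros_like(arr, dtype=float)
    return (arr - np.min(arr)) / span

print(normalize_array_safe(np.array([2.0, 2.0, 2.0])))   # [0. 0. 0.]
print(normalize_array_safe(np.array([0.0, 5.0, 10.0])))  # [0.  0.5 1. ]
```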
utils/model_downloader.py
ADDED
@@ -0,0 +1,46 @@
import os
import urllib.request
import logging

# Configure Logger
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Model URLs for downloading if not present locally
MODEL_URLS = {
    "dpt_hybrid_384": "https://github.com/isl-org/MiDaS/releases/download/v3/dpt_hybrid_384.pt",
    "midas_v21_small_256": "https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt",
    "yolov5n-seg": "https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5n-seg.pt",
    "yolov5s-seg": "https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5s-seg.pt",
}


def download_model_if_needed(model_key: str, save_path: str):
    """
    Downloads a model file if it does not already exist.

    Args:
        model_key (str): The key representing the model in MODEL_URLS.
        save_path (str): The local path where the model should be saved.

    Raises:
        ValueError: If the model_key does not exist in MODEL_URLS.
    """
    url = MODEL_URLS.get(model_key)

    if not url:
        logger.error(f"Model key '{model_key}' is not defined in MODEL_URLS.")
        raise ValueError(f"No URL configured for model key: {model_key}")

    if os.path.exists(save_path):
        logger.info(f"Model '{model_key}' already exists at '{save_path}'. Skipping download.")
        return

    try:
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        logger.info(f"Downloading '{model_key}' from '{url}' to '{save_path}'")
        urllib.request.urlretrieve(url, save_path)
        logger.info(f"Successfully downloaded '{model_key}' to '{save_path}'")
    except Exception as e:
        logger.error(f"Failed to download '{model_key}': {e}")
        raise
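A usage sketch mirroring how the detector and depth wrappers call this helper; the destination path follows the `models/<task>/weights/<key>.pt` convention used elsewhere in the repo.

```python
from utils.model_downloader import download_model_if_needed

# Fetch YOLOv5-Nano segmentation weights into the path ObjectDetector expects.
download_model_if_needed("yolov5n-seg", "models/detection/weights/yolov5n-seg.pt")

# An unknown key raises ValueError rather than guessing a URL.
try:
    download_model_if_needed("yolov9e", "models/detection/weights/yolov9e.pt")
except ValueError as err:
    print(err)
```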
utils/video_utils.py
ADDED
@@ -0,0 +1,34 @@
import cv2
import os
import tempfile
import logging
from typing import List

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def extract_frames(video_path: str, skip: int = 1) -> List:
    """
    Extract frames from a video.

    Args:
        video_path (str): Path to the video file.
        skip (int): Number of frames to skip between extractions.

    Returns:
        List of BGR frames as numpy arrays.
    """
    logger.info(f"Extracting frames from video: {video_path}")
    frames = []
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % skip == 0:
            frames.append(frame)
        frame_count += 1
    cap.release()
    logger.info(f"Extracted {len(frames)} frames")
    return frames
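A usage sketch for `extract_frames`; the video path is hypothetical, and `skip=10` keeps roughly every tenth frame.

```python
import cv2
from PIL import Image
from utils.video_utils import extract_frames

frames = extract_frames("samples/clip.mp4", skip=10)  # hypothetical local video
print(f"{len(frames)} frames extracted")

if frames:
    # Frames come back in OpenCV's BGR order; convert before handing to PIL-based models.
    first = Image.fromarray(cv2.cvtColor(frames[0], cv2.COLOR_BGR2RGB))
    first.save("first_frame.png")
```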