DurgaDeepak committed (verified)
Commit: f289d8a
Parent: 6f1fa06

Update app.py
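The only functional change is in the Gradio UI: the upload Gallery no longer chains the deprecated `.style(grid=[5], height="auto")` call, presumably because Gradio 4.x removed the `.style()` method from components (calling it there raises AttributeError). Note the commit drops the layout hints rather than migrating them. A minimal sketch of a migrated version, assuming a current Gradio where Gallery layout moved into constructor arguments, would be:

import gradio as gr

# Hypothetical migration (not what this commit does): Gradio 4.x moved
# the old .style() options into constructor kwargs on the component.
imgs = gr.Gallery(
    label="Upload Multiple Images (Up to 5)",
    columns=5,       # replaces .style(grid=[5])
    height="auto",   # replaces .style(height="auto")
)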

Files changed (1): app.py (+463, -463)
app.py CHANGED
@@ -1,464 +1,464 @@
 # UVIS - Gradio App with Upload, URL & Video Support
 """
 This script launches the UVIS (Unified Visual Intelligence System) as a Gradio Web App.
 Supports image, video, and URL-based media inputs for detection, segmentation, and depth estimation.
 Outputs include scene blueprint, structured JSON, and downloadable results.
 """
 
 import gradio as gr
 from PIL import Image
 import numpy as np
 import os
 import io
 import zipfile
 import json
 import tempfile
 import logging
 import cv2
 import requests
 from urllib.parse import urlparse
 from registry import get_model
 from core.describe_scene import describe_scene
 import uuid
 import time
 import timeout_decorator
 import socket
 import ipaddress
 
 # Setup logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
 
 # Model mappings
 DETECTION_MODEL_MAP = {
     "YOLOv5-Nano": "yolov5n-seg",
     "YOLOv5-Small": "yolov5s-seg",
     "YOLOv8-Small": "yolov8s",
     "YOLOv8-Large": "yolov8l",
     "RT-DETR": "rtdetr" # For future support
 }
 
 SEGMENTATION_MODEL_MAP = {
     "SegFormer-B0": "nvidia/segformer-b0-finetuned-ade-512-512",
     "SegFormer-B5": "nvidia/segformer-b5-finetuned-ade-512-512",
     "DeepLabV3-ResNet50": "deeplabv3_resnet50"
 }
 
 DEPTH_MODEL_MAP = {
     "MiDaS v21 Small 256": "midas_v21_small_256",
     "MiDaS v21 384": "midas_v21_384",
     "DPT Hybrid 384": "dpt_hybrid_384",
     "DPT Swin2 Large 384": "dpt_swin2_large_384",
     "DPT Beit Large 512": "dpt_beit_large_512"
 }
 
 # Resource Limits
 MAX_IMAGE_MB = 5
 MAX_IMAGE_RES = (1920, 1080)
 MAX_VIDEO_MB = 50
 MAX_VIDEO_DURATION = 30 # seconds
 
 # Utility Functions
 def format_error(message):
     """Formats error messages for consistent user feedback."""
     return {"error": message}
 
 def toggle_visibility(show, *components):
     """Toggles visibility for multiple Gradio components."""
     return [gr.update(visible=show) for _ in components]
 
 def generate_session_id():
     """Generates a unique session ID for tracking inputs."""
     return str(uuid.uuid4())
 
 def log_runtime(start_time):
     """Logs the runtime of a process."""
     elapsed_time = time.time() - start_time
     logger.info(f"Process completed in {elapsed_time:.2f} seconds.")
     return elapsed_time
 
 def is_public_ip(url):
     """
     Checks whether the resolved IP address of a URL is public (non-local).
     Prevents SSRF by blocking internal addresses like 127.0.0.1 or 192.168.x.x.
     """
     try:
         hostname = urlparse(url).hostname
         ip = socket.gethostbyname(hostname)
         ip_obj = ipaddress.ip_address(ip)
         return ip_obj.is_global # Only allow globally routable IPs
     except Exception as e:
         logger.warning(f"URL IP validation failed: {e}")
         return False
 
 
 def fetch_media_from_url(url):
     """
     Downloads media from a URL. Supports images and videos.
     Returns PIL.Image or video file path.
     """
     logger.info(f"Fetching media from URL: {url}")
     if not is_public_ip(url):
         logger.warning("Blocked non-public URL request (possible SSRF).")
         return None
 
     try:
         parsed_url = urlparse(url)
         ext = os.path.splitext(parsed_url.path)[-1].lower()
         headers = {"User-Agent": "Mozilla/5.0"}
         r = requests.get(url, headers=headers, timeout=10)
 
         if r.status_code != 200 or len(r.content) > 50 * 1024 * 1024:
             logger.warning(f"Download failed or file too large.")
             return None
 
         tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=ext)
         tmp_file.write(r.content)
         tmp_file.close()
 
         if ext in [".jpg", ".jpeg", ".png"]:
             return Image.open(tmp_file.name).convert("RGB")
         elif ext in [".mp4", ".avi", ".mov"]:
             return tmp_file.name
         else:
             logger.warning("Unsupported file type from URL.")
             return None
     except Exception as e:
         logger.error(f"URL fetch failed: {e}")
         return None
 
 # Input Validation Functions
 def validate_image(img):
     """
     Validates the uploaded image based on size and resolution limits.
 
     Args:
         img (PIL.Image.Image): Image to validate.
 
     Returns:
         Tuple[bool, str or None]: (True, None) if valid; (False, reason) otherwise.
     """
     logger.info("Validating uploaded image.")
     try:
         buffer = io.BytesIO()
         img.save(buffer, format="PNG")
         size_mb = len(buffer.getvalue()) / (1024 * 1024)
 
         if size_mb > MAX_IMAGE_MB:
             logger.warning("Image exceeds size limit of 5MB.")
             return False, "Image exceeds 5MB limit."
 
         if img.width > MAX_IMAGE_RES[0] or img.height > MAX_IMAGE_RES[1]:
             logger.warning("Image resolution exceeds 1920x1080.")
             return False, "Image resolution exceeds 1920x1080."
 
         logger.info("Image validation passed.")
         return True, None
     except Exception as e:
         logger.error(f"Error validating image: {e}")
         return False, str(e)
 
 def validate_video(path):
     """
     Validates the uploaded video based on size and duration limits.
 
     Args:
         path (str): Path to the video file.
 
     Returns:
         Tuple[bool, str or None]: (True, None) if valid; (False, reason) otherwise.
     """
     logger.info(f"Validating video file at: {path}")
     try:
         size_mb = os.path.getsize(path) / (1024 * 1024)
         if size_mb > MAX_VIDEO_MB:
             logger.warning("Video exceeds size limit of 50MB.")
             return False, "Video exceeds 50MB limit."
 
         cap = cv2.VideoCapture(path)
         fps = cap.get(cv2.CAP_PROP_FPS)
         frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
         duration = frames / fps if fps else 0
         cap.release()
 
         if duration > MAX_VIDEO_DURATION:
             logger.warning("Video exceeds 30 seconds duration limit.")
             return False, "Video exceeds 30 seconds duration limit."
 
         logger.info("Video validation passed.")
         return True, None
     except Exception as e:
         logger.error(f"Error validating video: {e}")
         return False, str(e)
 
 # Input Resolution
 def resolve_input(mode, uploaded_img, uploaded_imgs, uploaded_vid, url):
     """
     Resolves the input source based on user selection.
     Supports single image, multiple images, video, or URL-based media.
 
     Args:
         mode (str): Input mode - 'Upload' or 'URL'.
         uploaded_img (PIL.Image.Image): Single uploaded image.
         uploaded_imgs (List[PIL.Image.Image]): List of uploaded images (batch).
         uploaded_vid (str): Uploaded video file path.
         url (str): URL pointing to media content.
 
     Returns:
         List[Union[PIL.Image.Image, str, None]]: A list of media items to process.
     """
     logger.info(f"Resolving input based on mode: {mode}")
     try:
         if mode == "Upload":
             # Prefer batch if provided
             if uploaded_imgs and len(uploaded_imgs) > 0:
                 return uploaded_imgs
             elif uploaded_img:
                 return [uploaded_img]
             elif uploaded_vid:
                 return [uploaded_vid]
             else:
                 logger.warning("No valid upload provided.")
                 return None
 
         elif mode == "URL":
             media_from_url = fetch_media_from_url(url)
             if media_from_url:
                 return [media_from_url]
             else:
                 logger.warning("Failed to fetch valid media from URL.")
                 return None
 
         else:
             logger.warning("Invalid input mode selected.")
             return None
 
     except Exception as e:
         logger.error(f"Error resolving input: {e}")
         return None
 
 @timeout_decorator.timeout(35, use_signals=False) # 35 sec limit per image
 def process_image(
     image: Image.Image,
     run_det: bool,
     det_model: str,
     det_confidence: float,
     run_seg: bool,
     seg_model: str,
     run_depth: bool,
     depth_model: str,
     blend: float
 ):
     """
     Runs selected perception tasks on the input image and packages results.
 
     Args:
         image (PIL.Image): Input image.
         run_det (bool): Run object detection.
         det_model (str): Detection model key.
         det_confidence (float): Detection confidence threshold.
         run_seg (bool): Run segmentation.
         seg_model (str): Segmentation model key.
         run_depth (bool): Run depth estimation.
         depth_model (str): Depth model key.
         blend (float): Overlay blend alpha (0.0 - 1.0).
 
     Returns:
         Tuple[Image, dict, Tuple[str, bytes]]: Final image, scene JSON, and downloadable ZIP.
     """
     logger.info("Starting image processing pipeline.")
     start_time = time.time()
     outputs, scene = {}, {}
     combined_np = np.array(image)
 
     try:
         # Detection
         if run_det:
             logger.info(f"Running detection with model: {det_model}")
             load_start = time.time()
             model = get_model("detection", DETECTION_MODEL_MAP[det_model], device="cpu")
             logger.info(f"{det_model} detection model loaded in {time.time() - load_start:.2f} seconds.")
             boxes = model.predict(image, conf_threshold=det_confidence)
             overlay = model.draw(image, boxes)
             combined_np = np.array(overlay)
             buf = io.BytesIO()
             overlay.save(buf, format="PNG")
             outputs["detection.png"] = buf.getvalue()
             scene["detection"] = boxes
 
         # Segmentation
         if run_seg:
             logger.info(f"Running segmentation with model: {seg_model}")
             load_start = time.time()
             model = get_model("segmentation", SEGMENTATION_MODEL_MAP[seg_model], device="cpu")
             logger.info(f"{seg_model} segmentation model loaded in {time.time() - load_start:.2f} seconds.")
             mask = model.predict(image)
             overlay = model.draw(image, mask, alpha=blend)
             combined_np = cv2.addWeighted(combined_np, 1 - blend, np.array(overlay), blend, 0)
             buf = io.BytesIO()
             overlay.save(buf, format="PNG")
             outputs["segmentation.png"] = buf.getvalue()
             scene["segmentation"] = mask.tolist()
 
         # Depth Estimation
         if run_depth:
             logger.info(f"Running depth estimation with model: {depth_model}")
             load_start = time.time()
             model = get_model("depth", DEPTH_MODEL_MAP[depth_model], device="cpu")
             logger.info(f"{depth_model} depth model loaded in {time.time() - load_start:.2f} seconds.")
             dmap = model.predict(image)
             norm_dmap = ((dmap - dmap.min()) / (dmap.ptp()) * 255).astype(np.uint8)
             d_pil = Image.fromarray(norm_dmap)
             combined_np = cv2.addWeighted(combined_np, 1 - blend, np.array(d_pil.convert("RGB")), blend, 0)
             buf = io.BytesIO()
             d_pil.save(buf, format="PNG")
             outputs["depth_map.png"] = buf.getvalue()
             scene["depth"] = dmap.tolist()
 
         # Final image overlay
         final_img = Image.fromarray(combined_np)
         buf = io.BytesIO()
         final_img.save(buf, format="PNG")
         outputs["scene_blueprint.png"] = buf.getvalue()
 
         # Scene description
         try:
             scene_json = describe_scene(**scene)
         except Exception as e:
             logger.warning(f"describe_scene failed: {e}")
             scene_json = {"error": str(e)}
         telemetry = {
             "session_id": generate_session_id(),
             "runtime_sec": round(log_runtime(start_time), 2),
             "used_models": {
                 "detection": det_model if run_det else None,
                 "segmentation": seg_model if run_seg else None,
                 "depth": depth_model if run_depth else None
             }
         }
         scene_json["telemetry"] = telemetry
 
         outputs["scene_description.json"] = json.dumps(scene_json, indent=2).encode("utf-8")
 
         # ZIP file creation
         zip_buf = io.BytesIO()
         with zipfile.ZipFile(zip_buf, "w") as zipf:
             for name, data in outputs.items():
                 zipf.writestr(name, data)
 
         elapsed = log_runtime(start_time)
         logger.info(f"Image processing completed in {elapsed:.2f} seconds.")
 
         return final_img, scene_json, ("uvis_results.zip", zip_buf.getvalue())
 
     except Exception as e:
         logger.error(f"Error in processing pipeline: {e}")
         return None, {"error": str(e)}, None
 
 # Main Handler
 def handle(mode, img, imgs, vid, url, run_det, det_model, det_confidence, run_seg, seg_model, run_depth, depth_model, blend):
     """
     Master handler for resolving input and processing.
     Returns outputs for Gradio interface.
     """
     session_id = generate_session_id()
     logger.info(f"Session ID: {session_id} | Handler activated with mode: {mode}")
     start_time = time.time()
 
     media = resolve_input(mode, img, imgs, vid, url)
     if not media:
         return None, format_error("No valid input provided. Please check your upload or URL."), None
 
     results = []
     for single_media in media:
         if isinstance(single_media, str): # Video file
             valid, err = validate_video(single_media)
             if not valid:
                 return None, format_error(err), None
             cap = cv2.VideoCapture(single_media)
             ret, frame = cap.read()
             cap.release()
             if not ret:
                 return None, format_error("Failed to read video frame."), None
             single_media = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
 
         if isinstance(single_media, Image.Image):
             valid, err = validate_image(single_media)
             if not valid:
                 return None, format_error(err), None
             try:
                 return process_image(single_media, run_det, det_model, det_confidence, run_seg, seg_model, run_depth, depth_model, blend)
             except timeout_decorator.timeout_decorator.TimeoutError:
                 logger.error("Image processing timed out.")
                 return None, format_error("Processing timed out. Try a smaller image or simpler model."), None
 
     logger.warning("Unsupported media type resolved.")
     log_runtime(start_time)
     return None, format_error("Invalid input. Please check your upload or URL."), None
 
 # Gradio Interface
 with gr.Blocks() as demo:
     gr.Markdown("## Unified Visual Intelligence System (UVIS)")
 
     # Input Mode Selection
     mode = gr.Radio(["Upload", "URL"], value="Upload", label="Input Mode")
     img = gr.Image(type="pil", label="Upload Image")
-    imgs = gr.Gallery(label="Upload Multiple Images (Up to 5)").style(grid=[5], height="auto")
+    imgs = gr.Gallery(label="Upload Multiple Images (Up to 5)")
     vid = gr.Video(label="Upload Video (<= 30s)")
     url = gr.Textbox(label="URL (Image/Video)")
 
     # Task Selection with parameters
     with gr.Accordion("Object Detection Settings", open=False):
         run_det = gr.Checkbox(label="Enable Object Detection")
         det_model = gr.Dropdown(list(DETECTION_MODEL_MAP), label="Detection Model", visible=False)
         det_confidence = gr.Slider(0.1, 1.0, 0.5, label="Detection Confidence Threshold", visible=False)
 
     with gr.Accordion("Semantic Segmentation Settings", open=False):
         run_seg = gr.Checkbox(label="Enable Segmentation")
         seg_model = gr.Dropdown(list(SEGMENTATION_MODEL_MAP), label="Segmentation Model", visible=False)
 
     with gr.Accordion("Depth Estimation Settings", open=False):
         run_depth = gr.Checkbox(label="Enable Depth Estimation")
         depth_model = gr.Dropdown(list(DEPTH_MODEL_MAP), label="Depth Model", visible=False)
 
     blend = gr.Slider(0.0, 1.0, 0.5, label="Overlay Blend")
 
     # Run Button
     run = gr.Button("Run Analysis")
 
     # Output Tabs
     with gr.Tab("Scene JSON"):
         json_out = gr.JSON()
     with gr.Tab("Scene Blueprint"):
         img_out = gr.Image()
     with gr.Tab("Download"):
         zip_out = gr.File()
 
     # Attach Visibility Logic
     run_det.change(toggle_visibility, run_det, [det_model, det_confidence])
     run_seg.change(toggle_visibility, run_seg, [seg_model])
     run_depth.change(toggle_visibility, run_depth, [depth_model])
 
     # Button Click Event
     run.click(
         handle,
         inputs=[mode, img, imgs, vid, url, run_det, det_model, det_confidence, run_seg, seg_model, run_depth, depth_model, blend],
         outputs=[img_out, json_out, zip_out]
     )
 
     # Footer Section
     gr.Markdown("---")
     gr.Markdown(
         """
         <div style='text-align: center; font-size: 14px;'>
         Built by <b>Durga Deepak Valluri</b><br>
         <a href="https://github.com/DurgaDeepakValluri/UVIS" target="_blank">GitHub</a> |
         <a href="https://deecoded.io" target="_blank">Website</a> |
         <a href="https://www.linkedin.com/in/durga-deepak-valluri" target="_blank">LinkedIn</a>
         </div>
         """,
         unsafe_allow_html=True
     )
 
 # Launch the Gradio App
 demo.launch()
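Since `.style()` was removed outright in Gradio 4.x rather than renamed, the quickest check that the changed line is valid in a current environment is simply constructing the component; a hedged smoke-test sketch, assuming gradio>=4 is installed:

import gradio as gr

# On gradio>=4 this builds cleanly; the old chained form
# gr.Gallery(...).style(...) would raise AttributeError instead.
with gr.Blocks():
    gallery = gr.Gallery(label="Upload Multiple Images (Up to 5)")
print("Gallery constructed without .style()")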
 