tonyliu404 committed on
Commit
7ea89c4
·
verified ·
1 Parent(s): 9814196

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +105 -220
app.py CHANGED
@@ -1,220 +1,105 @@
1
- import streamlit as st
2
- import cv2
3
- import tempfile
4
- import numpy as np
5
- import torch
6
- import torch.nn.functional as F
7
- import os
8
- from PIL import Image
9
- import time
10
- import requests
11
- import json
12
- import base64
13
- import threading
14
- import pygame
15
- import queue
16
-
17
- backend_server_url = "https://0416-2600-1017-a410-36b8-2357-52be-1318-959b.ngrok-free.app"
18
-
19
- response_queue = queue.Queue() #For thread-safe communication between threads
20
-
21
- # To keep track of ongoing threads
22
- send_thread = None
23
-
24
- # def playAudio(audio_base64):
25
- # # Decode the base64 string into bytes
26
- # audio_bytes = base64.b64decode(audio_base64)
27
-
28
- # # Save to a file
29
- # audio_path = "output_audio.wav"
30
- # with open(audio_path, "wb") as f:
31
- # f.write(audio_bytes)
32
-
33
- # # Display audio player in Streamlit
34
- # st.audio(audio_bytes, format="audio/wav")
35
-
36
- def playAudio(audio_base64):
37
- """
38
- Play audio file using pygame mixer.
39
-
40
- Args:
41
- audio_path: Path to audio file
42
- """
43
- # Decode the base64 string into bytes
44
- audio_bytes = base64.b64decode(audio_base64)
45
-
46
- try:
47
- # Write bytes to a temporary WAV file
48
- with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
49
- temp_audio.write(audio_bytes)
50
- temp_audio_path = temp_audio.name
51
-
52
- # Initialize pygame mixer
53
- pygame.mixer.init()
54
- pygame.mixer.music.load(temp_audio_path)
55
- pygame.mixer.music.play()
56
-
57
- # Wait for playback to finish
58
- while pygame.mixer.music.get_busy():
59
- pygame.time.Clock().tick(10)
60
-
61
- except Exception as e:
62
- print(f"Error playing audio: {e}")
63
-
64
- def sendToBackend(frame, audio = None):
65
- try:
66
- # Save current frame to disk
67
- cv2.imwrite("frame.jpg", frame)
68
-
69
- # Create an empty audio file (1 second of silence if needed)
70
- empty_audio_path = "input.mp3"
71
- if not os.path.exists(empty_audio_path):
72
- with open(empty_audio_path, "wb") as f:
73
- f.write(b"")
74
-
75
- with open("frame.jpg", "rb") as img, open("input.mp3", "rb") as audio:
76
- files = {
77
- "image": ("frame.jpg", img, "image/jpeg"),
78
- "audio": ("input.mp3", audio, "audio/mpeg")
79
- }
80
- # Send the request to the backend server
81
- #response = requests.post("http://localhost:8000/process/", files=files)
82
- response = requests.post(backend_server_url + "/process/", files=files)
83
-
84
- if response.status_code == 200: #If the request was successful
85
- st.success("Frame sent successfully!")
86
- response_queue.put(response.json())
87
- else:
88
- st.error(f"Failed: {response.status_code} - {response.text}")
89
- except Exception as e:
90
- st.error(f"Error sending frame: {e}")
91
-
92
-
93
- def thread_sendToBackend():
94
- global send_thread
95
- if not st.session_state.paused and send_thread is None:
96
- send_thread = threading.Thread(target=sendToBackend, args=(frame,), daemon=True)
97
- send_thread.start()
98
-
99
- # Setup
100
- if "last_frame" not in st.session_state:
101
- st.session_state.last_frame = None
102
- if "paused" not in st.session_state:
103
- st.session_state.paused = False
104
-
105
- # if not st.session_state.paused:
106
- # threading.Thread(target=sendToBackend, args=(st.session_state.last_frame,), daemon=True).start()
107
-
108
-
109
- st.title("SpokenVision - Real-time Object Detection and Segmentation")
110
-
111
- def toggle_pause():
112
- st.session_state.paused = not st.session_state.paused
113
-
114
- col1, col2 = st.columns(2)
115
- with col1:
116
- st.button("Start" if st.session_state.paused else "Pause", on_click=toggle_pause)
117
-
118
- def main():
119
-
120
- # Camera selection options
121
- camera_options = {
122
- "Built-in Camera": 0,
123
- "External Camera": 1,
124
- "Mobile Phone Camera (requires IP Webcam app)": "http://YOUR_PHONE_IP:8080/video"
125
- }
126
-
127
- camera_choice = st.selectbox(
128
- "Choose Camera Source",
129
- list(camera_options.keys()),
130
- index=0
131
- )
132
-
133
- # Instructions for phone camera
134
- if "Mobile" in camera_choice:
135
- st.info("""
136
- To use your phone as a camera:
137
- 1. Install the 'IP Webcam' app from Play Store (Android) or similar app for iOS
138
- 2. Open the app and click 'Start server'
139
- 3. Replace 'YOUR_PHONE_IP' in the code with your phone's IP address shown in the app
140
- 4. Make sure your phone and computer are on the same network
141
- """)
142
-
143
- # Camera stream capture
144
- stframe = st.empty()
145
- camera_source = camera_options[camera_choice]
146
-
147
- # Setup capture once and keep it in session
148
- if 'cap' not in st.session_state:
149
- cap = cv2.VideoCapture(camera_source)
150
- if not cap.isOpened():
151
- st.error(f"Could not open camera {camera_source}")
152
- else:
153
- st.session_state.cap = cap
154
- st.session_state.streaming = True
155
- st.session_state.paused = False
156
-
157
- threading.Thread(target=sendToBackend, args=(None,), daemon=True).start()
158
- else:
159
- cap = st.session_state.cap
160
-
161
- # Start camera feed
162
- try:
163
- if not cap.isOpened():
164
- st.error(f"Could not open camera {camera_source}")
165
- return
166
-
167
- while True:
168
-
169
- # # Always check and render server response
170
- if not response_queue.empty():
171
- response = response_queue.get()
172
- st.markdown("### Server Response")
173
-
174
- if "caption" in response:
175
- st.write("Caption:", response["caption"])
176
-
177
- if "audio_base64" in response:
178
- threading.Thread(target=playAudio, args=(response["audio_base64"],), daemon=True).start()
179
- if not st.session_state.paused:
180
- threading.Thread(target=sendToBackend, args=(st.session_state.last_frame,), daemon=True).start() #auto send after receiving audio
181
-
182
- if not st.session_state.paused:
183
- ret, frame = cap.read()
184
- if not ret:
185
- st.error("Failed to grab frame from camera")
186
- break
187
- st.session_state.last_frame = frame # Save last good frame
188
- else:
189
- frame = st.session_state.get('last_frame', None)
190
- if frame is None:
191
- time.sleep(0.05)
192
- continue
193
-
194
- # Convert BGR to RGB
195
- rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
196
-
197
- # Display frame in Streamlit
198
- stframe.image(rgb_frame, channels="RGB", use_container_width=True)
199
-
200
- # Start sending frame to backend only if no other thread is running
201
- global send_thread
202
- if not st.session_state.paused and send_thread is None:
203
- send_thread = threading.Thread(target=sendToBackend, args=(frame,), daemon=True)
204
- send_thread.start()
205
-
206
- # Small delay to prevent high CPU usage
207
- time.sleep(0.05)
208
-
209
- except Exception as e:
210
- st.error(f"Error: {e}")
211
-
212
- finally:
213
- # Release resources on exit
214
- if 'cap' in st.session_state:
215
- st.session_state.cap.release()
216
- st.session_state.running = False
217
- st.stop()
218
-
219
- if __name__ == "__main__":
220
- main()
 
1
+ import gradio as gr
2
+ import cv2
3
+ import tempfile
4
+ import numpy as np
5
+ import os
6
+ import time
7
+ import requests
8
+ import base64
9
+ import threading
10
+ import pygame
11
+
12
+ # Backend server URL
13
+ backend_server_url = "https://0416-2600-1017-a410-36b8-2357-52be-1318-959b.ngrok-free.app"
14
+
15
+ send_thread = None # To keep track of ongoing threads
16
+
17
+ # Audio playback
18
def play_audio(audio_base64):
    """Decode base64-encoded audio and play it through the pygame mixer.

    Blocks until playback has finished, so callers that must stay
    responsive should run it on a worker thread.

    Args:
        audio_base64: Base64-encoded audio data. The decoded bytes are
            staged in a temporary ``.wav`` file for playback.

    Raises:
        binascii.Error: If ``audio_base64`` is not valid base64. Decoding
            happens before the ``try`` block, so this is not swallowed.
    """
    audio_bytes = base64.b64decode(audio_base64)
    temp_audio_path = None
    try:
        # Stage the bytes in a temp file so the mixer can load them by path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
            temp_audio.write(audio_bytes)
            temp_audio_path = temp_audio.name

        pygame.mixer.init()
        pygame.mixer.music.load(temp_audio_path)
        pygame.mixer.music.play()

        # Poll at roughly 10 Hz until the mixer reports playback is done.
        clock = pygame.time.Clock()
        while pygame.mixer.music.get_busy():
            clock.tick(10)
    except Exception as e:
        # Playback is best-effort: report the problem and carry on.
        print(f"Audio error: {e}")
    finally:
        # delete=False means nobody else removes the file; without this
        # every played clip would leave a stray temp file behind.
        if temp_audio_path is not None:
            try:
                os.remove(temp_audio_path)
            except OSError:
                # Best-effort cleanup (e.g. file still held open by the mixer).
                pass
38
+
39
+ # Backend interaction
40
def send_to_backend(frame, timeout=30):
    """Send one frame (plus the placeholder audio file) to the backend.

    The frame is resized to 224x224, JPEG-encoded in memory and posted to
    ``backend_server_url + "/process/"`` as multipart form data together
    with the contents of ``input.mp3``.

    Args:
        frame: Image array accepted by ``cv2.resize``; ``None`` is rejected
            with an error dict instead of being passed to OpenCV.
        timeout: Seconds to wait for the backend before giving up. Without
            a timeout ``requests`` can block indefinitely.

    Returns:
        The decoded JSON response on HTTP 200, otherwise a dict with a
        single ``"error"`` key describing what went wrong. This function
        never raises; all failures are reported through the error dict.
    """
    if frame is None:
        return {"error": "No frame provided"}

    try:
        small_frame = cv2.resize(frame, (224, 224))

        # Encode in memory rather than writing a shared "frame.jpg":
        # concurrent calls can no longer overwrite each other's image, and
        # an encoding failure is detected instead of silently ignored.
        encoded_ok, encoded_frame = cv2.imencode(".jpg", small_frame)
        if not encoded_ok:
            return {"error": "Exception: could not encode frame as JPEG"}
        image_bytes = encoded_frame.tobytes()

        # Ensure dummy audio file exists
        empty_audio_path = "input.mp3"
        if not os.path.exists(empty_audio_path):
            with open(empty_audio_path, "wb") as f:
                f.write(b"")

        with open(empty_audio_path, "rb") as audio:
            files = {
                "image": ("frame.jpg", image_bytes, "image/jpeg"),
                "audio": ("input.mp3", audio, "audio/mpeg"),
            }
            response = requests.post(
                backend_server_url + "/process/",
                files=files,
                timeout=timeout,
            )

        if response.status_code == 200:
            return response.json()
        return {"error": f"Backend error {response.status_code}: {response.text}"}
    except Exception as e:
        # Boundary handler: callers expect an error dict, not an exception.
        return {"error": f"Exception: {str(e)}"}
66
+
67
def thread_sendToBackend(frame):
    """Start a background thread that sends ``frame`` to the backend.

    At most one sender thread runs at a time: if the previous one is still
    alive the call is a no-op. The result of ``send_to_backend`` is
    discarded by the thread.

    Args:
        frame: Frame forwarded unchanged to ``send_to_backend``.
    """
    global send_thread
    # The original only checked ``send_thread is None`` and never reset it,
    # so nothing was ever sent after the first call. Also accept a thread
    # that has already finished.
    if send_thread is None or not send_thread.is_alive():
        send_thread = threading.Thread(target=send_to_backend, args=(frame,), daemon=True)
        send_thread.start()
73
+
74
+
75
+ # # Gradio processing function
76
def process_webcam(image):
    """Caption one webcam/upload frame and start audio playback for it.

    Args:
        image: RGB image from the Gradio image input, or ``None`` when no
            frame is available.

    Returns:
        A single string for the caption textbox: the backend caption,
        ``"No frame"`` when ``image`` is ``None``, or the backend error
        message when the request failed.
    """
    if image is None:
        # The interface declares exactly one output, so return one value
        # (the original returned a 3-tuple here).
        return "No frame"

    # The backend path uses OpenCV, which works in BGR channel order.
    frame = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    result = send_to_backend(frame)

    # send_to_backend reports failures as {"error": ...}; show that message
    # instead of raising KeyError on the missing 'caption' key.
    if "error" in result:
        return result["error"]

    caption = result.get("caption", "")
    audio_base64 = result.get("audio_base64")

    if audio_base64:
        # Playback blocks until finished, so run it off the request thread.
        threading.Thread(target=play_audio, args=(audio_base64,), daemon=True).start()

    return caption
90
+
91
+
92
+ # Gradio interface
93
# Input: an image that can come from an upload or from the webcam.
frame_input = gr.Image(sources=["upload", "webcam"])

# Output: a single textbox holding the caption returned by process_webcam.
caption_outputs = [
    gr.Textbox(label="Caption"),
]

# Wire the processing function to the components and start the app.
demo = gr.Interface(
    fn=process_webcam,
    inputs=frame_input,
    outputs=caption_outputs,
    live=True,
    title="SpokenVision",
    description="Real-time object detection and captioning with audio feedback",
    allow_flagging="never",
)

demo.launch()