microphone should now transcribe directly to text without needing intermediate audio file processing
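In practice this means the streaming callback now receives the transcribed string itself instead of a path to a temporary audio file. A minimal, self-contained sketch of that flow (fake_query and process_speech_sketch are illustrative stand-ins; the real code queries symptom_index.as_query_engine() and lives in src/app.py):

import json

def fake_query(prompt: str) -> str:
    # Stand-in for symptom_index.as_query_engine().query(...) in src/app.py.
    return "Key differentiating question: How long have the symptoms lasted?"

def process_speech_sketch(transcript: str) -> list:
    # Mirrors the new process_speech contract: the input is already text,
    # so no temporary audio file is written or re-read here.
    if not transcript:
        return []
    response = fake_query(f"Given these symptoms: '{transcript}'")
    formatted_response = {
        "diagnoses": [],
        "confidences": [],
        "follow_up": str(response),
    }
    return [
        {"role": "user", "content": transcript},
        {"role": "assistant", "content": json.dumps(formatted_response)},
    ]

print(process_speech_sketch("throbbing headache for two days"))

Running it prints a two-message chat turn, which is the shape the downstream Gradio chatbot handlers expect.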
src/app.py  CHANGED  (+44 -69)
@@ -169,62 +169,40 @@ or, if you have enough info, output a final JSON with fields:
     {"diagnoses":[…], "confidences":[…]}.
     """
 
-def process_speech(
-
-        return history
-
-    if not isinstance(new_transcript, str):
-        print(f"Warning: Expected string transcript, got {type(new_transcript)}")
-        new_transcript = str(new_transcript)
-
+def process_speech(audio_path, history):
+    """Process speech input and convert to text."""
     try:
-
+        if not audio_path:
+            return []
+
+        # The audio_path now contains the transcribed text directly from Gradio
+        transcript = audio_path
+
+        # Query the symptom index
         diagnosis_query = f"""
-        Given these symptoms: '{
+        Given these symptoms: '{transcript}'
 
         Identify the most likely ICD-10 diagnoses and key questions to differentiate between them.
         Focus only on symptoms mentioned and their clinical implications.
-
-        Format response as:
-        1. Primary suspected diagnosis: [ICD-10 code] - [description]
-        2. Alternative diagnosis: [ICD-10 code] - [description]
-        3. Key differentiating question
         """
 
         response = symptom_index.as_query_engine().query(diagnosis_query)
 
-        #
-        lines = str(response).strip().split('\n')
-        diagnoses = []
-        follow_up = ""
-
-        for line in lines:
-            if '[' in line and ']' in line:  # Extract ICD-10 codes
-                code = line[line.find('[')+1:line.find(']')]
-                diagnoses.append(code)
-            elif 'Key differentiating question' in line:
-                follow_up = line.split(':')[-1].strip()
-
+        # Format response
         formatted_response = {
-            "diagnoses":
-            "confidences": [
-            "follow_up":
+            "diagnoses": [],
+            "confidences": [],
+            "follow_up": str(response)
         }
 
-
-
+        return [
+            {"role": "user", "content": transcript},
+            {"role": "assistant", "content": json.dumps(formatted_response)}
+        ]
 
     except Exception as e:
-        print(f"Error processing speech: {
-
-            "diagnoses": ["Error processing symptoms"],
-            "confidences": [0],
-            "follow_up": "Could you please repeat your symptoms?"
-        }
-        history.append({"role": "user", "content": new_transcript})
-        history.append({"role": "assistant", "content": json.dumps(error_response, indent=2)})
-
-        return history
+        print(f"Error processing speech: {e}")
+        return []
 
 def text_to_speech(text):
     """Convert text to speech and return audio HTML element."""
@@ -270,10 +248,16 @@ with gr.Blocks() as demo:
         with gr.Column(scale=2):
             # Moved microphone row above chatbot
            with gr.Row():
-                microphone = gr.
+                microphone = gr.Audio(
+                    source="microphone",
+                    type="text",  # Changed from filepath to text
                     label="Describe your symptoms",
-                    streaming=True
-
+                    streaming=True
+                )
+                transcript_box = gr.Textbox(
+                    label="Transcribed Text",
+                    interactive=False,
+                    show_label=True
                 )
             clear_btn = gr.Button("Clear Chat", variant="secondary")
 
@@ -308,39 +292,30 @@ with gr.Blocks() as demo:
     clear_btn.click(lambda: None, None, chatbot, queue=False)
 
    def enhanced_process_speech(audio_path, history, api_key=None, model_tier="small", temp=0.7):
-        """
+        """Handle speech processing and chat formatting."""
         if not audio_path:
             return history
 
-        #
-
-        if not
-            return history
-                {"role": "user", "content": "Audio recording"},
-                {"role": "assistant", "content": "I couldn't process that audio. Could you try again?"}
-            ]
-
-        try:
-            # Get the last assistant response
-            user_message = transcript[-2]["content"]  # What the user said
-            assistant_json = transcript[-1]["content"]  # JSON response from assistant
+        # Process the new audio input
+        new_messages = process_speech(audio_path, history)
+        if not new_messages:
+            return history
 
-
-
-
+        try:
+            # Format last assistant response
+            assistant_response = new_messages[-1]["content"]
+            response_dict = json.loads(assistant_response)
+            formatted_text = format_response_for_user(response_dict)
 
-        # Add
+            # Add to history with proper message format
             return history + [
-                {"role": "user", "content":
-                {"role": "assistant", "content":
+                {"role": "user", "content": new_messages[0]["content"]},
+                {"role": "assistant", "content": formatted_text}
             ]
 
         except Exception as e:
-            print(f"Error formatting
-            return history
-                {"role": "user", "content": "Error processing audio"},
-                {"role": "assistant", "content": "Sorry, I encountered an error processing your symptoms. Could you try again?"}
-            ]
+            print(f"Error formatting response: {e}")
+            return history
 
     microphone.stream(
         fn=enhanced_process_speech,