gpaasch committed
Commit ede3b41 · 1 Parent(s): 5758d49

microphone should now transcribe directly to text without needing intermediate audio file processing

Files changed (1)
  1. src/app.py +44 -69
src/app.py CHANGED
@@ -169,62 +169,40 @@ or, if you have enough info, output a final JSON with fields:
 {"diagnoses":[…], "confidences":[…]}.
 """
 
-def process_speech(new_transcript, history):
-    if not new_transcript:
-        return history
-
-    if not isinstance(new_transcript, str):
-        print(f"Warning: Expected string transcript, got {type(new_transcript)}")
-        new_transcript = str(new_transcript)
-
+def process_speech(audio_path, history):
+    """Process speech input and convert to text."""
     try:
-        # First, get potential diagnoses based on symptoms
+        if not audio_path:
+            return []
+
+        # The audio_path now contains the transcribed text directly from Gradio
+        transcript = audio_path
+
+        # Query the symptom index
         diagnosis_query = f"""
-        Given these symptoms: '{new_transcript}'
+        Given these symptoms: '{transcript}'
 
         Identify the most likely ICD-10 diagnoses and key questions to differentiate between them.
         Focus only on symptoms mentioned and their clinical implications.
-
-        Format response as:
-        1. Primary suspected diagnosis: [ICD-10 code] - [description]
-        2. Alternative diagnosis: [ICD-10 code] - [description]
-        3. Key differentiating question
         """
 
         response = symptom_index.as_query_engine().query(diagnosis_query)
 
-        # Parse response into structured format
-        lines = str(response).strip().split('\n')
-        diagnoses = []
-        follow_up = ""
-
-        for line in lines:
-            if '[' in line and ']' in line:  # Extract ICD-10 codes
-                code = line[line.find('[')+1:line.find(']')]
-                diagnoses.append(code)
-            elif 'Key differentiating question' in line:
-                follow_up = line.split(':')[-1].strip()
-
+        # Format response
         formatted_response = {
-            "diagnoses": diagnoses[:2],  # Top 2 diagnoses
-            "confidences": [0.7, 0.3] if len(diagnoses) > 1 else [0.7],  # Weighted confidences
-            "follow_up": follow_up if follow_up else "What other symptoms are you experiencing?"
+            "diagnoses": [],
+            "confidences": [],
+            "follow_up": str(response)
         }
 
-        history.append({"role": "user", "content": new_transcript})
-        history.append({"role": "assistant", "content": json.dumps(formatted_response, indent=2)})
+        return [
+            {"role": "user", "content": transcript},
+            {"role": "assistant", "content": json.dumps(formatted_response)}
+        ]
 
     except Exception as e:
-        print(f"Error processing speech: {str(e)}")
-        error_response = {
-            "diagnoses": ["Error processing symptoms"],
-            "confidences": [0],
-            "follow_up": "Could you please repeat your symptoms?"
-        }
-        history.append({"role": "user", "content": new_transcript})
-        history.append({"role": "assistant", "content": json.dumps(error_response, indent=2)})
-
-    return history
+        print(f"Error processing speech: {e}")
+        return []
 
 def text_to_speech(text):
     """Convert text to speech and return audio HTML element."""
@@ -270,10 +248,16 @@ with gr.Blocks() as demo:
         with gr.Column(scale=2):
             # Moved microphone row above chatbot
             with gr.Row():
-                microphone = gr.Microphone(
+                microphone = gr.Audio(
+                    source="microphone",
+                    type="text",  # Changed from filepath to text
                     label="Describe your symptoms",
-                    streaming=True,
-                    type="filepath"
+                    streaming=True
+                )
+                transcript_box = gr.Textbox(
+                    label="Transcribed Text",
+                    interactive=False,
+                    show_label=True
                 )
             clear_btn = gr.Button("Clear Chat", variant="secondary")
 
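Worth noting: `type="text"` is not a documented value for `gr.Audio` in mainline Gradio releases, which accept only `"filepath"` or `"numpy"`. If the component rejects it, an equivalent wiring keeps `type="filepath"` and transcribes in the stream handler; the `transcribe` helper below is hypothetical and stands in for whatever speech-to-text call produces the text.

```python
import gradio as gr

def transcribe(audio_path):
    """Hypothetical speech-to-text helper; not part of this commit."""
    return f"transcript of {audio_path}"

with gr.Blocks() as demo:
    with gr.Row():
        # Gradio 4.x spelling: sources is a list, and type must be
        # "filepath" or "numpy"; streaming=True fires the handler on chunks.
        microphone = gr.Audio(sources=["microphone"], type="filepath",
                              streaming=True, label="Describe your symptoms")
        transcript_box = gr.Textbox(label="Transcribed Text", interactive=False)
    microphone.stream(fn=transcribe, inputs=microphone, outputs=transcript_box)
```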
@@ -308,39 +292,30 @@ with gr.Blocks() as demo:
     clear_btn.click(lambda: None, None, chatbot, queue=False)
 
     def enhanced_process_speech(audio_path, history, api_key=None, model_tier="small", temp=0.7):
-        """Process speech input and return formatted chat messages."""
+        """Handle speech processing and chat formatting."""
         if not audio_path:
             return history
 
-        # First get the raw transcript and response
-        transcript = process_speech(audio_path, [])  # Start fresh history
-        if not transcript or len(transcript) < 2:
-            return history + [
-                {"role": "user", "content": "Audio recording"},
-                {"role": "assistant", "content": "I couldn't process that audio. Could you try again?"}
-            ]
-
-        try:
-            # Get the last assistant response
-            user_message = transcript[-2]["content"]  # What the user said
-            assistant_json = transcript[-1]["content"]  # JSON response from assistant
+        # Process the new audio input
+        new_messages = process_speech(audio_path, history)
+        if not new_messages:
+            return history
 
-            # Parse and format the assistant's response
-            response_dict = json.loads(assistant_json)
-            formatted_response = format_response_for_user(response_dict)
+        try:
+            # Format last assistant response
+            assistant_response = new_messages[-1]["content"]
+            response_dict = json.loads(assistant_response)
+            formatted_text = format_response_for_user(response_dict)
 
-            # Add the exchange to history in the correct message format
+            # Add to history with proper message format
             return history + [
-                {"role": "user", "content": user_message},
-                {"role": "assistant", "content": formatted_response}
+                {"role": "user", "content": new_messages[0]["content"]},
+                {"role": "assistant", "content": formatted_text}
             ]
 
         except Exception as e:
-            print(f"Error formatting chat response: {e}")
-            return history + [
-                {"role": "user", "content": "Error processing audio"},
-                {"role": "assistant", "content": "Sorry, I encountered an error processing your symptoms. Could you try again?"}
-            ]
+            print(f"Error formatting response: {e}")
+            return history
 
     microphone.stream(
         fn=enhanced_process_speech,
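
`format_response_for_user` is referenced but not defined in this diff; a plausible sketch, purely an assumption, consistent with the JSON fields `process_speech` emits:

```python
def format_response_for_user(response_dict):
    """Assumed behavior: render the assistant's JSON payload as chat text."""
    lines = [
        f"- {code} (confidence {conf:.0%})"
        for code, conf in zip(response_dict.get("diagnoses", []),
                              response_dict.get("confidences", []))
    ]
    if response_dict.get("follow_up"):
        lines.append(response_dict["follow_up"])
    return "\n".join(lines) or "No diagnoses identified yet."
```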
 