import os, tempfile, time, traceback
from pathlib import Path

import gradio as gr
from groq import Groq


# Read the secret from HF Spaces. Support both "groq_api_key" and "GROQ_API_KEY".
def _load_key() -> str:
    key = os.environ.get("GROQ_API_KEY") or os.environ.get("groq_api_key")
    if not key:
        raise RuntimeError(
            "Groq API key not found. In your Space settings -> Secrets, add 'groq_api_key'."
        )
    os.environ["GROQ_API_KEY"] = key
    return key


client = Groq(api_key=_load_key())


def transcribe_audio(audio_path: str, model: str = "whisper-large-v3") -> str:
    if not audio_path or not Path(audio_path).exists():
        raise FileNotFoundError("Audio file path is missing or not found.")
    with open(audio_path, "rb") as f:
        resp = client.audio.transcriptions.create(
            file=(Path(audio_path).name, f.read()),
            model=model,
            response_format="verbose_json",
        )
    return (getattr(resp, "text", "") or "").strip()


def stream_answer(prompt_text: str, model: str = "llama-3.1-8b-instant", temperature: float = 0.3):
    if not prompt_text.strip():
        raise ValueError("Empty prompt for the LLM.")
    stream = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "Answer clearly and concisely."},
            {"role": "user", "content": prompt_text},
        ],
        temperature=temperature,
        max_completion_tokens=1024,
        top_p=1,
        stream=True,
    )
    acc = []
    for chunk in stream:
        # Defensive: some stream chunks (e.g., a trailing usage chunk) may carry no choices.
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta.content or ""
        if delta:
            acc.append(delta)
            yield "".join(acc)
    yield "".join(acc)


def text_to_speech(text: str, voice: str = "Calum-PlayAI", model: str = "playai-tts", fmt: str = "wav") -> str:
    if not text.strip():
        raise ValueError("Empty text for TTS.")
    # Keep TTS input short; very long answers slow down or fail speech synthesis.
    tts_input = text[:1200]
    resp = client.audio.speech.create(
        model=model,
        voice=voice,
        response_format=fmt,
        input=tts_input,
    )
    out_path = os.path.join(tempfile.gettempdir(), f"answer_{int(time.time())}.{fmt}")
    # The Groq SDK returns a BinaryAPIResponse; write_to_file saves the audio bytes.
    resp.write_to_file(out_path)
    return out_path


def run_pipeline(audio_file, typed_question, llm_model, voice_name):
    transcript = ""
    answer = ""
    try:
        if typed_question and typed_question.strip():
            transcript = typed_question.strip()
            status = "Using typed question."
        else:
            if not audio_file:
                raise RuntimeError("Provide a recording or type a question.")
            status = "Transcribing audio..."
            yield transcript, answer, None, status
            transcript = transcribe_audio(audio_file)
            if not transcript:
                raise RuntimeError("No text returned by transcription.")
            status = "Transcription done."
        yield transcript, answer, None, status

        status = "Generating answer..."
        partial = ""
        for partial in stream_answer(transcript, model=llm_model):
            answer = partial
            yield transcript, answer, None, status
        if not answer.strip():
            raise RuntimeError("No text returned by the LLM.")

        status = "Converting answer to speech..."
        yield transcript, answer, None, status
        audio_out = text_to_speech(answer, voice=voice_name)
        status = "Done."
        yield transcript, answer, audio_out, status
    except Exception as e:
        err = "Error: " + str(e)
        short_tb = "\n".join(traceback.format_exc().splitlines()[-6:])
        help_tip = (
            "\nTips:\n"
            "- Check Space secret 'groq_api_key'.\n"
            "- Try a shorter audio clip.\n"
            "- Verify model names.\n"
            "- Confirm requirements installed."
        )
        yield transcript, answer, None, err + "\n" + short_tb + help_tip
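
# A minimal sketch of exercising the helpers outside Gradio, assuming a local
# clip named "sample.wav" (hypothetical path; any short WAV/MP3 should work):
#
#   transcript = transcribe_audio("sample.wav")
#   final = ""
#   for final in stream_answer(transcript):
#       pass  # each iteration yields the accumulated answer so far
#   speech_path = text_to_speech(final)
#   print(transcript, final, speech_path, sep="\n")
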
with gr.Blocks(title="Audio Q&A with Groq") as demo:
    gr.Markdown("# Audio Q&A with Groq")
    gr.Markdown("One audio or typed question in, one answer out, plus speech.")

    with gr.Row():
        audio_in = gr.Audio(
            sources=["microphone", "upload"], type="filepath", label="Question audio"
        )
        typed_in = gr.Textbox(label="Or type your question", placeholder="Optional")

    with gr.Row():
        llm_model = gr.Dropdown(
            choices=[
                "llama-3.1-8b-instant",
                "llama-3.1-70b-versatile",
                "llama3-8b-8192",
            ],
            value="llama-3.1-8b-instant",
            label="LLM model",
        )
        voice_name = gr.Textbox(value="Calum-PlayAI", label="TTS voice")

    ask_btn = gr.Button("Run")
    clear_btn = gr.Button("Clear")

    transcript_box = gr.Textbox(label="Transcription", interactive=False, lines=4)
    answer_box = gr.Textbox(label="Answer", interactive=False, lines=10)
    answer_audio = gr.Audio(label="Answer speech", interactive=False)
    status_md = gr.Markdown("")

    ask_btn.click(
        fn=run_pipeline,
        inputs=[audio_in, typed_in, llm_model, voice_name],
        outputs=[transcript_box, answer_box, answer_audio, status_md],
    )

    def clear_all():
        return "", "", None, ""

    clear_btn.click(
        fn=clear_all,
        inputs=None,
        outputs=[transcript_box, answer_box, answer_audio, status_md],
    )


if __name__ == "__main__":
    # On HF Spaces you can simply do demo.launch()
    # Queue enables generator streaming without extra args in Gradio v4
    demo.queue().launch()
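
# Assumed requirements.txt for this Space (not part of the original file;
# unpinned, matching the "Confirm requirements installed" tip above):
#   gradio
#   groq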