import os, tempfile, time, traceback
from pathlib import Path

import gradio as gr
from groq import Groq


# Read the secret from HF Spaces. Support both "groq_api_key" and "GROQ_API_KEY".
def _load_key() -> str:
    key = os.environ.get("GROQ_API_KEY") or os.environ.get("groq_api_key")
    if not key:
        raise RuntimeError(
            "Groq API key not found. In your Space settings -> Secrets, add 'groq_api_key'."
        )
    os.environ["GROQ_API_KEY"] = key
    return key


client = Groq(api_key=_load_key())

def transcribe_audio(audio_path: str, model: str = "whisper-large-v3") -> str:
    if not audio_path or not Path(audio_path).exists():
        raise FileNotFoundError("Audio file path is missing or not found.")
    with open(audio_path, "rb") as f:
        # Pass (filename, bytes) so the API can infer the audio format from the name.
        resp = client.audio.transcriptions.create(
            file=(Path(audio_path).name, f.read()),
            model=model,
            response_format="verbose_json",
        )
    return (getattr(resp, "text", "") or "").strip()

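
# A minimal usage sketch (assumes a local file "question.wav" exists; not executed on import):
#   text = transcribe_audio("question.wav")
#   print(text)
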
def stream_answer(prompt_text: str,
                  model: str = "llama-3.1-8b-instant",
                  temperature: float = 0.3):
    if not prompt_text.strip():
        raise ValueError("Empty prompt for the LLM.")
    stream = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "Answer clearly and concisely."},
            {"role": "user", "content": prompt_text},
        ],
        temperature=temperature,
        max_completion_tokens=1024,
        top_p=1,
        stream=True,
    )
    acc = []
    for chunk in stream:
        # Some chunks (e.g. the final one) may carry no choices or an empty delta.
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta.content or ""
        if delta:
            acc.append(delta)
            yield "".join(acc)
    # Yield the full text once more so callers that only keep the last value get everything.
    yield "".join(acc)

def text_to_speech(text: str,
                   voice: str = "Calum-PlayAI",
                   model: str = "playai-tts",
                   fmt: str = "wav") -> str:
    if not text.strip():
        raise ValueError("Empty text for TTS.")
    # Truncate long answers so the TTS input stays within the model's limits.
    tts_input = text[:1200]
    resp = client.audio.speech.create(
        model=model,
        voice=voice,
        response_format=fmt,
        input=tts_input,
    )
    out_path = os.path.join(tempfile.gettempdir(), f"answer_{int(time.time())}.{fmt}")
    # The Groq SDK returns a BinaryAPIResponse; write_to_file saves the audio bytes.
    resp.write_to_file(out_path)
    return out_path

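
# A minimal usage sketch (assumes the playai-tts model is enabled for your account):
#   wav_path = text_to_speech("Hello from Groq.")
#   print(wav_path)
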
def run_pipeline(audio_file, typed_question, llm_model, voice_name):
    """Gradio generator: yields (transcript, answer, audio_path, status) tuples."""
    transcript = ""
    answer = ""
    try:
        if typed_question and typed_question.strip():
            transcript = typed_question.strip()
            status = "Using typed question."
            yield transcript, answer, None, status
        else:
            if not audio_file:
                raise RuntimeError("Provide a recording or type a question.")
            status = "Transcribing audio..."
            yield transcript, answer, None, status
            transcript = transcribe_audio(audio_file)
            if not transcript:
                raise RuntimeError("No text returned by transcription.")
            status = "Transcription done."
            yield transcript, answer, None, status

        status = "Generating answer..."
        for partial in stream_answer(transcript, model=llm_model):
            answer = partial
            yield transcript, answer, None, status
        if not answer.strip():
            raise RuntimeError("No text returned by the LLM.")

        status = "Converting answer to speech..."
        yield transcript, answer, None, status
        audio_out = text_to_speech(answer, voice=voice_name)

        status = "Done."
        yield transcript, answer, audio_out, status
    except Exception as e:
        err = "Error: " + str(e)
        short_tb = "\n".join(traceback.format_exc().splitlines()[-6:])
        help_tip = (
            "\nTips:\n"
            "- Check the Space secret 'groq_api_key'.\n"
            "- Try a shorter audio clip.\n"
            "- Verify model names.\n"
            "- Confirm requirements are installed."
        )
        yield transcript, answer, None, err + "\n" + short_tb + help_tip

with gr.Blocks(title="Audio Q&A with Groq") as demo:
    gr.Markdown("# Audio Q&A with Groq")
    gr.Markdown("One audio or typed question in, one answer out, plus speech.")
    with gr.Row():
        audio_in = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="Question audio",
        )
        typed_in = gr.Textbox(label="Or type your question", placeholder="Optional")
    with gr.Row():
        # Model availability changes over time; check Groq's model list if one errors out.
        llm_model = gr.Dropdown(
            choices=[
                "llama-3.1-8b-instant",
                "llama-3.3-70b-versatile",
                "llama3-8b-8192",
            ],
            value="llama-3.1-8b-instant",
            label="LLM model",
        )
        voice_name = gr.Textbox(value="Calum-PlayAI", label="TTS voice")
    ask_btn = gr.Button("Run")
    clear_btn = gr.Button("Clear")
    transcript_box = gr.Textbox(label="Transcription", interactive=False, lines=4)
    answer_box = gr.Textbox(label="Answer", interactive=False, lines=10)
    answer_audio = gr.Audio(label="Answer speech", interactive=False)
    status_md = gr.Markdown("")

    ask_btn.click(
        fn=run_pipeline,
        inputs=[audio_in, typed_in, llm_model, voice_name],
        outputs=[transcript_box, answer_box, answer_audio, status_md],
    )

    def clear_all():
        return "", "", None, ""

    clear_btn.click(
        fn=clear_all,
        inputs=None,
        outputs=[transcript_box, answer_box, answer_audio, status_md],
    )

if __name__ == "__main__":
    # On HF Spaces you can simply call demo.launch().
    # queue() enables generator streaming without extra args in Gradio v4.
    demo.queue().launch()
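
# A suggested requirements.txt for this Space (a sketch; pins are assumptions, adjust as needed):
#   gradio>=4.0
#   groq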