# audio-to-audio / app.py
import os, tempfile, time, traceback
from pathlib import Path
import gradio as gr
from groq import Groq
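# Pipeline overview: record or type a question -> Whisper transcription on Groq
# -> streamed LLM answer -> PlayAI text-to-speech, all wired into a Gradio UI.
# Assumed requirements.txt for this Space (not shown here): gradio, groq.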
# Read secret from HF Spaces. Support both "groq_api_key" and "GROQ_API_KEY".
def _load_key() -> str:
    key = os.environ.get("GROQ_API_KEY") or os.environ.get("groq_api_key")
    if not key:
        raise RuntimeError(
            "Groq API key not found. In your Space settings -> Secrets, add 'groq_api_key'."
        )
    os.environ["GROQ_API_KEY"] = key
    return key
client = Groq(api_key=_load_key())
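# Step 1: speech-to-text. The Groq transcription endpoint takes a (filename, bytes)
# tuple; with "verbose_json" the transcript text is read from the response's .text field.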
def transcribe_audio(audio_path: str, model: str = "whisper-large-v3") -> str:
    if not audio_path or not Path(audio_path).exists():
        raise FileNotFoundError("Audio file path is missing or not found.")
    with open(audio_path, "rb") as f:
        resp = client.audio.transcriptions.create(
            file=(Path(audio_path).name, f.read()),
            model=model,
            response_format="verbose_json",
        )
    return (getattr(resp, "text", "") or "").strip()
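# Step 2: the LLM call. Streaming yields the accumulated answer after every delta,
# so the caller can push partial text to the UI instead of waiting for the full completion.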
def stream_answer(prompt_text: str,
                  model: str = "llama-3.1-8b-instant",
                  temperature: float = 0.3):
    if not prompt_text.strip():
        raise ValueError("Empty prompt for the LLM.")
    stream = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "Answer clearly and concisely."},
            {"role": "user", "content": prompt_text},
        ],
        temperature=temperature,
        max_completion_tokens=1024,
        top_p=1,
        stream=True,
    )
    acc = []
    for chunk in stream:
        delta = chunk.choices[0].delta.content or ""
        if delta:
            acc.append(delta)
            yield "".join(acc)
    yield "".join(acc)
def text_to_speech(text: str,
                   voice: str = "Calum-PlayAI",
                   model: str = "playai-tts",
                   fmt: str = "wav") -> str:
    if not text.strip():
        raise ValueError("Empty text for TTS.")
    tts_input = text[:1200]
    resp = client.audio.speech.create(
        model=model,
        voice=voice,
        response_format=fmt,
        input=tts_input,
    )
    out_path = os.path.join(tempfile.gettempdir(), f"answer_{int(time.time())}.{fmt}")
    # BinaryAPIResponse uses write_to_file in Groq SDK
    resp.write_to_file(out_path)
    return out_path
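# Step 4: the end-to-end pipeline. This is a generator: every yield is a
# (transcript, answer, audio_path, status) tuple matching the four Gradio outputs,
# so intermediate progress is streamed to the UI as it happens.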
def run_pipeline(audio_file, typed_question, llm_model, voice_name):
    transcript = ""
    answer = ""
    try:
        if typed_question and typed_question.strip():
            transcript = typed_question.strip()
            status = "Using typed question."
        else:
            if not audio_file:
                raise RuntimeError("Provide a recording or type a question.")
            status = "Transcribing audio..."
            yield transcript, answer, None, status
            transcript = transcribe_audio(audio_file)
            if not transcript:
                raise RuntimeError("No text returned by transcription.")
            status = "Transcription done."
        yield transcript, answer, None, status

        status = "Generating answer..."
        partial = ""
        for partial in stream_answer(transcript, model=llm_model):
            answer = partial
            yield transcript, answer, None, status
        if not answer.strip():
            raise RuntimeError("No text returned by the LLM.")

        status = "Converting answer to speech..."
        yield transcript, answer, None, status
        audio_out = text_to_speech(answer, voice=voice_name)

        status = "Done."
        yield transcript, answer, audio_out, status
    except Exception as e:
        err = "Error: " + str(e)
        short_tb = "\n".join(traceback.format_exc().splitlines()[-6:])
        help_tip = (
            "\nTips:\n"
            "- Check Space secret 'groq_api_key'.\n"
            "- Try a shorter audio clip.\n"
            "- Verify model names.\n"
            "- Confirm requirements installed."
        )
        yield transcript, answer, None, err + "\n" + short_tb + help_tip
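# Gradio UI: inputs on top (audio or typed question, model and voice pickers),
# outputs below (transcript, answer text, synthesized speech, status line).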
with gr.Blocks(title="Audio Q&A with Groq") as demo:
    gr.Markdown("# Audio Q&A with Groq")
    gr.Markdown("One audio or typed question in, one answer out, plus speech.")

    with gr.Row():
        audio_in = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="Question audio"
        )
        typed_in = gr.Textbox(label="Or type your question", placeholder="Optional")

    with gr.Row():
        llm_model = gr.Dropdown(
            choices=[
                "llama-3.1-8b-instant",
                "llama-3.1-70b-versatile",
                "llama3-8b-8192",
            ],
            value="llama-3.1-8b-instant",
            label="LLM model"
        )
        voice_name = gr.Textbox(value="Calum-PlayAI", label="TTS voice")

    ask_btn = gr.Button("Run")
    clear_btn = gr.Button("Clear")

    transcript_box = gr.Textbox(label="Transcription", interactive=False, lines=4)
    answer_box = gr.Textbox(label="Answer", interactive=False, lines=10)
    answer_audio = gr.Audio(label="Answer speech", interactive=False)
    status_md = gr.Markdown("")
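    # The four outputs below map positionally to the tuples yielded by run_pipeline.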
    ask_btn.click(
        fn=run_pipeline,
        inputs=[audio_in, typed_in, llm_model, voice_name],
        outputs=[transcript_box, answer_box, answer_audio, status_md]
    )

    def clear_all():
        return "", "", None, ""

    clear_btn.click(fn=clear_all, inputs=None, outputs=[transcript_box, answer_box, answer_audio, status_md])
if __name__ == "__main__":
    # On HF Spaces you can simply do demo.launch()
    # Queue enables generator streaming without extra args in Gradio v4
    demo.queue().launch()