Audio_to_image / app.py
sk007msd's picture
Update app.py
703b224 verified
import os
import time
import requests
import gradio as gr
from huggingface_hub import login
from dotenv import load_dotenv
# Load environment variables through python-dotenv (typically from a local
# .env file) before anything reads the environment.
load_dotenv()

# Hugging Face access token taken from the environment. It is used both to
# log in to the hub and to build the Authorization header sent with every
# inference request below.
HF_TOKEN = os.environ.get("HF_TOKEN")
login(HF_TOKEN)
headers = {"Authorization": f"Bearer {HF_TOKEN}"}

# Hosted inference endpoints, listed in the order the pipeline uses them:
# speech-to-text, Tamil-to-English translation, text-to-image.
whisper_API = "https://api-inference.huggingface.co/models/openai/whisper-large-v3"
translation_API = "https://api-inference.huggingface.co/models/facebook/nllb-200-distilled-1.3B"
image_API = "https://api-inference.huggingface.co/models/stabilityai/stable-diffusion-2"
def query_whisper(audio_path):
    """Transcribe an audio file with the hosted Whisper endpoint.

    The file at ``audio_path`` is read in binary mode and its bytes are
    posted as the request body to ``whisper_API`` using the shared
    ``headers``.

    Returns:
        The JSON-decoded response body, whatever the endpoint sent back.
        No status check is performed here; callers inspect the payload.
    """
    with open(audio_path, "rb") as audio_file:
        audio_bytes = audio_file.read()
    return requests.post(whisper_API, headers=headers, data=audio_bytes).json()
def query_translation(text):
    """Translate Tamil text to English with the hosted NLLB-200 endpoint.

    The request is attempted up to ``max_retries`` times, pausing ``delay``
    seconds between attempts (presumably to ride out model warm-up on the
    hosted endpoint -- confirm against the API's behaviour).

    Args:
        text: The Tamil text to translate.

    Returns:
        ``{"translated_text": <str>}`` on success, otherwise
        ``{"error": "Translation API failed", "response": <last response>}``
        where the last response is the decoded JSON body, or the raw body
        text when it was not valid JSON.
    """
    max_retries = 5
    delay = 10
    payload = {
        "inputs": text,
        # NLLB-200 uses FLORES-200 language codes; Tamil is "tam_Taml"
        # (the previous "ta_Taml" is not a valid code).
        "parameters": {"src_lang": "tam_Taml", "tgt_lang": "eng_Latn"},
    }
    result = None
    for attempt in range(max_retries):
        response = requests.post(translation_API, headers=headers, json=payload)
        try:
            result = response.json()
        except ValueError:
            # Non-JSON body (e.g. an HTML error page): treat it as a failed
            # attempt and retry instead of aborting the whole loop.
            result = response.text
        else:
            # The endpoint may answer with a list of result objects or with
            # a single object; look at the first/only one.
            candidate = result[0] if isinstance(result, list) and result else result
            if isinstance(candidate, dict) and "translation_text" in candidate:
                return {"translated_text": candidate["translation_text"]}
        # Only wait when another attempt will actually follow.
        if attempt < max_retries - 1:
            time.sleep(delay)
    return {"error": "Translation API failed", "response": result}
def query_image(prompt):
    """Generate an image for ``prompt`` with the hosted Stable Diffusion endpoint.

    The request is attempted up to ``max_retries`` times, pausing ``delay``
    seconds between attempts. An attempt counts as successful only when the
    response has status 200 and an ``image/*`` content type.

    Args:
        prompt: Text prompt sent as the ``inputs`` field of the request.

    Returns:
        The path of the written image file on success, or ``None`` when
        every attempt failed.
    """
    max_retries = 5
    delay = 10
    payload = {"inputs": prompt}
    # NOTE: fixed output path -- each successful call overwrites the
    # previous image.
    image_path = "generated_image.png"
    for attempt in range(max_retries):
        response = requests.post(image_API, headers=headers, json=payload)
        content_type = response.headers.get("Content-Type", "")
        if response.status_code == 200 and content_type.startswith("image/"):
            with open(image_path, "wb") as f:
                f.write(response.content)
            return image_path
        # Only wait when another attempt will actually follow; sleeping
        # after the last failure just delays the error by ``delay`` seconds.
        if attempt < max_retries - 1:
            time.sleep(delay)
    return None
def process_audio(audio_path):
    """Run the full pipeline: transcribe, translate, then generate an image.

    Every return path yields a 3-tuple so that it lines up with the three
    output components of the interface (Tamil text, English translation,
    generated image).

    Args:
        audio_path: Path of the uploaded audio file; falsy when nothing was
            uploaded.

    Returns:
        ``(tamil_text, translated_text, image_path)``. When a stage fails,
        the pipeline stops there: the failing stage's slot carries a
        placeholder message and later slots are ``None``. On an unexpected
        exception the result is ``(None, <exception message>, None)``.
    """
    if not audio_path:
        # One value per output component, matching the other return paths.
        return None, None, None
    try:
        transcription = query_whisper(audio_path)
        tamil_text = transcription.get("text")
        if tamil_text is None:
            # Do not translate or illustrate the placeholder itself.
            return "Transcription error", None, None

        translation = query_translation(tamil_text)
        translated_text = translation.get("translated_text")
        if translated_text is None:
            # Do not use the placeholder as an image prompt.
            return tamil_text, "Translation error", None

        image_path = query_image(translated_text)
        return tamil_text, translated_text, image_path
    except Exception as e:
        # Top-level UI boundary: surface the message in the translation box
        # rather than crashing the request.
        return None, str(e), None
# Gradio UI: one audio upload (handed to process_audio as a file path) and
# three outputs, in the same order as the tuple process_audio returns.
iface = gr.Interface(
    title="Speech-to-Image Generation",
    fn=process_audio,
    inputs=gr.Audio(label="Upload Audio", type="filepath"),
    outputs=[
        gr.Textbox(label="Tamil Text"),
        gr.Textbox(label="English Translation"),
        gr.Image(label="Generated Image"),
    ],
)

# Start the server at import/run time: listen on 0.0.0.0, request a share
# link, and leave the port choice to Gradio (server_port=None).
iface.launch(server_name="0.0.0.0", server_port=None, share=True)