# Speech-to-Image Generation Space: Tamil speech -> English translation -> generated image.
import os
import time
import requests
import gradio as gr
from huggingface_hub import login
from dotenv import load_dotenv
# Load environment variables (HF_TOKEN) from a local .env file, if present.
load_dotenv()

# Hosted Hugging Face Inference API endpoints for each pipeline stage.
image_API = "https://api-inference.huggingface.co/models/stabilityai/stable-diffusion-2"
translation_API = "https://api-inference.huggingface.co/models/facebook/nllb-200-distilled-1.3B"
whisper_API = "https://api-inference.huggingface.co/models/openai/whisper-large-v3"

HF_TOKEN = os.getenv("HF_TOKEN")
# Fail fast with a clear message: login(None) would otherwise fail obscurely
# (or try to prompt interactively, which hangs in a headless deployment).
if not HF_TOKEN:
    raise RuntimeError("HF_TOKEN environment variable is not set")
login(HF_TOKEN)
headers = {"Authorization": f"Bearer {HF_TOKEN}"}
def query_whisper(audio_path):
    """Transcribe an audio file with the hosted Whisper inference API.

    Parameters
    ----------
    audio_path : str
        Path to a local audio file, read in binary mode.

    Returns
    -------
    dict
        Parsed JSON response; on success it contains a ``"text"`` key.
    """
    with open(audio_path, "rb") as audio:
        data = audio.read()
    # timeout= keeps a stalled API from hanging the request indefinitely.
    response = requests.post(whisper_API, headers=headers, data=data, timeout=120)
    return response.json()
def query_translation(text):
    """Translate Tamil text to English with the NLLB-200 inference API.

    Retries up to ``max_retries`` times, sleeping ``delay`` seconds between
    attempts, because the hosted model may still be loading.

    Parameters
    ----------
    text : str
        Tamil source text.

    Returns
    -------
    dict
        ``{"translated_text": ...}`` on success, otherwise an ``"error"`` dict
        containing the last API response.
    """
    max_retries = 5
    delay = 10
    # NLLB-200 uses FLORES-200 language codes: Tamil is "tam_Taml"
    # (the original "ta_Taml" is not a valid code and the API rejects it).
    payload = {
        "inputs": text,
        "parameters": {"src_lang": "tam_Taml", "tgt_lang": "eng_Latn"},
    }
    result = None  # ensure the name is bound for the fallthrough return
    for _ in range(max_retries):
        response = requests.post(translation_API, headers=headers, json=payload, timeout=60)
        result = response.json()
        # The API returns either a list of translations or a single dict.
        if isinstance(result, list) and len(result) > 0:
            return {"translated_text": result[0]["translation_text"]}
        if isinstance(result, dict) and "translation_text" in result:
            return {"translated_text": result["translation_text"]}
        time.sleep(delay)
    return {"error": "Translation API failed", "response": result}
def query_image(prompt):
    """Generate an image from an English prompt with Stable Diffusion.

    Retries up to ``max_retries`` times while the hosted model warms up.

    Parameters
    ----------
    prompt : str
        English text prompt for the image model.

    Returns
    -------
    str or None
        Local path of the saved PNG on success, ``None`` if every attempt
        failed.
    """
    max_retries = 5
    delay = 10
    payload = {"inputs": prompt}
    for _ in range(max_retries):
        # timeout= keeps a stalled API from hanging the request indefinitely.
        response = requests.post(image_API, headers=headers, json=payload, timeout=120)
        # A successful generation returns raw image bytes; error responses
        # come back as JSON, so check the Content-Type as well as the status.
        if response.status_code == 200 and response.headers.get("Content-Type", "").startswith("image/"):
            image_path = "generated_image.png"
            with open(image_path, "wb") as f:
                f.write(response.content)
            return image_path
        time.sleep(delay)
    return None
def process_audio(audio_path):
    """Run the full pipeline: transcribe, translate, then generate an image.

    Parameters
    ----------
    audio_path : str or None
        Path to the uploaded audio file (Gradio ``type="filepath"``).

    Returns
    -------
    tuple
        ``(tamil_text, translated_text, image_path)``. On failure the error
        message is placed in the second slot.
    """
    if not audio_path:
        # The interface declares three outputs, so every return path must
        # yield three values (a bare `None` would crash Gradio's dispatch).
        return None, None, None
    try:
        transcription = query_whisper(audio_path)
        tamil_text = transcription.get("text", "Transcription error")
        translation = query_translation(tamil_text)
        translated_text = translation.get("translated_text", "Translation error")
        image_path = query_image(translated_text)
        return tamil_text, translated_text, image_path
    except Exception as e:
        return None, str(e), None
# Gradio UI: upload Tamil audio; display the transcription, the English
# translation, and the generated image.
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath", label="Upload Audio"),
    outputs=[
        gr.Textbox(label="Tamil Text"),
        gr.Textbox(label="English Translation"),
        gr.Image(label="Generated Image"),
    ],
    title="Speech-to-Image Generation",
)
# share=True publishes a temporary public link; binding 0.0.0.0 makes the
# server reachable from outside a container. (Removed the stray trailing
# "|" left over from the page scrape, which was a syntax error.)
iface.launch(share=True, server_name="0.0.0.0", server_port=None)