"""Speech-to-image pipeline: Tamil audio -> transcript -> English -> image.

Chains three Hugging Face Inference API models — Whisper (ASR),
NLLB-200 (Tamil->English translation) and Stable Diffusion 2 (image
generation) — behind a Gradio interface.
"""
import os
import time

import gradio as gr
import requests
from dotenv import load_dotenv
from huggingface_hub import login

load_dotenv()

# Hugging Face Inference API endpoints, one per pipeline stage.
image_API = "https://api-inference.huggingface.co/models/stabilityai/stable-diffusion-2"
translation_API = "https://api-inference.huggingface.co/models/facebook/nllb-200-distilled-1.3B"
whisper_API = "https://api-inference.huggingface.co/models/openai/whisper-large-v3"

HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    # Fail fast with a clear message instead of an opaque auth failure later.
    raise RuntimeError("HF_TOKEN environment variable is not set")
login(HF_TOKEN)
headers = {"Authorization": f"Bearer {HF_TOKEN}"}

# Inference endpoints can be slow on cold start; cap each HTTP call so a
# stalled endpoint cannot hang the app forever.
REQUEST_TIMEOUT = 60


def query_whisper(audio_path):
    """Transcribe the audio file at *audio_path* with the Whisper endpoint.

    Returns the decoded JSON response: a dict with a "text" key on
    success, or an "error" key when the API reports a failure.
    """
    with open(audio_path, "rb") as audio:
        data = audio.read()
    response = requests.post(
        whisper_API, headers=headers, data=data, timeout=REQUEST_TIMEOUT
    )
    return response.json()


def query_translation(text):
    """Translate Tamil *text* to English via NLLB-200, retrying while warm-up.

    The Inference API returns a non-result payload until the model is
    loaded, so we poll up to ``max_retries`` times with a fixed delay.
    Returns ``{"translated_text": ...}`` on success, or
    ``{"error": ..., "response": ...}`` after exhausting retries.
    """
    max_retries = 5
    delay = 10
    # NLLB-200 uses FLORES-200 language codes: Tamil is "tam_Taml".
    # (The previous "ta_Taml" is not a valid code and the API rejects it.)
    payload = {
        "inputs": text,
        "parameters": {"src_lang": "tam_Taml", "tgt_lang": "eng_Latn"},
    }
    result = None
    for attempt in range(max_retries):
        response = requests.post(
            translation_API, headers=headers, json=payload, timeout=REQUEST_TIMEOUT
        )
        try:
            result = response.json()
        except ValueError:
            # Non-JSON body (e.g. a gateway error page) — record and retry.
            result = {"error": response.text}
        if isinstance(result, list) and result:
            return {"translated_text": result[0]["translation_text"]}
        if isinstance(result, dict) and "translation_text" in result:
            return {"translated_text": result["translation_text"]}
        time.sleep(delay)
    return {"error": "Translation API failed", "response": result}


def query_image(prompt):
    """Generate an image for *prompt* with Stable Diffusion 2.

    Retries while the model loads (non-200 or non-image responses).
    Returns the path of the saved PNG, or None if every attempt failed.
    """
    max_retries = 5
    delay = 10
    payload = {"inputs": prompt}
    for attempt in range(max_retries):
        response = requests.post(
            image_API, headers=headers, json=payload, timeout=REQUEST_TIMEOUT
        )
        content_type = response.headers.get("Content-Type", "")
        if response.status_code == 200 and content_type.startswith("image/"):
            image_path = "generated_image.png"
            with open(image_path, "wb") as f:
                f.write(response.content)
            return image_path
        time.sleep(delay)
    return None


def process_audio(audio_path):
    """Run the full pipeline: audio file -> (Tamil text, English text, image path).

    Returns a 3-tuple matching the three Gradio output components; on
    failure the error message is surfaced in the translation slot.
    """
    if not audio_path:
        # Gradio expects one value per output component, so return a
        # 3-tuple (the original returned a single None here).
        return None, None, None
    try:
        transcription = query_whisper(audio_path)
        tamil_text = transcription.get("text", "Transcription error")
        translation = query_translation(tamil_text)
        translated_text = translation.get("translated_text", "Translation error")
        image_path = query_image(translated_text)
        return tamil_text, translated_text, image_path
    except Exception as e:  # show the failure in the UI instead of crashing
        return None, str(e), None


iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath", label="Upload Audio"),
    outputs=[
        gr.Textbox(label="Tamil Text"),
        gr.Textbox(label="English Translation"),
        gr.Image(label="Generated Image"),
    ],
    title="Speech-to-Image Generation",
)

if __name__ == "__main__":
    iface.launch(share=True, server_name="0.0.0.0", server_port=None)