sk007msd committed
Commit faa30e5 · verified · 1 parent: f8cccad

Update app.py

Files changed (1): app.py (+39 -58)
app.py CHANGED
@@ -1,95 +1,76 @@
 import os
+import time
 import requests
 import gradio as gr
-import asyncio
-import aiohttp
 from huggingface_hub import login
 from dotenv import load_dotenv
 
 load_dotenv()
 image_API="https://api-inference.huggingface.co/models/stabilityai/stable-diffusion-2"
-translation_API="https://api-inference.huggingface.co/models/facebook/nllb-200-distilled-600M"
-whisper_API="https://api-inference.huggingface.co/models/openai/whisper-medium"
-txt_API="https://api-inference.huggingface.co/models/openai-community/gpt2"
-HF_TOKEN=os.getenv("HF_TOKEN")
+translation_API="https://api-inference.huggingface.co/models/facebook/nllb-200-distilled-1.3B"
+whisper_API="https://api-inference.huggingface.co/models/openai/whisper-large-v3"
+
+HF_TOKEN=os.getenv("HF_Token")
 login(HF_TOKEN)
 headers={"Authorization":f"Bearer {HF_TOKEN}"}
 
-async def query_api(url, payload=None, data=None):
-    async with aiohttp.ClientSession() as session:
-        for attempts in range(5):
-            try:
-                async with session.post(url, headers=headers, json=payload, data=data) as response:
-                    if response.status == 200:
-                        return await response.json()
-                print("API Error {response.status}, retrying....")
-            except Exception as e:
-                print(f"Error: {e}")
-            await asyncio.sleep(10)
-        return None
 
-async def query_whisper(audio_path):
+def query_whisper(audio_path):
     with open(audio_path,"rb") as audio:
         data=audio.read()
-    return await query_api(whisper_API,data=data)
+    response=requests.post(whisper_API,headers=headers,data=data)
+    return response.json()
 
-async def query_translation(text):
+def query_translation(text):
     max_retries=5
     delay=10
     payload={"inputs":text,"parameters":{"src_lang":"ta_Taml","tgt_lang":'eng_Latn'}}
-    response=await query_api(translation_API,json=payload)
-    if response and isinstance(response,list) and len(response)>0:
-        return response[0]["translation_text"]
-    elif response and "translation_text" in resopnse:
-        return response["translation_text"]
-    return "Translation Error"
+    for attempt in range(max_retries):
+        response=requests.post(translation_API,headers=headers,json=payload)
+        result= response.json()
+        if isinstance(result,list) and len(result)>0:
+            return {"translated_text":result[0]["translation_text"]}
+        elif isinstance(result,dict) and "translation_text" in result:
+            return {"translated_text":result["translation_text"]}
+        time.sleep(delay)
+    return {"error":"Translation API failed","response":result}
 
-async def query_image(prompt):
+def query_image(prompt):
+    max_retries=5
+    delay=10
     payload={"inputs":prompt}
-    response=await query_api(image_API,json=payload)
-    if response:
-        image_path='generated_image.png'
-        with open(image_path,'wb') as f:
-            f.write(response.content)
-        return image_path
-    return None
-
-async def query_text_generation(prompt):
-    payload={"inputs":f"give me a short story about {prompt}"}
-    response=await query_api(txt_API,json=payload)
-    if response and isinstance(response, list) and len(response) > 0:
-        return response[0].get("generated_text", "Text Generation Error")
-    elif response and "generated_text" in response:
-        return response["generated_text"]
+    for attempt in range(max_retries):
+        response=requests.post(image_API,headers=headers,json=payload)
+        if response.status_code == 200 and response.headers.get('Content-Type', '').startswith('image/'):
+            image_path='generated_image.png'
+            with open(image_path,'wb') as f:
+                f.write(response.content)
+            return image_path
+        time.sleep(delay)
     return None
 
 
-
-
-async def process_audio(audio_path):
+def process_audio(audio_path):
     if not audio_path:
-        return None,"Audio not provided",None,None
+        return None
     try:
-        transcription=await query_whisper(audio_path)
+        transcription=query_whisper(audio_path)
         tamil_text=transcription.get("text","Transcription error")
-
-        translation_task=query_translation(tamil_text)
-        image_task=query_image(translated_text)
-        story_task=query_text_generation(translated_text)
-
-        translated_text,image_path,story=await asyncio.gather(translation_task,image_task,story_task)
-        return tamil_text,translated_text,image_path,story
+        translation=query_translation(tamil_text)
+        translated_text=translation.get("translated_text","Translation error")
+        image_path=query_image(translated_text)
+        return tamil_text,translated_text,image_path
     except Exception as e:
         return None,str(e),None
 
 iface=gr.Interface(
-    fn=lambda x:asyncio.run(process_audio(x)),
+    fn=process_audio,
     inputs=gr.Audio(type="filepath",label="Upload Audio"),
     outputs=[
         gr.Textbox(label="Tamil Text"),
         gr.Textbox(label="English Translation"),
-        gr.Image(label="Generated Image"),
-        gr.Textbox(label="Story")
+        gr.Image(label="Generated Image")
+
     ],
     title="Speech-to-Image Generation"
 )
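
For reference, the new query_translation and query_image both repeat the same retry-on-failure pattern against the Hugging Face Inference API. A minimal sketch of that pattern as a shared helper, assuming the same HF_TOKEN environment variable and endpoint URLs as app.py; the helper name and defaults are illustrative and not part of this commit:

# Illustrative sketch (not part of the commit): the retry pattern used by the
# updated query_translation/query_image, factored into one helper.
import os
import time
import requests

HF_TOKEN = os.getenv("HF_TOKEN")  # assumes the token is exported under this name
headers = {"Authorization": f"Bearer {HF_TOKEN}"}

def query_with_retry(api_url, payload=None, data=None, max_retries=5, delay=10):
    """POST to an Inference API URL, retrying while the model warms up."""
    for attempt in range(max_retries):
        response = requests.post(api_url, headers=headers, json=payload, data=data)
        if response.status_code == 200:
            return response  # caller chooses response.json() or response.content
        time.sleep(delay)
    return None

With such a helper, query_translation would reduce to one call plus result parsing, and query_image to one call plus writing response.content to disk.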