SandraCLV committed on
Commit
66b1e3d
·
1 Parent(s): 758ce80

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -11
app.py CHANGED
@@ -9,6 +9,7 @@ import logging
9
  import time
10
  import uuid
11
  import soundfile as sf
 
12
  from model import get_pretrained_model, language_to_models
13
  # demo for a input given image transform into text interpretation, and those text put a speech text to be played
14
 
@@ -23,14 +24,6 @@ def build_html_output(s: str, style: str = "result_item_success"):
23
  </div>
24
  </div>
25
  """
26
-
27
- def image_to_text(input_image):
28
- # Convertir la imagen a texto
29
- text_output = image_to_text_model(input_image)[0]['label']
30
- print(text_output)
31
- #texts = transcriber(text_output)
32
- return text_output
33
-
34
  def text_to_speech(language: str, repo_id: str, text: str, sid: str, speed: float):
35
  logging.info(f"Input text: {text}. sid: {sid}, speed: {speed}")
36
  sid = int(sid)
@@ -76,8 +69,9 @@ with demo:
76
  language_choices = list(language_to_models.keys())
77
  inputsImg=gr.Image(type='pil')
78
  idx=0
79
- for txt in image_to_text(inputsImg):
80
- output_txt[idx] = gr.Textbox(label=txt,lines=1,max_lines=1,value=txt,placeholder="Interpretation")
 
81
  input_sid = gr.Textbox(
82
  label="Speaker ID",
83
  info="Speaker ID",
@@ -91,7 +85,7 @@ with demo:
91
  value=1,
92
  step=0.1,
93
  label="Speed (larger->faster; smaller->slower)")
94
- text_to_speech(language_choices[0],language_to_models[language_choices[0]][0],txt,input_sid,input_speed)
95
  output_audio[idx] = gr.Audio(label="Output")
96
  output_info[idx] = gr.HTML(label="Info")
97
  idx=idx+1
 
9
  import time
10
  import uuid
11
  import soundfile as sf
12
+ # model.py apache license 2.0 Copyright 2022-2023 Xiaomi Corp. (authors: Fangjun Kuang)
13
  from model import get_pretrained_model, language_to_models
14
  # demo for a input given image transform into text interpretation, and those text put a speech text to be played
15
 
 
24
  </div>
25
  </div>
26
  """
 
 
 
 
 
 
 
 
27
  def text_to_speech(language: str, repo_id: str, text: str, sid: str, speed: float):
28
  logging.info(f"Input text: {text}. sid: {sid}, speed: {speed}")
29
  sid = int(sid)
 
69
  language_choices = list(language_to_models.keys())
70
  inputsImg=gr.Image(type='pil')
71
  idx=0
72
+ for txt in inputsImg:
73
+ text_output = image_to_text_model(txt)[0]['label']
74
+ output_txt[idx] = gr.Textbox(label=text_output,lines=1,max_lines=1,value=text_output,placeholder="Interpretation")
75
  input_sid = gr.Textbox(
76
  label="Speaker ID",
77
  info="Speaker ID",
 
85
  value=1,
86
  step=0.1,
87
  label="Speed (larger->faster; smaller->slower)")
88
+ text_to_speech(language_choices[0],language_to_models[language_choices[0]][0],text_output,input_sid,input_speed)
89
  output_audio[idx] = gr.Audio(label="Output")
90
  output_info[idx] = gr.HTML(label="Info")
91
  idx=idx+1