SandraCLV committed on
Commit
7807f29
·
1 Parent(s): 6a3e4b6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -6
app.py CHANGED
@@ -5,9 +5,23 @@ import librosa
5
  import datasets
6
  from transformers.pipelines.pt_utils import KeyDataset
7
  from tqdm.auto import tqdm
 
 
 
 
 
8
 
 
9
  image_to_text_model = pipeline("image-classification",model="microsoft/beit-base-patch16-224-pt22k-ft22k")
10
 
 
 
 
 
 
 
 
 
11
 
12
  def image_to_text(input_image):
13
  # Convertir la imagen a texto
@@ -16,10 +30,76 @@ def image_to_text(input_image):
16
  #texts = transcriber(text_output)
17
  return text_output
18
 
19
- gr.Interface(fn=image_to_text,
20
- title="Image to Text",
21
- inputs=gr.Image(type='pil'),
22
- outputs=[gr.Textbox(label="Output")],
23
- description="Object Recognition using Microsoft BEIT",
24
- article = "Author: <a href=\"https://huggingface.co/rowel\">Rowel Atienza</a>",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  ).launch()
 
5
  import datasets
6
  from transformers.pipelines.pt_utils import KeyDataset
7
  from tqdm.auto import tqdm
8
+ import logging
9
+ import time
10
+ import uuid
11
+ import soundfile as sf
12
+ from model import get_pretrained_model, language_to_models
13
 
14
+ #text to speech code from https://huggingface.co/spaces/k2-fsa/text-to-speech/blob/main/app.py
15
  image_to_text_model = pipeline("image-classification",model="microsoft/beit-base-patch16-224-pt22k-ft22k")
16
 
17
def build_html_output(s: str, style: str = "result_item_success") -> str:
    """Wrap ``s`` in the nested ``result`` / ``result_item`` HTML containers.

    Args:
        s: Content placed inside the inner ``<div>``. It is inserted verbatim
            (no escaping), so it may itself contain HTML markup.
        style: Extra CSS class added next to ``result_item`` on the inner
            ``<div>``.

    Returns:
        The HTML fragment as a string.
    """
    return f"""
    <div class='result'>
        <div class='result_item {style}'>
          {s}
        </div>
    </div>
    """
25
 
26
  def image_to_text(input_image):
27
  # Convertir la imagen a texto
 
30
  #texts = transcriber(text_output)
31
  return text_output
32
 
33
def text_to_speech(language: str, repo_id: str, text: str, sid: str, speed: float):
    """Synthesize ``text`` into a WAV file and report timing statistics.

    Args:
        language: Accepted for interface compatibility; this function does
            not read it.
        repo_id: Passed to ``get_pretrained_model`` to obtain the TTS model.
        text: Text to synthesize.
        sid: Speaker ID; converted with ``int()`` before generation.
        speed: Passed to ``get_pretrained_model`` together with ``repo_id``.

    Returns:
        A ``(filename, html)`` tuple: the name of the freshly written
        ``<uuid4>.wav`` file (relative to the current working directory) and
        an HTML fragment with duration, processing time and real-time factor.

    Raises:
        ValueError: If ``sid`` cannot be converted to ``int``, or if the
            model produced no audio samples.
    """
    logging.info("Input text: %s. sid: %s, speed: %s", text, sid, speed)
    sid = int(sid)
    tts = get_pretrained_model(repo_id, speed)

    # perf_counter is monotonic, so the measured interval cannot be skewed
    # (or go negative) if the system clock is adjusted during generation.
    start = time.perf_counter()
    audio = tts.generate(text, sid=sid)
    elapsed_seconds = time.perf_counter() - start

    if len(audio.samples) == 0:
        raise ValueError(
            "Error in generating audios. Please read previous error messages."
        )

    duration = len(audio.samples) / audio.sample_rate
    # Real-time factor: processing time relative to the audio length.
    rtf = elapsed_seconds / duration

    info = f"""
    Wave duration : {duration:.3f} s <br/>
    Processing time: {elapsed_seconds:.3f} s <br/>
    RTF: {elapsed_seconds:.3f}/{duration:.3f} = {rtf:.3f} <br/>
    """

    logging.info(info)
    logging.info(
        "\nrepo_id: %s\ntext: %s\nsid: %s\nspeed: %s", repo_id, text, sid, speed
    )

    # A random UUID keeps output files of separate requests from colliding.
    filename = f"{uuid.uuid4()}.wav"
    sf.write(
        filename,
        audio.samples,
        samplerate=audio.sample_rate,
        subtype="PCM_16",
    )

    return filename, build_html_output(info)
71
+
72
# Languages offered by the TTS backend; the first one (and its first model)
# is used as the fixed voice for this demo, as in the original call.
language_choices = list(language_to_models.keys())


def _image_to_speech(input_image, sid: str, speed: float):
    """Interpret an image as text, then synthesize that text as speech.

    Args:
        input_image: Image delivered by the ``gr.Image`` input component.
        sid: Speaker ID as entered in the "Speaker ID" textbox.
        speed: Value of the speed slider.

    Returns:
        A ``(text, audio_filename, info_html)`` tuple matching the three
        output components wired up below.
    """
    text_output = image_to_text(input_image)
    # NOTE(review): the return type of image_to_text is not visible in this
    # chunk. A plain string is used as-is; any other iterable is joined
    # element-wise. Confirm against image_to_text and adjust if it returns
    # the raw pipeline result (a list of dicts).
    if isinstance(text_output, str):
        text = text_output
    else:
        text = ", ".join(str(item) for item in text_output)

    language = language_choices[0]
    repo_id = language_to_models[language][0]
    audio_filename, info_html = text_to_speech(language, repo_id, text, sid, speed)
    return text, audio_filename, info_html


demo = gr.Blocks(title="Image to Text Interpretation")

with demo:
    gr.Markdown("# Image to Text Interpretation\nimage to audio demo")

    inputsImg = gr.Image(type='pil')
    input_sid = gr.Textbox(
        label="Speaker ID",
        info="Speaker ID",
        lines=1,
        max_lines=1,
        value="0",
        placeholder="Speaker ID. Valid only for mult-speaker model",
    )
    input_speed = gr.Slider(
        minimum=0.1,
        maximum=10,
        value=1,
        step=0.1,
        label="Speed (larger->faster; smaller->slower)",
    )
    run_button = gr.Button("Submit")

    output_txt = gr.Textbox(
        label="Interpretation",
        lines=1,
        max_lines=1,
        placeholder="Interpretation",
    )
    output_audio = gr.Audio(label="Output")
    output_info = gr.HTML(label="Info")

    # The models run only when the user submits an image, on the actual
    # component values, not at UI construction time on the components
    # themselves.
    run_button.click(
        fn=_image_to_speech,
        inputs=[inputsImg, input_sid, input_speed],
        outputs=[output_txt, output_audio, output_info],
    )

demo.launch()