Spaces:
Sleeping
Sleeping
File size: 3,543 Bytes
1a7973f 5eb99ac ec574b7 5eb99ac 97efaa3 7807f29 13fea48 5eb99ac 7807f29 c1a321e 7807f29 fac92f6 460a57d 97efaa3 460a57d d6da658 7807f29 d9811eb 7807f29 ec800df 7807f29 fb2ccae ec800df 7807f29 d9811eb 7807f29 c1a321e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
import gradio as gr
from transformers import pipeline,WhisperProcessor, WhisperForConditionalGeneration
import torch
import librosa
import datasets
from transformers.pipelines.pt_utils import KeyDataset
from tqdm.auto import tqdm
import logging
import time
import uuid
import soundfile as sf
from model import get_pretrained_model, language_to_models
# Demo: interpret an input image as text, then turn that text into speech that can be played back.
# Text-to-speech code adapted from https://huggingface.co/spaces/k2-fsa/text-to-speech/blob/main/app.py
# Image-classification pipeline; its top label serves as the text
# interpretation of an image.
image_to_text_model = pipeline(
    task="image-classification",
    model="microsoft/beit-base-patch16-224-pt22k-ft22k",
)
def build_html_output(s: str, style: str = "result_item_success"):
    """Wrap *s* in the nested ``result`` / ``result_item`` HTML containers.

    Args:
        s: Content placed inside the inner ``<div>``. It is inserted as-is
            (no escaping), so it may itself contain HTML markup.
        style: Extra CSS class added next to ``result_item`` on the inner
            ``<div>``.

    Returns:
        The HTML snippet as a string, starting and ending with a newline.
    """
    html_lines = (
        "",  # the snippet opens with a newline
        "<div class='result'>",
        f"<div class='result_item {style}'>",
        f"{s}",
        "</div>",
        "</div>",
        "",  # ...and closes with one
    )
    return "\n".join(html_lines)
def image_to_text(input_image):
    """Convert an image to text by returning its top classification label.

    Args:
        input_image: Image handed straight to ``image_to_text_model``.

    Returns:
        The ``'label'`` field of the first prediction returned by the model.
    """
    predictions = image_to_text_model(input_image)
    top_label = predictions[0]['label']
    # Echo the label to stdout so it shows up in the application logs.
    print(top_label)
    return top_label
def text_to_speech(language: str, repo_id: str, text: str, sid: str, speed: float):
    """Synthesize *text* into a WAV file and report timing statistics.

    Args:
        language: Not used inside this function's body.
        repo_id: Model identifier passed to ``get_pretrained_model``.
        text: Text to synthesize.
        sid: Speaker ID; converted with ``int()`` before generation.
        speed: Speed value passed to ``get_pretrained_model``.

    Returns:
        A ``(filename, html)`` pair: the name of the newly written
        ``<uuid4>.wav`` file and an HTML snippet (built with
        ``build_html_output``) showing wave duration, processing time and
        their ratio (RTF).

    Raises:
        ValueError: If the generated audio contains no samples, or if
            ``sid`` cannot be converted to ``int``.
    """
    logging.info(f"Input text: {text}. sid: {sid}, speed: {speed}")

    speaker_id = int(sid)
    tts = get_pretrained_model(repo_id, speed)

    # Time only the synthesis call itself.
    started_at = time.time()
    audio = tts.generate(text, sid=speaker_id)
    finished_at = time.time()

    sample_count = len(audio.samples)
    if sample_count == 0:
        raise ValueError(
            "Error in generating audios. Please read previous error messages."
        )

    wave_seconds = sample_count / audio.sample_rate
    processing_seconds = finished_at - started_at
    # RTF = processing time divided by the duration of the generated audio.
    rtf = processing_seconds / wave_seconds

    info = (
        "\n"
        f"Wave duration : {wave_seconds:.3f} s <br/>\n"
        f"Processing time: {processing_seconds:.3f} s <br/>\n"
        f"RTF: {processing_seconds:.3f}/{wave_seconds:.3f} = {rtf:.3f} <br/>\n"
    )
    logging.info(info)
    logging.info(
        f"\nrepo_id: {repo_id}\ntext: {text}\nsid: {speaker_id}\nspeed: {speed}"
    )

    # A random UUID keeps output files of separate requests from colliding.
    wav_path = f"{uuid.uuid4()}.wav"
    sf.write(
        wav_path,
        audio.samples,
        samplerate=audio.sample_rate,
        subtype="PCM_16",
    )

    return wav_path, build_html_output(info)
# Keys of ``language_to_models``; the first language and the first model
# listed for it are the ones used for synthesis.
language_choices = list(language_to_models.keys())


def _image_to_speech(input_image, sid: str, speed: float):
    """Classify an image and speak the resulting label.

    This is the event handler behind the UI button. Gradio components only
    carry user values while an event is being handled, so the classification
    and synthesis must run here rather than while the layout is being built.

    Args:
        input_image: Value of the ``gr.Image`` input, or ``None`` when no
            image was provided.
        sid: Speaker ID text from the UI.
        speed: Value of the speed slider.

    Returns:
        ``(label, wav_filename, info_html)``, one value per output component.

    Raises:
        gr.Error: If no image was provided.
    """
    if input_image is None:
        raise gr.Error("Please provide an image first.")

    label = image_to_text(input_image)
    language = language_choices[0]
    repo_id = language_to_models[language][0]
    wav_filename, info_html = text_to_speech(language, repo_id, label, sid, speed)
    return label, wav_filename, info_html


demo = gr.Blocks(title="Image to Text Interpretation")

with demo:
    gr.Markdown("# Image to Text Interpretation\nimage to audio demo")

    # --- inputs ---
    inputsImg = gr.Image(type='pil')
    input_sid = gr.Textbox(
        label="Speaker ID",
        info="Speaker ID",
        lines=1,
        max_lines=1,
        value="0",
        placeholder="Speaker ID. Valid only for mult-speaker model",
    )
    input_speed = gr.Slider(
        minimum=0.1,
        maximum=10,
        value=1,
        step=0.1,
        label="Speed (larger->faster; smaller->slower)",
    )
    run_button = gr.Button("Interpret and speak")

    # --- outputs ---
    # One component of each kind is enough: the classifier yields a single
    # label, which is synthesized into a single audio clip.
    output_txt = gr.Textbox(
        label="Interpretation",
        lines=1,
        max_lines=1,
        placeholder="Interpretation",
    )
    output_audio = gr.Audio(label="Output")
    output_info = gr.HTML(label="Info")

    # Run the pipeline only when the user asks for it, using the values the
    # components hold at that moment.
    run_button.click(
        fn=_image_to_speech,
        inputs=[inputsImg, input_sid, input_speed],
        outputs=[output_txt, output_audio, output_info],
    )

demo.launch()