import gradio as gr
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
import torch
import librosa
import datasets
from transformers.pipelines.pt_utils import KeyDataset
from tqdm.auto import tqdm
import logging
import time
import uuid
import soundfile as sf
# model.py apache license 2.0 Copyright 2022-2023 Xiaomi Corp. (authors: Fangjun Kuang)
from model import get_pretrained_model, language_to_models
# Demo: a given input image is transformed into a text interpretation, and that text is
# then synthesized into speech that can be played back.
# Text-to-speech code from https://huggingface.co/spaces/k2-fsa/text-to-speech/blob/main/app.py
image_to_text_model = pipeline("image-classification", model="microsoft/beit-base-patch16-224-pt22k-ft22k")
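# Note: the image-classification pipeline returns a list of {"label", "score"} dicts,
# ranked by score; the demo below uses only the top label as the "interpretation".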
def build_html_output(s: str, style: str = "result_item_success"):
    return f"""
    <div class='result'>
      <div class='result_item {style}'>
        {s}
      </div>
    </div>
    """
def text_to_speech(language: str, repo_id: str, text: str, sid: str, speed: float):
    logging.info(f"Input text: {text}. sid: {sid}, speed: {speed}")
    sid = int(sid)
    tts = get_pretrained_model(repo_id, speed)

    # Synthesize audio and measure elapsed time to report the real-time factor (RTF).
    start = time.time()
    audio = tts.generate(text, sid=sid)
    end = time.time()

    if len(audio.samples) == 0:
        raise ValueError(
            "Error in generating audios. Please read previous error messages."
        )

    duration = len(audio.samples) / audio.sample_rate
    elapsed_seconds = end - start
    rtf = elapsed_seconds / duration

    info = f"""
    Wave duration  : {duration:.3f} s <br/>
    Processing time: {elapsed_seconds:.3f} s <br/>
    RTF: {elapsed_seconds:.3f}/{duration:.3f} = {rtf:.3f} <br/>
    """
    logging.info(info)
    logging.info(f"\nrepo_id: {repo_id}\ntext: {text}\nsid: {sid}\nspeed: {speed}")

    # Write the generated samples to a uniquely named 16-bit WAV file.
    filename = str(uuid.uuid4())
    filename = f"{filename}.wav"
    sf.write(
        filename,
        audio.samples,
        samplerate=audio.sample_rate,
        subtype="PCM_16",
    )

    return filename, build_html_output(info)
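
# Sketch of a direct call to text_to_speech, for reference only. The concrete language
# keys and repo ids come from model.py (the k2-fsa space) and are not hard-coded here.
#
#     first_language = list(language_to_models.keys())[0]
#     wav_path, html_info = text_to_speech(
#         first_language, language_to_models[first_language][0], "hello world", "0", 1.0
#     )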
# Wrap classification + TTS into a single function so it can drive a Gradio Interface.
def image_to_speech(image, sid: str, speed: float):
    # The top-1 classification label serves as the text "interpretation" of the image.
    text_output = image_to_text_model(image)[0]["label"]

    # Default to the first available language and its first TTS model.
    language_choices = list(language_to_models.keys())
    language = language_choices[0]
    repo_id = language_to_models[language][0]

    filename, info = text_to_speech(language, repo_id, text_output, sid, speed)
    return text_output, filename, info


demo = gr.Interface(
    fn=image_to_speech,
    title="Image to Text Interpretation",
    inputs=[
        gr.Image(type="pil"),
        gr.Textbox(
            label="Speaker ID",
            info="Speaker ID",
            lines=1,
            max_lines=1,
            value="0",
            placeholder="Speaker ID. Valid only for multi-speaker models",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=10,
            value=1,
            step=0.1,
            label="Speed (larger -> faster; smaller -> slower)",
        ),
    ],
    outputs=[
        gr.Textbox(label="Interpretation", lines=1, max_lines=1),
        gr.Audio(label="Output"),
        gr.HTML(label="Info"),
    ],
    description="Image-to-audio demo: classify an image and speak the predicted label.",
    article="",
)

demo.launch()