import gradio as gr
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
import torch
import librosa
import datasets
from transformers.pipelines.pt_utils import KeyDataset
from tqdm.auto import tqdm
import logging
import time
import uuid
import soundfile as sf
from PIL import Image
# model.py apache license 2.0 Copyright 2022-2023 Xiaomi Corp. (authors: Fangjun Kuang)
from model import get_pretrained_model, language_to_models
# Demo: take an input image, turn it into a text interpretation (classification label),
# then synthesize that text as speech that can be played back.
# Text-to-speech code adapted from https://huggingface.co/spaces/k2-fsa/text-to-speech/blob/main/app.py
image_to_text_model = pipeline("image-classification", model="microsoft/beit-base-patch16-224-pt22k-ft22k")
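# Wrap a status/info message in a styled HTML block for display in the UI.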
def build_html_output(s: str, style: str = "result_item_success"):
    return f"""
    <div class='result'>
      <div class='result_item {style}'>
        {s}
      </div>
    </div>
    """
def text_to_speech(language: str, repo_id: str, text: str, sid: str, speed: float):
    logging.info(f"Input text: {text}. sid: {sid}, speed: {speed}")
    sid = int(sid)
    tts = get_pretrained_model(repo_id, speed)

    start = time.time()
    audio = tts.generate(text, sid=sid)
    end = time.time()

    if len(audio.samples) == 0:
        raise ValueError(
            "Error in generating audios. Please read previous error messages."
        )

    duration = len(audio.samples) / audio.sample_rate

    elapsed_seconds = end - start
    rtf = elapsed_seconds / duration

    info = f"""
    Wave duration  : {duration:.3f} s <br/>
    Processing time: {elapsed_seconds:.3f} s <br/>
    RTF: {elapsed_seconds:.3f}/{duration:.3f} = {rtf:.3f} <br/>
    """

    logging.info(info)
    logging.info(f"\nrepo_id: {repo_id}\ntext: {text}\nsid: {sid}\nspeed: {speed}")

    filename = str(uuid.uuid4())
    filename = f"{filename}.wav"
    sf.write(
        filename,
        audio.samples,
        samplerate=audio.sample_rate,
        subtype="PCM_16",
    )

    return filename, build_html_output(info)
# Inference callback: classify the uploaded image and synthesize the top label
# as speech, using the first available language/model from model.py.
language_choices = list(language_to_models.keys())

def image_to_speech(image, sid, speed):
    predictions = image_to_text_model(image)
    text_output = predictions[0]["label"]
    print(text_output)
    repo_id = language_to_models[language_choices[0]][0]
    filename, info = text_to_speech(language_choices[0], repo_id, text_output, sid, speed)
    return text_output, filename, info

with gr.Blocks(title="Image to Text Interpretation") as demo:
    gr.Markdown("# Image to Text Interpretation\nimage to audio demo")
    inputsImg = gr.Image(type="pil", label="Input image")
    input_sid = gr.Textbox(
        label="Speaker ID",
        info="Speaker ID",
        lines=1,
        max_lines=1,
        value="0",
        placeholder="Speaker ID. Valid only for multi-speaker models",
    )
    input_speed = gr.Slider(
        minimum=0.1,
        maximum=10,
        value=1,
        step=0.1,
        label="Speed (larger->faster; smaller->slower)",
    )
    output_txt = gr.Textbox(
        label="Interpretation",
        lines=1,
        max_lines=1,
        placeholder="Interpretation",
    )
    output_audio = gr.Audio(label="Output")
    output_info = gr.HTML(label="Info")
    run_button = gr.Button("Generate")
    run_button.click(
        fn=image_to_speech,
        inputs=[inputsImg, input_sid, input_speed],
        outputs=[output_txt, output_audio, output_info],
    )

demo.launch()