SandraCLV's picture
Update app.py
cce1817
raw
history blame
2.69 kB
import gradio as gr
from transformers import AutoProcessor, BlipForConditionalGeneration, AutoTokenizer,SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import librosa
import numpy as np
import torch
import image_text_model
import audio_model
import open_clip
# CONSTANTS
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the BLIP-large image-captioning processor and model (image -> text),
# placing the model on the selected device.
blip_processor_large = AutoProcessor.from_pretrained(
    "Salesforce/blip-image-captioning-large"
)
blip_model_large = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-large"
).to(device)
##### IMAGE MODEL TO TEXT, MODEL 1
def generate_caption(processor, model, image, tokenizer=None, use_float_16=False):
    """Generate a text caption for ``image``.

    The image is preprocessed with ``processor`` into PyTorch tensors that are
    moved to the module-level ``device``. ``model.generate`` is then called on
    the resulting pixel values with ``max_length=50`` and the generated ids are
    decoded with special tokens skipped.

    Args:
        processor: Callable preprocessor; also used for decoding when no
            ``tokenizer`` is supplied.
        model: Object exposing ``generate(pixel_values=..., max_length=...)``.
        image: Image passed to ``processor`` as ``images=``.
        tokenizer: Optional decoder used instead of ``processor`` for
            ``batch_decode``.
        use_float_16: When true, the preprocessed inputs are additionally
            converted with ``.to(torch.float16)``.

    Returns:
        The first decoded caption of the batch.
    """
    encoded = processor(images=image, return_tensors="pt").to(device)
    if use_float_16:
        encoded = encoded.to(torch.float16)

    token_ids = model.generate(pixel_values=encoded.pixel_values, max_length=50)

    # An explicit tokenizer takes precedence over the processor for decoding.
    decoder = processor if tokenizer is None else tokenizer
    captions = decoder.batch_decode(token_ids, skip_special_tokens=True)
    return captions[0]
def generate_caption_coca(model, transform, image):
    """Generate a caption for ``image`` with an open_clip generative model.

    ``transform(image)`` is given a leading batch axis via ``unsqueeze(0)`` and
    moved to the module-level ``device``. Generation runs with gradients
    disabled and under CUDA autocast, using ``seq_len=20``.

    Args:
        model: Object exposing ``generate(batch, seq_len=...)``.
        transform: Callable turning ``image`` into a tensor.
        image: Input image handed to ``transform``.

    Returns:
        The decoded text of the first generated sequence, truncated at the
        first ``<end_of_text>`` marker and with every ``<start_of_text>``
        marker removed.
    """
    batch = transform(image).unsqueeze(0).to(device)
    with torch.no_grad():
        with torch.cuda.amp.autocast():
            token_ids = model.generate(batch, seq_len=20)

    decoded = open_clip.decode(token_ids[0].detach())
    # Keep only what precedes the end marker, then drop the start marker.
    before_end, _, _ = decoded.partition("<end_of_text>")
    return before_end.replace("<start_of_text>", "")
def generate_captions_speech(image):
    """Caption ``image`` with BLIP-large and convert the caption to speech.

    Args:
        image: Image forwarded to ``generate_caption``.

    Returns:
        A ``(caption, speech)`` tuple, where ``speech`` is whatever
        ``text_to_speech(caption, "Surprise Me!")`` returns.
    """
    caption = generate_caption(blip_processor_large, blip_model_large, image)
    print('generate_captions>>>' + caption)
    # NOTE(review): text_to_speech is neither defined nor imported in the
    # visible part of this file -- presumably it is provided by audio_model;
    # confirm, otherwise this call raises NameError.
    speech = text_to_speech(caption, "Surprise Me!")
    return caption, speech
#####END IMAGE MODEL TO TEXT
# Gradio user interface: a single image input and two outputs.
inputsImg = [
    gr.Image(type="pil", label="Imagen"),
]
# Outputs: the generated caption text and the audio produced from it.
outputs = [
    gr.Textbox(label="Caption generated by BLIP-large"),
    gr.Audio(type="numpy", label='Transcripcion'),
]
# User-facing strings are intentionally in Spanish and must keep their
# accented characters (ó, é, í) so the page renders readable text.
title = "Clasificación de imagen a texto y conversión de texto a voz"
description = "Carga una imagen y obtén una descripción de texto de lo que contiene la imagen, así como un archivo de audio de la trasncripcion de la imagen en audio descrito."
examples = []
# Each submitted image is passed to generate_captions_speech, whose two return
# values fill the Textbox and Audio outputs in order.
interface = gr.Interface(
    fn=generate_captions_speech,
    inputs=inputsImg,
    outputs=outputs,
    examples=examples,
    title=title,
    description=description,
)
interface.launch()