Spaces:

SandraCLV
/

injectModel1intoModel2

Sleeping

File size: 1,290 Bytes

5c5a283
9e26359
5c5a283
fac92f6
d6da658
 
 
 
 
 
 
 
 
 
 
 
9e26359
 
fac92f6
9e26359
 
fac92f6
9e26359
 
 
d6da658
fac92f6
9e26359
 
fac92f6
9e26359
fac92f6
d6da658
9e26359
 
 
 
 
 
 
 
 
5c5a283
9e26359
d6da658

import gradio as gr
from transformers import pipeline
import torch

# Chain two models: an image-classification model that turns an image into
# text (its predicted label) and a text-to-speech model. The text produced by
# the first model is fed as the input of the second one to generate audio.

def transform(example_batch, extractor=None):
    """Turn a batch of images into model inputs and attach the labels.

    Args:
        example_batch: Mapping with an ``'image'`` entry (an iterable of
            objects exposing ``convert("RGB")``, e.g. PIL images) and a
            ``'labels'`` entry.
        extractor: Callable invoked as ``extractor(images, return_tensors='pt')``
            that returns a mutable mapping. When omitted, the module-level
            ``feature_extractor`` is used.

    Returns:
        The mapping produced by the extractor, with ``'labels'`` set to
        ``example_batch['labels']``.

    Raises:
        NameError: If ``extractor`` is omitted and no module-level
            ``feature_extractor`` exists.
    """
    if extractor is None:
        # Fall back to the global the function originally relied on; this
        # file does not define it, so callers should normally pass one in.
        extractor = feature_extractor

    # Take a list of PIL images and turn them to pixel values
    rgb_images = [image.convert("RGB") for image in example_batch['image']]
    inputs = extractor(rgb_images, return_tensors='pt')

    # Don't forget to include the labels!
    inputs['labels'] = example_batch['labels']
    return inputs
    
# Load the model that turns an image into text. NOTE: despite the variable
# name this is an image-classification pipeline, so the "text" is a class label.
image_to_text_model = pipeline(task="image-classification")

# Load the model that generates audio from text
text_to_audio_model = pipeline(task="text-to-speech")

# Callback for the Gradio interface
def image_to_audio(input_image):
    """Classify an image and synthesize speech that reads out its label.

    Args:
        input_image: Image in a format accepted by ``image_to_text_model``,
            or ``None`` when no image has been supplied.

    Returns:
        ``None`` when there is no image or no prediction; otherwise a
        ``(sampling_rate, waveform)`` tuple, which is the in-memory format a
        Gradio audio output accepts.
    """
    # Nothing to classify (e.g. the image input was cleared).
    if input_image is None:
        return None

    # Convert the image to text: keep the label of the first prediction.
    # The label is already a plain string, so it is passed on as-is.
    predictions = image_to_text_model(input_image)
    if not predictions:
        return None
    text_output = predictions[0]['label']

    # Generate audio from the text. For a single string the pipeline returns
    # one dict; tolerate a one-element list as well.
    speech = text_to_audio_model(text_output)
    if isinstance(speech, list):
        speech = speech[0]

    # NOTE(review): some TTS models appear to return the waveform with a
    # leading axis of length 1; squeeze() drops it and is a no-op otherwise.
    return speech['sampling_rate'], speech['audio'].squeeze()

    
# Gradio interface
iface = gr.Interface(
    fn=image_to_audio,
    # Hand the upload to the callback as a PIL image: the transformers image
    # pipeline accepts PIL images / paths / URLs, not raw NumPy arrays.
    inputs=gr.Image(type="pil"),
    outputs="audio",
    # Re-run the callback whenever the input changes, without a submit button.
    live=True,
    # NOTE: the former `interpretation` and `capture_session` arguments were
    # dropped; they are deprecated/removed options of gr.Interface and do not
    # apply to an audio output.
)

# Launch the interface
iface.launch()