SandraCLV committed on
Commit 7d7172f
1 parent: 1a4e740

Update app.py

Files changed (1): app.py +4 -51
app.py CHANGED
@@ -3,27 +3,15 @@ from transformers import AutoProcessor, BlipForConditionalGeneration, AutoTokeni
 import librosa
 import numpy as np
 import torch
-
-# from transformers import AutoProcessor, AutoTokenizer, AutoImageProcessor, AutoModelForCausalLM, BlipForConditionalGeneration, Blip2ForConditionalGeneration, VisionEncoderDecoderModel
+import audio_model
 import open_clip
 
 #CONSTANTS
-speaker_embeddings = {
-    "BDL": "spkemb/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
-    "CLB": "spkemb/cmu_us_clb_arctic-wav-arctic_a0144.npy",
-    "RMS": "spkemb/cmu_us_rms_arctic-wav-arctic_b0353.npy",
-    "SLT": "spkemb/cmu_us_slt_arctic-wav-arctic_a0508.npy",
-}
+
 # Load the image-to-text captioning model
 blip_processor_large = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
 blip_model_large = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
 
-# Load the text-to-speech model
-checkpoint = "microsoft/speecht5_tts"
-processor = SpeechT5Processor.from_pretrained(checkpoint)
-model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
-vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
-
 device = "cuda" if torch.cuda.is_available() else "cpu"
 blip_model_large.to(device)
 
@@ -52,48 +40,13 @@ def generate_caption_coca(model, transform, image):
     return open_clip.decode(generated[0].detach()).split("<end_of_text>")[0].replace("<start_of_text>", "")
 
 
-def generate_captions(image):
+def generate_captions_speech(image):
 
     caption_blip_large = generate_caption(blip_processor_large, blip_model_large, image)
     print('generate_captions>>>' + caption_blip_large)
     return caption_blip_large, text_to_speech(caption_blip_large, "Surprise Me!")
 
 #####END IMAGE MODEL TO TEXT
-
-### TEXT TO AUDIO SPEECH MODEL 2
-# Define the function that converts text to speech
-def text_to_speech(text, speaker):
-    # Generate the audio using the model
-    if len(text.strip()) == 0:
-        return (16000, np.zeros(0).astype(np.int16))
-    inputs = processor(text=text, return_tensors="pt")
-
-    # limit input length
-    input_ids = inputs["input_ids"]
-    input_ids = input_ids[..., :model.config.max_text_positions]
-
-    if speaker == "Surprise Me!":
-        # load one of the provided speaker embeddings at random
-        idx = np.random.randint(len(speaker_embeddings))
-        key = list(speaker_embeddings.keys())[idx]
-        speaker_embedding = np.load(speaker_embeddings[key])
-
-        # randomly shuffle the elements
-        np.random.shuffle(speaker_embedding)
-
-        # randomly flip half the values
-        x = (np.random.rand(512) >= 0.5) * 1.0
-        x[x == 0] = -1.0
-        speaker_embedding *= x
-
-        #speaker_embedding = np.random.rand(512).astype(np.float32) * 0.3 - 0.15
-    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)
-
-    speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)
-
-    speech = (speech.numpy() * 32767).astype(np.int16)
-    return (16000, speech)
-### END TEXT TO AUDIO SPEECH MODEL 2
 
 # Define the user interface (Gradio inputs and outputs)
 inputsImg = [
@@ -106,7 +59,7 @@ title = "Clasificación de imagen a texto y conversión de texto a voz"
 description = "Carga una imagen y obtén una descripción de texto de lo que contiene la imagen, así como un archivo de audio con la transcripción de la imagen en audio descrito."
 examples = []
 
-interface = gr.Interface(fn=generate_captions,
+interface = gr.Interface(fn=generate_captions_speech,
     inputs=inputsImg,
     outputs=outputs,
     examples=examples,
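
Note on this change: the commit deletes the inline SpeechT5 text-to-speech pipeline (model, vocoder, speaker-embedding table, and the text_to_speech function) from app.py and adds `import audio_model`, so that logic presumably now lives in a separate audio_model.py that is not part of this diff. Below is a minimal sketch of what such a module would contain, reconstructed from the deleted lines; the file name comes from the new import, but the module's actual contents, and the fallback branch for named speakers (the deleted code only handled "Surprise Me!"), are assumptions.

# audio_model.py -- hypothetical reconstruction from the code deleted above
import numpy as np
import torch
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(checkpoint)
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# CMU ARCTIC x-vector speaker embeddings (paths from the deleted dict)
speaker_embeddings = {
    "BDL": "spkemb/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
    "CLB": "spkemb/cmu_us_clb_arctic-wav-arctic_a0144.npy",
    "RMS": "spkemb/cmu_us_rms_arctic-wav-arctic_b0353.npy",
    "SLT": "spkemb/cmu_us_slt_arctic-wav-arctic_a0508.npy",
}

def text_to_speech(text, speaker):
    # Empty input: return a silent clip in the (rate, samples) tuple Gradio expects
    if len(text.strip()) == 0:
        return (16000, np.zeros(0).astype(np.int16))
    inputs = processor(text=text, return_tensors="pt")
    # SpeechT5 accepts a bounded number of text positions; truncate the excess
    input_ids = inputs["input_ids"][..., :model.config.max_text_positions]

    if speaker == "Surprise Me!":
        # Pick one of the bundled embeddings at random, then perturb it:
        # shuffle its 512 values and flip the sign of roughly half of them
        key = np.random.choice(list(speaker_embeddings))
        speaker_embedding = np.load(speaker_embeddings[key])
        np.random.shuffle(speaker_embedding)
        signs = np.where(np.random.rand(512) >= 0.5, 1.0, -1.0)
        speaker_embedding *= signs  # in-place multiply keeps float32
    else:
        # Assumed fallback: the deleted code had no branch for named speakers
        speaker_embedding = np.load(speaker_embeddings[speaker])
    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)

    speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)
    # Float waveform in [-1, 1] -> 16-bit PCM at SpeechT5's 16 kHz output rate
    return (16000, (speech.numpy() * 32767).astype(np.int16))

One wrinkle in the committed app.py: generate_captions_speech still calls text_to_speech(...) unqualified, which `import audio_model` alone does not provide; the call would need to become audio_model.text_to_speech(...), or the import would need to be `from audio_model import text_to_speech`.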