ALVHB95 committed on
Commit
865160b
·
1 Parent(s): b6be7ba

update langchain

Browse files
Files changed (1) hide show
  1. app.py +225 -143
app.py CHANGED
@@ -1,197 +1,258 @@
1
  # app.py
2
 
 
 
 
 
 
3
  import gradio as gr
4
-
5
  import torch
6
  import theme
7
  theme = theme.Theme()
 
8
  from huggingface_hub import from_pretrained_keras
9
  from tensorflow.keras.applications import EfficientNetB0
10
-
11
  import tensorflow as tf
12
  from tensorflow import keras
 
13
  from PIL import Image
14
- from pydantic.v1 import BaseModel, Field
15
  import shutil
16
- import tenacity
17
 
18
- #langchain
19
- from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
 
 
 
20
  from langchain.embeddings import HuggingFaceEmbeddings
21
  from langchain.prompts import PromptTemplate
22
- from langchain.chains import RetrievalQA
23
- from langchain.prompts import ChatPromptTemplate
24
  from langchain.schema import StrOutputParser
25
  from langchain.schema.runnable import Runnable
26
  from langchain.schema.runnable.config import RunnableConfig
27
- from langchain.chains import (
28
- LLMChain, ConversationalRetrievalChain)
29
- from langchain.vectorstores import Chroma
30
- from langchain.memory import ConversationBufferMemory
31
- from langchain.chains import LLMChain
32
  from langchain.prompts.chat import ChatPromptTemplate, SystemMessagePromptTemplate
33
- from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate, MessagesPlaceholder
34
  from langchain.output_parsers import PydanticOutputParser
35
  from langchain_community.llms import HuggingFaceHub
36
  from langchain_community.document_loaders import WebBaseLoader
 
 
37
 
38
- from fake_useragent import UserAgent
39
-
40
- custom_title = "<span style='color: rgb(243, 239, 224);'>Green Greta</span>"
41
-
42
 
43
- # Cell 1: Image Classification Model
44
 
 
 
 
 
 
 
45
  model1 = from_pretrained_keras("rocioadlc/efficientnetB0_trash")
46
 
47
- # Define class labels
48
  class_labels = ['cardboard', 'glass', 'metal', 'paper', 'plastic', 'trash']
49
 
50
- # Function to predict image label and score
51
- def predict_image(input):
52
- # Resize the image to the size expected by the model and convert to numpy array
53
- image_array = tf.keras.preprocessing.image.img_to_array(input.resize((244, 224))) # Cambiar el orden de las dimensiones
54
- # Normalize the image
 
 
 
 
 
 
55
  image_array = tf.keras.applications.efficientnet.preprocess_input(image_array)
56
- # Expand the dimensions to create a batch
57
  image_array = tf.expand_dims(image_array, 0)
58
- # Predict using the model
59
  predictions = model1.predict(image_array)
 
 
60
  category_scores = {}
61
  for i, class_label in enumerate(class_labels):
62
  category_scores[class_label] = predictions[0][i].item()
63
 
64
  return category_scores
65
 
66
-
67
  image_gradio_app = gr.Interface(
68
  fn=predict_image,
69
  inputs=gr.Image(label="Image", sources=['upload', 'webcam'], type="pil"),
70
  outputs=[gr.Label(label="Result")],
71
- title=custom_title,
72
  theme=theme
73
  )
74
 
75
- # Cell 2: ChatBot Model
76
 
77
- # Generate a random user agent
 
 
 
 
 
78
  user_agent = UserAgent().random
79
  header_template = {"User-Agent": user_agent}
80
 
81
- # Create the loader with the headers
82
- loader = WebBaseLoader(
83
- web_paths=[
84
- "https://www.epa.gov/recycle/frequent-questions-recycling",
85
- "https://www.whitehorsedc.gov.uk/vale-of-white-horse-district-council/recycling-rubbish-and-waste/lets-get-real-about-recycling/",
86
- "https://www.teimas.com/blog/13-preguntas-y-respuestas-sobre-la-ley-de-residuos-07-2022",
87
- "https://www.molok.com/es/blog/gestion-de-residuos-solidos-urbanos-rsu-10-dudas-comunes",
88
- "https://espanol.epa.gov/espanol/el-reciclaje#valelapena",
89
- "https://espanol.epa.gov/espanol/preguntas-frecuentes-sobre-reciclado-de-plastico-y-elaboracion-de-abono-vegetal",
90
- "https://espanol.epa.gov/espanol/consejo-del-dia-como-reciclo-mis",
91
- "https://espanol.epa.gov/espanol/recursos-para-reciclar-dispositivos-electronicos",
92
- "https://www.epa.gov/recycle/electronics-donation-and-recycling",
93
- "https://reducereutilizarecicla.org/que-es-el-reciclaje/",
94
- "https://reducereutilizarecicla.org/contenedores-de-reciclaje/",
95
- "https://reducereutilizarecicla.org/contenedores-de-reciclaje/contenedor-amarillo/",
96
- "https://reducereutilizarecicla.org/contenedores-de-reciclaje/contenedor-azul/",
97
- "https://reducereutilizarecicla.org/contenedores-de-reciclaje/contenedor-verde/",
98
- "https://reducereutilizarecicla.org/contenedores-de-reciclaje/contenedor-marron-organico/",
99
- "https://reducereutilizarecicla.org/contenedores-de-reciclaje/contenedor-gris-restos/",
100
- "https://reducereutilizarecicla.org/contenedores-de-reciclaje/punto-limpio/",
101
- "https://reducereutilizarecicla.org/donde-tirar-auriculares/",
102
- "https://reducereutilizarecicla.org/donde-tirar-sartenes/",
103
- "https://reducereutilizarecicla.org/donde-tirar-aceite-usado/",
104
- "https://reducereutilizarecicla.org/como-se-reciclan-los-envases-tipo-brik/",
105
- "https://reducereutilizarecicla.org/los-envases-del-verano/",
106
- "https://reducereutilizarecicla.org/donde-tirar-radiografias/",
107
- "https://reducereutilizarecicla.org/envases-ecologicos/",
108
- "https://reducereutilizarecicla.org/donde-tirar-los-restos-de-pintura/",
109
- "https://reducereutilizarecicla.org/valorizacion-de-residuos/",
110
- "https://reducereutilizarecicla.org/como-reciclar-pilas/",
111
- "https://reducereutilizarecicla.org/como-reciclar-capsulas-de-cafe/",
112
- "https://reducereutilizarecicla.org/reciclando-cd/",
113
- "https://reducereutilizarecicla.org/donde-tirar-neumaticos/",
114
- "https://reducereutilizarecicla.org/como-reciclar-una-canasta-de-mimbre/",
115
- "https://reducereutilizarecicla.org/como-funciona-el-contenedor-amarillo/",
116
- "https://reducereutilizarecicla.org/donde-se-tiran-los-vapers/",
117
- "https://reducereutilizarecicla.org/cuanto-tarda-una-bolsa-biodegradable-en-degradarse/",
118
- "https://reducereutilizarecicla.org/donde-se-reciclan-los-juguetes/",
119
- "https://reducereutilizarecicla.org/objetos-que-se-pueden-reutilizar/",
120
- "https://reducereutilizarecicla.org/la-parafina-se-puede-reutilizar/",
121
- "https://reducereutilizarecicla.org/planta-de-reciclaje-de-papel/",
122
- "https://reducereutilizarecicla.org/como-saber-si-un-envase-es-reciclable/",
123
- "https://reducereutilizarecicla.org/reutilizar-vasos-de-vela/",
124
- "https://reducereutilizarecicla.org/bolsas-frio-calor/",
125
- "https://reducereutilizarecicla.org/reciclar-y-reutilizar-materiales-de-construccion/",
126
- "https://reducereutilizarecicla.org/que-es-exactamente-el-pet/",
127
- "https://reducereutilizarecicla.org/tipos-de-reciclaje/",
128
- "https://reducereutilizarecicla.org/que-hacer-con-palets-reciclados/",
129
- "https://reducereutilizarecicla.org/vertederos-controlados/",
130
- "https://reducereutilizarecicla.org/donde-tirar-escombros/",
131
- "https://reducereutilizarecicla.org/como-reciclar-los-residuos-de-ps-poliestireno/",
132
- "https://reducereutilizarecicla.org/tirar-la-basura-sin-bolsas/",
133
- "https://reducereutilizarecicla.org/tirar-el-palo-de-la-fregona/",
134
- "https://reducereutilizarecicla.org/la-mejor-manera-de-reciclar-una-pala-de-padel/",
135
- "https://reducereutilizarecicla.org/sabes-donde-tirar-las-llantas-viejas-de-un-coche/",
136
- "https://reducereutilizarecicla.org/sabes-donde-tirar-el-arbol-de-navidad/",
137
- "https://reducereutilizarecicla.org/clavos-tornillos-herramientas-donde-tirar-hierro/",
138
- "https://reducereutilizarecicla.org/donde-tirar-un-secador-de-pelo-contenedor-o-punto-limpio/",
139
- "https://reducereutilizarecicla.org/donde-tirar-electrodomesticos/",
140
- "https://reducereutilizarecicla.org/donde-puedo-tirar-ramas-de-arboles/",
141
- "https://reducereutilizarecicla.org/donde-tirar-escombros/",
142
- "https://reducereutilizarecicla.org/donde-se-tira-el-muerdago-quemado/",
143
- "https://reducereutilizarecicla.org/sandalias-caucho-reciclado-neumaticos/",
144
- "https://reducereutilizarecicla.org/ideas-para-reciclar-aspas-de-ventilador-de-techo/",
145
- "https://reducereutilizarecicla.org/reciclar-sacos-dormir/",
146
- "https://reducereutilizarecicla.org/reciclar-sillas-playa/",
147
- "https://reducereutilizarecicla.org/donde-tirar-antipolillas/",
148
- "https://reducereutilizarecicla.org/que-hacer-con-los-juguetes-viejos/",
149
- "https://reducereutilizarecicla.org/como-utilizar-las-mascarillas-y-el-gel-hidroalcoholico-en-la-playa/",
150
- "https://reducereutilizarecicla.org/ideas-para-reciclar-un-ventilador-de-pie/",
151
- "https://reducereutilizarecicla.org/donde-tirar-gasoil/",
152
- "https://reducereutilizarecicla.org/donde-puedo-tirar-basura-electronica/",
153
- "https://reducereutilizarecicla.org/donde-tirar-agujas/",
154
- "https://reducereutilizarecicla.org/donde-tirar-residuos-peligrosos/",
155
- "https://reducereutilizarecicla.org/donde-tirar-los-cables/",
156
- "https://reducereutilizarecicla.org/donde-tirar-bicicletas/",
157
- "https://reducereutilizarecicla.org/donde-tirar-maletas/",
158
- "https://reducereutilizarecicla.org/como-reciclar-una-pantalla/",
159
- "https://reducereutilizarecicla.org/donde-tirar-ropa-usada/"
160
- ],
161
- header_template=header_template
 
 
 
 
162
  )
163
-
164
- data=loader.load()
165
-
166
- # split documents
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  text_splitter = RecursiveCharacterTextSplitter(
168
  chunk_size=1024,
169
  chunk_overlap=150,
170
  length_function=len
171
  )
172
- docs = text_splitter.split_documents(data)
173
- # define embedding
 
174
  embeddings = HuggingFaceEmbeddings(model_name='thenlper/gte-small')
175
- # create vector database from data
 
176
  persist_directory = 'docs/chroma/'
 
177
 
178
- # Remove old database files if any
179
- shutil.rmtree(persist_directory, ignore_errors=True)
180
  vectordb = Chroma.from_documents(
181
  documents=docs,
182
  embedding=embeddings,
183
  persist_directory=persist_directory
184
  )
185
- # define retriever
186
- retriever = vectordb.as_retriever(search_kwargs={"k": 2}, search_type="mmr")
187
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
  class FinalAnswer(BaseModel):
189
  question: str = Field()
190
  answer: str = Field()
191
 
192
- # Assuming you have a parser for the FinalAnswer class
193
  parser = PydanticOutputParser(pydantic_object=FinalAnswer)
194
 
 
195
  template = """
196
  Your name is Greta and you are a recycling chatbot with the objective to anwer questions from user in English or Spanish /
197
  Has sido diseñado y creado por el Grupo 1 del Máster en Data Science & Big Data de la promoción 2023/2024 de la Universidad Complutense de Madrid. Este grupo está fromado por Rocío, María Guillermo, Alejandra, Paloma y Álvaro /
@@ -205,14 +266,16 @@ User: {question}
205
  {format_instructions}
206
  """
207
 
208
- # Create the chat prompt templates
209
  sys_prompt = SystemMessagePromptTemplate.from_template(template)
210
  qa_prompt = ChatPromptTemplate(
211
  messages=[
212
  sys_prompt,
213
- HumanMessagePromptTemplate.from_template("{question}")],
 
214
  partial_variables={"format_instructions": parser.get_format_instructions()}
215
  )
 
 
216
  llm = HuggingFaceHub(
217
  repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
218
  task="text-generation",
@@ -224,42 +287,55 @@ llm = HuggingFaceHub(
224
  },
225
  )
226
 
 
227
  qa_chain = ConversationalRetrievalChain.from_llm(
228
- llm = llm,
229
- memory = ConversationBufferMemory(llm=llm, memory_key="chat_history", input_key='question', output_key='output'),
230
- retriever = retriever,
231
- verbose = True,
 
 
 
 
 
232
  combine_docs_chain_kwargs={'prompt': qa_prompt},
233
- get_chat_history = lambda h : h,
234
- rephrase_question = False,
235
- output_key = 'output',
236
  )
237
 
238
- def chat_interface(question,history):
 
 
 
 
 
239
  result = qa_chain.invoke({'question': question})
240
  output_string = result['output']
241
 
242
- # Find the index of the last occurrence of "answer": in the string
243
  answer_index = output_string.rfind('"answer":')
244
-
245
- # Extract the substring starting from the "answer": index
246
  answer_part = output_string[answer_index + len('"answer":'):].strip()
247
 
248
  # Find the next occurrence of a double quote to get the start of the answer value
249
  quote_index = answer_part.find('"')
250
-
251
- # Extract the answer value between double quotes
252
  answer_value = answer_part[quote_index + 1:answer_part.find('"', quote_index + 1)]
253
 
254
  return answer_value
255
 
256
 
 
257
  chatbot_gradio_app = gr.ChatInterface(
258
  fn=chat_interface,
259
- title=custom_title
260
  )
261
 
262
- # Banner tab
 
 
 
 
 
263
  banner_tab_content = """
264
  <div style="background-color: #d3e3c3; text-align: center; padding: 20px; display: flex; flex-direction: column; align-items: center;">
265
  <img src="https://huggingface.co/spaces/ALVHB95/TFM_DataScience_APP/resolve/main/front_4.jpg" alt="Banner Image" style="width: 50%; max-width: 500px; margin: 0 auto;">
@@ -283,12 +359,18 @@ banner_tab_content = """
283
  """
284
  banner_tab = gr.Markdown(banner_tab_content)
285
 
286
- # Combine interfaces into a single app
 
 
 
 
 
287
  app = gr.TabbedInterface(
288
  [banner_tab, image_gradio_app, chatbot_gradio_app],
289
  tab_names=["Welcome to Green Greta", "Green Greta Image Classification", "Green Greta Chat"],
290
  theme=theme
291
  )
292
 
 
293
  app.queue()
294
- app.launch()
 
1
  # app.py
2
 
3
+ """
4
+ =========================================================
5
+ 1) IMPORTS & DEPENDENCIES
6
+ =========================================================
7
+ """
8
  import gradio as gr
 
9
  import torch
10
  import theme
11
  theme = theme.Theme()
12
+
13
  from huggingface_hub import from_pretrained_keras
14
  from tensorflow.keras.applications import EfficientNetB0
 
15
  import tensorflow as tf
16
  from tensorflow import keras
17
+
18
  from PIL import Image
 
19
  import shutil
 
20
 
21
+ import tenacity # for retrying failed requests
22
+ from fake_useragent import UserAgent
23
+
24
+ # LangChain
25
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
26
  from langchain.embeddings import HuggingFaceEmbeddings
27
  from langchain.prompts import PromptTemplate
 
 
28
  from langchain.schema import StrOutputParser
29
  from langchain.schema.runnable import Runnable
30
  from langchain.schema.runnable.config import RunnableConfig
31
+ from langchain.chains import RetrievalQA, ConversationalRetrievalChain, LLMChain
 
 
 
 
32
  from langchain.prompts.chat import ChatPromptTemplate, SystemMessagePromptTemplate
33
+ from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate, MessagesPlaceholder
34
  from langchain.output_parsers import PydanticOutputParser
35
  from langchain_community.llms import HuggingFaceHub
36
  from langchain_community.document_loaders import WebBaseLoader
37
+ from langchain.vectorstores import Chroma
38
+ from langchain.memory import ConversationBufferMemory
39
 
40
+ from pydantic.v1 import BaseModel, Field
 
 
 
41
 
 
42
 
43
+ """
44
+ =========================================================
45
+ 2) IMAGE CLASSIFICATION MODEL SETUP
46
+ =========================================================
47
+ """
48
+ # Load a Keras model from HuggingFace Hub
49
  model1 = from_pretrained_keras("rocioadlc/efficientnetB0_trash")
50
 
51
+ # Define class labels for the trash classification
52
  class_labels = ['cardboard', 'glass', 'metal', 'paper', 'plastic', 'trash']
53
 
54
def predict_image(input_image):
    """
    Classify a trash photo with the Keras model held in ``model1``.

    Args:
        input_image: PIL image to classify. ``None`` is tolerated (for
            example when the form is submitted without a picture).

    Returns:
        dict: ``{class_label: score}`` with one entry per item of
        ``class_labels``, scores converted to plain Python floats.
        An empty dict is returned when no image was supplied.
    """
    # Guard: without an image there is nothing to resize or predict on.
    if input_image is None:
        return {}

    # NOTE(review): PIL's resize takes (width, height), so this yields a
    # 244x224 image. EfficientNetB0's stock input is 224x224, so 244 looks
    # like a typo -- confirm against the trained model before changing it.
    image_array = tf.keras.preprocessing.image.img_to_array(
        input_image.resize((244, 224))
    )
    # Apply the EfficientNet-specific input preprocessing.
    image_array = tf.keras.applications.efficientnet.preprocess_input(image_array)
    # Add a leading batch axis: the model predicts on batches.
    image_array = tf.expand_dims(image_array, 0)
    predictions = model1.predict(image_array)

    # predictions[0] holds the scores of the single image in the batch;
    # .item() turns each numpy scalar into a plain Python number.
    return {
        class_label: predictions[0][i].item()
        for i, class_label in enumerate(class_labels)
    }
77
 
78
+ # Gradio interface for image classification
79
  image_gradio_app = gr.Interface(
80
  fn=predict_image,
81
  inputs=gr.Image(label="Image", sources=['upload', 'webcam'], type="pil"),
82
  outputs=[gr.Label(label="Result")],
83
+ title="<span style='color: rgb(243, 239, 224);'>Green Greta</span>",
84
  theme=theme
85
  )
86
 
 
87
 
88
+ """
89
+ =========================================================
90
+ 3) CHATBOT MODEL SETUP
91
+ =========================================================
92
+ """
93
+ # 3.1) Define user agent to avoid blocking, etc.
94
  user_agent = UserAgent().random
95
  header_template = {"User-Agent": user_agent}
96
 
97
+ # 3.2) List of URLs to load for retrieval
98
+ URLS = [
99
+ "https://www.epa.gov/recycle/frequent-questions-recycling",
100
+ "https://www.whitehorsedc.gov.uk/vale-of-white-horse-district-council/recycling-rubbish-and-waste/lets-get-real-about-recycling/",
101
+ "https://www.teimas.com/blog/13-preguntas-y-respuestas-sobre-la-ley-de-residuos-07-2022",
102
+ "https://www.molok.com/es/blog/gestion-de-residuos-solidos-urbanos-rsu-10-dudas-comunes",
103
+ "https://espanol.epa.gov/espanol/el-reciclaje#valelapena",
104
+ "https://espanol.epa.gov/espanol/preguntas-frecuentes-sobre-reciclado-de-plastico-y-elaboracion-de-abono-vegetal",
105
+ "https://espanol.epa.gov/espanol/consejo-del-dia-como-reciclo-mis",
106
+ "https://espanol.epa.gov/espanol/recursos-para-reciclar-dispositivos-electronicos",
107
+ "https://www.epa.gov/recycle/electronics-donation-and-recycling",
108
+ "https://reducereutilizarecicla.org/que-es-el-reciclaje/",
109
+ "https://reducereutilizarecicla.org/contenedores-de-reciclaje/",
110
+ "https://reducereutilizarecicla.org/contenedores-de-reciclaje/contenedor-amarillo/",
111
+ "https://reducereutilizarecicla.org/contenedores-de-reciclaje/contenedor-azul/",
112
+ "https://reducereutilizarecicla.org/contenedores-de-reciclaje/contenedor-verde/",
113
+ "https://reducereutilizarecicla.org/contenedores-de-reciclaje/contenedor-marron-organico/",
114
+ "https://reducereutilizarecicla.org/contenedores-de-reciclaje/contenedor-gris-restos/",
115
+ "https://reducereutilizarecicla.org/contenedores-de-reciclaje/punto-limpio/",
116
+ "https://reducereutilizarecicla.org/donde-tirar-auriculares/",
117
+ "https://reducereutilizarecicla.org/donde-tirar-sartenes/",
118
+ "https://reducereutilizarecicla.org/donde-tirar-aceite-usado/",
119
+ "https://reducereutilizarecicla.org/como-se-reciclan-los-envases-tipo-brik/",
120
+ "https://reducereutilizarecicla.org/los-envases-del-verano/",
121
+ "https://reducereutilizarecicla.org/donde-tirar-radiografias/",
122
+ "https://reducereutilizarecicla.org/envases-ecologicos/",
123
+ "https://reducereutilizarecicla.org/donde-tirar-los-restos-de-pintura/",
124
+ "https://reducereutilizarecicla.org/valorizacion-de-residuos/",
125
+ "https://reducereutilizarecicla.org/como-reciclar-pilas/",
126
+ "https://reducereutilizarecicla.org/como-reciclar-capsulas-de-cafe/",
127
+ "https://reducereutilizarecicla.org/reciclando-cd/",
128
+ "https://reducereutilizarecicla.org/donde-tirar-neumaticos/",
129
+ "https://reducereutilizarecicla.org/como-reciclar-una-canasta-de-mimbre/",
130
+ "https://reducereutilizarecicla.org/como-funciona-el-contenedor-amarillo/",
131
+ "https://reducereutilizarecicla.org/donde-se-tiran-los-vapers/",
132
+ "https://reducereutilizarecicla.org/cuanto-tarda-una-bolsa-biodegradable-en-degradarse/",
133
+ "https://reducereutilizarecicla.org/donde-se-reciclan-los-juguetes/",
134
+ "https://reducereutilizarecicla.org/objetos-que-se-pueden-reutilizar/",
135
+ "https://reducereutilizarecicla.org/la-parafina-se-puede-reutilizar/",
136
+ "https://reducereutilizarecicla.org/planta-de-reciclaje-de-papel/",
137
+ "https://reducereutilizarecicla.org/como-saber-si-un-envase-es-reciclable/",
138
+ "https://reducereutilizarecicla.org/reutilizar-vasos-de-vela/",
139
+ "https://reducereutilizarecicla.org/bolsas-frio-calor/",
140
+ "https://reducereutilizarecicla.org/reciclar-y-reutilizar-materiales-de-construccion/",
141
+ "https://reducereutilizarecicla.org/que-es-exactamente-el-pet/",
142
+ "https://reducereutilizarecicla.org/tipos-de-reciclaje/",
143
+ "https://reducereutilizarecicla.org/que-hacer-con-palets-reciclados/",
144
+ "https://reducereutilizarecicla.org/vertederos-controlados/",
145
+ "https://reducereutilizarecicla.org/donde-tirar-escombros/",
146
+ "https://reducereutilizarecicla.org/como-reciclar-los-residuos-de-ps-poliestireno/",
147
+ "https://reducereutilizarecicla.org/tirar-la-basura-sin-bolsas/",
148
+ "https://reducereutilizarecicla.org/tirar-el-palo-de-la-fregona/",
149
+ "https://reducereutilizarecicla.org/la-mejor-manera-de-reciclar-una-pala-de-padel/",
150
+ "https://reducereutilizarecicla.org/sabes-donde-tirar-las-llantas-viejas-de-un-coche/",
151
+ "https://reducereutilizarecicla.org/sabes-donde-tirar-el-arbol-de-navidad/",
152
+ "https://reducereutilizarecicla.org/clavos-tornillos-herramientas-donde-tirar-hierro/",
153
+ "https://reducereutilizarecicla.org/donde-tirar-un-secador-de-pelo-contenedor-o-punto-limpio/",
154
+ "https://reducereutilizarecicla.org/donde-tirar-electrodomesticos/",
155
+ "https://reducereutilizarecicla.org/donde-puedo-tirar-ramas-de-arboles/",
156
+ "https://reducereutilizarecicla.org/donde-tirar-escombros/",
157
+ "https://reducereutilizarecicla.org/donde-se-tira-el-muerdago-quemado/",
158
+ "https://reducereutilizarecicla.org/sandalias-caucho-reciclado-neumaticos/",
159
+ "https://reducereutilizarecicla.org/ideas-para-reciclar-aspas-de-ventilador-de-techo/",
160
+ "https://reducereutilizarecicla.org/reciclar-sacos-dormir/",
161
+ "https://reducereutilizarecicla.org/reciclar-sillas-playa/",
162
+ "https://reducereutilizarecicla.org/donde-tirar-antipolillas/",
163
+ "https://reducereutilizarecicla.org/que-hacer-con-los-juguetes-viejos/",
164
+ "https://reducereutilizarecicla.org/como-utilizar-las-mascarillas-y-el-gel-hidroalcoholico-en-la-playa/",
165
+ "https://reducereutilizarecicla.org/ideas-para-reciclar-un-ventilador-de-pie/",
166
+ "https://reducereutilizarecicla.org/donde-tirar-gasoil/",
167
+ "https://reducereutilizarecicla.org/donde-puedo-tirar-basura-electronica/",
168
+ "https://reducereutilizarecicla.org/donde-tirar-agujas/",
169
+ "https://reducereutilizarecicla.org/donde-tirar-residuos-peligrosos/",
170
+ "https://reducereutilizarecicla.org/donde-tirar-los-cables/",
171
+ "https://reducereutilizarecicla.org/donde-tirar-bicicletas/",
172
+ "https://reducereutilizarecicla.org/donde-tirar-maletas/",
173
+ "https://reducereutilizarecicla.org/como-reciclar-una-pantalla/",
174
+ "https://reducereutilizarecicla.org/donde-tirar-ropa-usada/"
175
+ ]
176
+
177
+
178
@tenacity.retry(
    stop=tenacity.stop_after_attempt(3),  # give up after the third attempt
    wait=tenacity.wait_fixed(3),          # pause 3 seconds between attempts
    reraise=True,                         # surface the last error to the caller
)
def load_url(url):
    """
    Fetch a single web page through ``WebBaseLoader`` and return its documents.

    The request is sent with the module-level ``header_template``. Any
    exception triggers a retry (up to 3 attempts, 3 seconds apart); once the
    attempts are exhausted the last exception is re-raised.
    """
    return WebBaseLoader(
        web_paths=[url],
        header_template=header_template,
    ).load()
193
+
194
+
195
def safe_load_all_urls(urls, loader=None):
    """
    Load documents from every URL in ``urls``, skipping the ones that fail.

    Args:
        urls: Iterable of URL strings. Exact duplicates are loaded only once
            (first occurrence wins) so the same page is not indexed twice.
        loader: Optional callable taking one URL and returning a list of
            documents. Defaults to ``load_url``.

    Returns:
        list: All documents returned by the loader, in URL order. URLs whose
        loader call raised are reported on stdout and omitted.
    """
    if loader is None:
        loader = load_url

    all_docs = []
    # dict.fromkeys drops repeated URLs while preserving first-seen order.
    for link in dict.fromkeys(urls):
        try:
            all_docs.extend(loader(link))
        except Exception as e:
            # Deliberate best-effort: one unreachable page must not abort
            # the whole ingestion, so report it and carry on.
            print(f"Skipping URL due to error: {link}\nError: {e}\n")
    return all_docs
209
+
210
+
211
+ # 3.3) Actually load the data from all URLs
212
+ all_loaded_docs = safe_load_all_urls(URLS)
213
+
214
+ # 3.4) Split the documents into manageable chunks
215
  text_splitter = RecursiveCharacterTextSplitter(
216
  chunk_size=1024,
217
  chunk_overlap=150,
218
  length_function=len
219
  )
220
+ docs = text_splitter.split_documents(all_loaded_docs)
221
+
222
+ # 3.5) Create embeddings
223
  embeddings = HuggingFaceEmbeddings(model_name='thenlper/gte-small')
224
+
225
+ # 3.6) Create a persistent directory to store vector DB
226
  persist_directory = 'docs/chroma/'
227
+ shutil.rmtree(persist_directory, ignore_errors=True) # remove old DB files
228
 
229
+ # 3.7) Build Chroma vector store
 
230
  vectordb = Chroma.from_documents(
231
  documents=docs,
232
  embedding=embeddings,
233
  persist_directory=persist_directory
234
  )
 
 
235
 
236
+ # 3.8) Create a retriever
237
+ retriever = vectordb.as_retriever(
238
+ search_kwargs={"k": 2},
239
+ search_type="mmr"
240
+ )
241
+
242
+
243
+ """
244
+ =========================================================
245
+ 4) PROMPT & CHAIN SETUP
246
+ =========================================================
247
+ """
248
+ # 4.1) Define the schema for final chatbot answers
249
  class FinalAnswer(BaseModel):
250
  question: str = Field()
251
  answer: str = Field()
252
 
 
253
  parser = PydanticOutputParser(pydantic_object=FinalAnswer)
254
 
255
+ # 4.2) Prompt template: system instructions
256
  template = """
257
  Your name is Greta and you are a recycling chatbot with the objective to anwer questions from user in English or Spanish /
258
  Has sido diseñado y creado por el Grupo 1 del Máster en Data Science & Big Data de la promoción 2023/2024 de la Universidad Complutense de Madrid. Este grupo está fromado por Rocío, María Guillermo, Alejandra, Paloma y Álvaro /
 
266
  {format_instructions}
267
  """
268
 
 
269
  sys_prompt = SystemMessagePromptTemplate.from_template(template)
270
  qa_prompt = ChatPromptTemplate(
271
  messages=[
272
  sys_prompt,
273
+ HumanMessagePromptTemplate.from_template("{question}")
274
+ ],
275
  partial_variables={"format_instructions": parser.get_format_instructions()}
276
  )
277
+
278
+ # 4.3) Define the LLM from HuggingFace
279
  llm = HuggingFaceHub(
280
  repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
281
  task="text-generation",
 
287
  },
288
  )
289
 
290
+ # 4.4) Create a ConversationalRetrievalChain that uses the above LLM
291
  qa_chain = ConversationalRetrievalChain.from_llm(
292
+ llm=llm,
293
+ memory=ConversationBufferMemory(
294
+ llm=llm,
295
+ memory_key="chat_history",
296
+ input_key='question',
297
+ output_key='output'
298
+ ),
299
+ retriever=retriever,
300
+ verbose=True,
301
  combine_docs_chain_kwargs={'prompt': qa_prompt},
302
+ get_chat_history=lambda h : h, # pass memory directly
303
+ rephrase_question=False,
304
+ output_key='output'
305
  )
306
 
307
+
308
def _extract_answer(output_string):
    """
    Return the value of the last ``"answer":`` field found in raw LLM text.

    The value is decoded as a JSON string, so escaped quotes, ``\\n`` and
    ``\\uXXXX`` sequences come back as real characters instead of cutting
    the answer short. Fallbacks, in order:

    * no ``"answer":`` marker  -> the whole text, stripped;
    * no opening quote after it -> the text after the marker, stripped;
    * string cannot be decoded (e.g. cut off mid-answer) -> the text between
      the opening quote and the next quote, or up to the end of the text.
    """
    import json  # local import keeps this block self-contained

    marker = '"answer":'
    # Use the LAST occurrence -- presumably because the raw output can echo
    # the prompt, whose format instructions also mention the field (confirm).
    marker_index = output_string.rfind(marker)
    if marker_index == -1:
        return output_string.strip()

    answer_part = output_string[marker_index + len(marker):].strip()
    quote_index = answer_part.find('"')
    if quote_index == -1:
        return answer_part

    try:
        # strict=False also accepts raw newlines/tabs inside the string,
        # which strict JSON would reject.
        value, _ = json.JSONDecoder(strict=False).raw_decode(
            answer_part, quote_index
        )
    except ValueError:
        # Malformed or unterminated string: fall back to plain slicing.
        closing_index = answer_part.find('"', quote_index + 1)
        if closing_index == -1:
            return answer_part[quote_index + 1:]
        return answer_part[quote_index + 1:closing_index]
    return value


def chat_interface(question, history):
    """
    Answer a user question through ``qa_chain`` and return the answer text.

    Args:
        question: The user's message.
        history: Chat history passed as the second argument by the chat UI;
            it is not used here (the chain is built with its own memory).

    Returns:
        str: The answer extracted from the chain's ``'output'`` text.
    """
    result = qa_chain.invoke({'question': question})
    return _extract_answer(result['output'])
325
 
326
 
327
+ # Gradio chat interface for the chatbot
328
  chatbot_gradio_app = gr.ChatInterface(
329
  fn=chat_interface,
330
+ title="<span style='color: rgb(243, 239, 224);'>Green Greta</span>"
331
  )
332
 
333
+
334
+ """
335
+ =========================================================
336
+ 5) BANNER / WELCOME TAB
337
+ =========================================================
338
+ """
339
  banner_tab_content = """
340
  <div style="background-color: #d3e3c3; text-align: center; padding: 20px; display: flex; flex-direction: column; align-items: center;">
341
  <img src="https://huggingface.co/spaces/ALVHB95/TFM_DataScience_APP/resolve/main/front_4.jpg" alt="Banner Image" style="width: 50%; max-width: 500px; margin: 0 auto;">
 
359
  """
360
  banner_tab = gr.Markdown(banner_tab_content)
361
 
362
+
363
+ """
364
+ =========================================================
365
+ 6) GRADIO FINAL APP: TABS
366
+ =========================================================
367
+ """
368
  app = gr.TabbedInterface(
369
  [banner_tab, image_gradio_app, chatbot_gradio_app],
370
  tab_names=["Welcome to Green Greta", "Green Greta Image Classification", "Green Greta Chat"],
371
  theme=theme
372
  )
373
 
374
+ # Enable queue() for concurrency and launch the Gradio app
375
  app.queue()
376
+ app.launch()