Spaces:
Running
Running
update langchain
Browse files
app.py
CHANGED
@@ -1,197 +1,258 @@
|
|
1 |
# app.py
|
2 |
|
|
|
|
|
|
|
|
|
|
|
3 |
import gradio as gr
|
4 |
-
|
5 |
import torch
|
6 |
import theme
|
7 |
theme = theme.Theme()
|
|
|
8 |
from huggingface_hub import from_pretrained_keras
|
9 |
from tensorflow.keras.applications import EfficientNetB0
|
10 |
-
|
11 |
import tensorflow as tf
|
12 |
from tensorflow import keras
|
|
|
13 |
from PIL import Image
|
14 |
-
from pydantic.v1 import BaseModel, Field
|
15 |
import shutil
|
16 |
-
import tenacity
|
17 |
|
18 |
-
#
|
19 |
-
from
|
|
|
|
|
|
|
20 |
from langchain.embeddings import HuggingFaceEmbeddings
|
21 |
from langchain.prompts import PromptTemplate
|
22 |
-
from langchain.chains import RetrievalQA
|
23 |
-
from langchain.prompts import ChatPromptTemplate
|
24 |
from langchain.schema import StrOutputParser
|
25 |
from langchain.schema.runnable import Runnable
|
26 |
from langchain.schema.runnable.config import RunnableConfig
|
27 |
-
from langchain.chains import
|
28 |
-
LLMChain, ConversationalRetrievalChain)
|
29 |
-
from langchain.vectorstores import Chroma
|
30 |
-
from langchain.memory import ConversationBufferMemory
|
31 |
-
from langchain.chains import LLMChain
|
32 |
from langchain.prompts.chat import ChatPromptTemplate, SystemMessagePromptTemplate
|
33 |
-
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate,
|
34 |
from langchain.output_parsers import PydanticOutputParser
|
35 |
from langchain_community.llms import HuggingFaceHub
|
36 |
from langchain_community.document_loaders import WebBaseLoader
|
|
|
|
|
37 |
|
38 |
-
from
|
39 |
-
|
40 |
-
custom_title = "<span style='color: rgb(243, 239, 224);'>Green Greta</span>"
|
41 |
-
|
42 |
|
43 |
-
# Cell 1: Image Classification Model
|
44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
model1 = from_pretrained_keras("rocioadlc/efficientnetB0_trash")
|
46 |
|
47 |
-
# Define class labels
|
48 |
class_labels = ['cardboard', 'glass', 'metal', 'paper', 'plastic', 'trash']
|
49 |
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
image_array = tf.keras.applications.efficientnet.preprocess_input(image_array)
|
56 |
-
# Expand the dimensions to create a batch
|
57 |
image_array = tf.expand_dims(image_array, 0)
|
58 |
-
#
|
59 |
predictions = model1.predict(image_array)
|
|
|
|
|
60 |
category_scores = {}
|
61 |
for i, class_label in enumerate(class_labels):
|
62 |
category_scores[class_label] = predictions[0][i].item()
|
63 |
|
64 |
return category_scores
|
65 |
|
66 |
-
|
67 |
image_gradio_app = gr.Interface(
|
68 |
fn=predict_image,
|
69 |
inputs=gr.Image(label="Image", sources=['upload', 'webcam'], type="pil"),
|
70 |
outputs=[gr.Label(label="Result")],
|
71 |
-
title=
|
72 |
theme=theme
|
73 |
)
|
74 |
|
75 |
-
# Cell 2: ChatBot Model
|
76 |
|
77 |
-
|
|
|
|
|
|
|
|
|
|
|
78 |
user_agent = UserAgent().random
|
79 |
header_template = {"User-Agent": user_agent}
|
80 |
|
81 |
-
#
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
|
|
|
|
|
|
|
|
162 |
)
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
167 |
text_splitter = RecursiveCharacterTextSplitter(
|
168 |
chunk_size=1024,
|
169 |
chunk_overlap=150,
|
170 |
length_function=len
|
171 |
)
|
172 |
-
docs = text_splitter.split_documents(
|
173 |
-
|
|
|
174 |
embeddings = HuggingFaceEmbeddings(model_name='thenlper/gte-small')
|
175 |
-
|
|
|
176 |
persist_directory = 'docs/chroma/'
|
|
|
177 |
|
178 |
-
#
|
179 |
-
shutil.rmtree(persist_directory, ignore_errors=True)
|
180 |
vectordb = Chroma.from_documents(
|
181 |
documents=docs,
|
182 |
embedding=embeddings,
|
183 |
persist_directory=persist_directory
|
184 |
)
|
185 |
-
# define retriever
|
186 |
-
retriever = vectordb.as_retriever(search_kwargs={"k": 2}, search_type="mmr")
|
187 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
188 |
class FinalAnswer(BaseModel):
|
189 |
question: str = Field()
|
190 |
answer: str = Field()
|
191 |
|
192 |
-
# Assuming you have a parser for the FinalAnswer class
|
193 |
parser = PydanticOutputParser(pydantic_object=FinalAnswer)
|
194 |
|
|
|
195 |
template = """
|
196 |
Your name is Greta and you are a recycling chatbot with the objective to anwer questions from user in English or Spanish /
|
197 |
Has sido diseñado y creado por el Grupo 1 del Máster en Data Science & Big Data de la promoción 2023/2024 de la Universidad Complutense de Madrid. Este grupo está fromado por Rocío, María Guillermo, Alejandra, Paloma y Álvaro /
|
@@ -205,14 +266,16 @@ User: {question}
|
|
205 |
{format_instructions}
|
206 |
"""
|
207 |
|
208 |
-
# Create the chat prompt templates
|
209 |
sys_prompt = SystemMessagePromptTemplate.from_template(template)
|
210 |
qa_prompt = ChatPromptTemplate(
|
211 |
messages=[
|
212 |
sys_prompt,
|
213 |
-
HumanMessagePromptTemplate.from_template("{question}")
|
|
|
214 |
partial_variables={"format_instructions": parser.get_format_instructions()}
|
215 |
)
|
|
|
|
|
216 |
llm = HuggingFaceHub(
|
217 |
repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
|
218 |
task="text-generation",
|
@@ -224,42 +287,55 @@ llm = HuggingFaceHub(
|
|
224 |
},
|
225 |
)
|
226 |
|
|
|
227 |
qa_chain = ConversationalRetrievalChain.from_llm(
|
228 |
-
llm
|
229 |
-
memory
|
230 |
-
|
231 |
-
|
|
|
|
|
|
|
|
|
|
|
232 |
combine_docs_chain_kwargs={'prompt': qa_prompt},
|
233 |
-
get_chat_history
|
234 |
-
rephrase_question
|
235 |
-
output_key
|
236 |
)
|
237 |
|
238 |
-
|
|
|
|
|
|
|
|
|
|
|
239 |
result = qa_chain.invoke({'question': question})
|
240 |
output_string = result['output']
|
241 |
|
242 |
-
# Find the index of the last occurrence of "answer": in the string
|
243 |
answer_index = output_string.rfind('"answer":')
|
244 |
-
|
245 |
-
# Extract the substring starting from the "answer": index
|
246 |
answer_part = output_string[answer_index + len('"answer":'):].strip()
|
247 |
|
248 |
# Find the next occurrence of a double quote to get the start of the answer value
|
249 |
quote_index = answer_part.find('"')
|
250 |
-
|
251 |
-
# Extract the answer value between double quotes
|
252 |
answer_value = answer_part[quote_index + 1:answer_part.find('"', quote_index + 1)]
|
253 |
|
254 |
return answer_value
|
255 |
|
256 |
|
|
|
257 |
chatbot_gradio_app = gr.ChatInterface(
|
258 |
fn=chat_interface,
|
259 |
-
title=
|
260 |
)
|
261 |
|
262 |
-
|
|
|
|
|
|
|
|
|
|
|
263 |
banner_tab_content = """
|
264 |
<div style="background-color: #d3e3c3; text-align: center; padding: 20px; display: flex; flex-direction: column; align-items: center;">
|
265 |
<img src="https://huggingface.co/spaces/ALVHB95/TFM_DataScience_APP/resolve/main/front_4.jpg" alt="Banner Image" style="width: 50%; max-width: 500px; margin: 0 auto;">
|
@@ -283,12 +359,18 @@ banner_tab_content = """
|
|
283 |
"""
|
284 |
banner_tab = gr.Markdown(banner_tab_content)
|
285 |
|
286 |
-
|
|
|
|
|
|
|
|
|
|
|
287 |
app = gr.TabbedInterface(
|
288 |
[banner_tab, image_gradio_app, chatbot_gradio_app],
|
289 |
tab_names=["Welcome to Green Greta", "Green Greta Image Classification", "Green Greta Chat"],
|
290 |
theme=theme
|
291 |
)
|
292 |
|
|
|
293 |
app.queue()
|
294 |
-
app.launch()
|
|
|
1 |
# app.py
|
2 |
|
3 |
+
"""
|
4 |
+
=========================================================
|
5 |
+
1) IMPORTS & DEPENDENCIES
|
6 |
+
=========================================================
|
7 |
+
"""
|
8 |
import gradio as gr
|
|
|
9 |
import torch
|
10 |
import theme
|
11 |
theme = theme.Theme()
|
12 |
+
|
13 |
from huggingface_hub import from_pretrained_keras
|
14 |
from tensorflow.keras.applications import EfficientNetB0
|
|
|
15 |
import tensorflow as tf
|
16 |
from tensorflow import keras
|
17 |
+
|
18 |
from PIL import Image
|
|
|
19 |
import shutil
|
|
|
20 |
|
21 |
+
import tenacity # for retrying failed requests
|
22 |
+
from fake_useragent import UserAgent
|
23 |
+
|
24 |
+
# LangChain
|
25 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
26 |
from langchain.embeddings import HuggingFaceEmbeddings
|
27 |
from langchain.prompts import PromptTemplate
|
|
|
|
|
28 |
from langchain.schema import StrOutputParser
|
29 |
from langchain.schema.runnable import Runnable
|
30 |
from langchain.schema.runnable.config import RunnableConfig
|
31 |
+
from langchain.chains import RetrievalQA, ConversationalRetrievalChain, LLMChain
|
|
|
|
|
|
|
|
|
32 |
from langchain.prompts.chat import ChatPromptTemplate, SystemMessagePromptTemplate
|
33 |
+
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate, MessagesPlaceholder
|
34 |
from langchain.output_parsers import PydanticOutputParser
|
35 |
from langchain_community.llms import HuggingFaceHub
|
36 |
from langchain_community.document_loaders import WebBaseLoader
|
37 |
+
from langchain.vectorstores import Chroma
|
38 |
+
from langchain.memory import ConversationBufferMemory
|
39 |
|
40 |
+
from pydantic.v1 import BaseModel, Field
|
|
|
|
|
|
|
41 |
|
|
|
42 |
|
43 |
+
"""
|
44 |
+
=========================================================
|
45 |
+
2) IMAGE CLASSIFICATION MODEL SETUP
|
46 |
+
=========================================================
|
47 |
+
"""
|
48 |
+
# Load a Keras model from HuggingFace Hub
|
49 |
model1 = from_pretrained_keras("rocioadlc/efficientnetB0_trash")
|
50 |
|
51 |
+
# Define class labels for the trash classification
|
52 |
class_labels = ['cardboard', 'glass', 'metal', 'paper', 'plastic', 'trash']
|
53 |
|
54 |
+
def predict_image(input_image):
    """
    Score an image against every entry of ``class_labels`` using ``model1``.

    The image is resized, converted to an array, passed through the
    EfficientNet ``preprocess_input`` helper, wrapped in a batch of one and
    handed to ``model1.predict``.

    Returns a dict that maps each label in ``class_labels`` to the score
    found at the same index of the first prediction row.
    """
    # The Gradio input is configured with type="pil"; PIL's resize takes
    # a (width, height) tuple.
    # NOTE(review): 244 x 224 looks like a typo for 224 x 224 -- confirm
    # against the input size the model was trained with before changing it.
    resized = input_image.resize((244, 224))
    pixels = tf.keras.preprocessing.image.img_to_array(resized)
    pixels = tf.keras.applications.efficientnet.preprocess_input(pixels)

    # predict() is given a batch, so add a leading axis of length 1.
    batch = tf.expand_dims(pixels, 0)
    first_row = model1.predict(batch)[0]

    # .item() turns each score into a plain Python number.
    return {
        label: first_row[idx].item()
        for idx, label in enumerate(class_labels)
    }
|
77 |
|
78 |
+
# Gradio interface for image classification
|
79 |
image_gradio_app = gr.Interface(
|
80 |
fn=predict_image,
|
81 |
inputs=gr.Image(label="Image", sources=['upload', 'webcam'], type="pil"),
|
82 |
outputs=[gr.Label(label="Result")],
|
83 |
+
title="<span style='color: rgb(243, 239, 224);'>Green Greta</span>",
|
84 |
theme=theme
|
85 |
)
|
86 |
|
|
|
87 |
|
88 |
+
"""
|
89 |
+
=========================================================
|
90 |
+
3) CHATBOT MODEL SETUP
|
91 |
+
=========================================================
|
92 |
+
"""
|
93 |
+
# 3.1) Define user agent to avoid blocking, etc.
|
94 |
user_agent = UserAgent().random
|
95 |
header_template = {"User-Agent": user_agent}
|
96 |
|
97 |
+
# 3.2) List of URLs to load for retrieval
|
98 |
+
URLS = [
|
99 |
+
"https://www.epa.gov/recycle/frequent-questions-recycling",
|
100 |
+
"https://www.whitehorsedc.gov.uk/vale-of-white-horse-district-council/recycling-rubbish-and-waste/lets-get-real-about-recycling/",
|
101 |
+
"https://www.teimas.com/blog/13-preguntas-y-respuestas-sobre-la-ley-de-residuos-07-2022",
|
102 |
+
"https://www.molok.com/es/blog/gestion-de-residuos-solidos-urbanos-rsu-10-dudas-comunes",
|
103 |
+
"https://espanol.epa.gov/espanol/el-reciclaje#valelapena",
|
104 |
+
"https://espanol.epa.gov/espanol/preguntas-frecuentes-sobre-reciclado-de-plastico-y-elaboracion-de-abono-vegetal",
|
105 |
+
"https://espanol.epa.gov/espanol/consejo-del-dia-como-reciclo-mis",
|
106 |
+
"https://espanol.epa.gov/espanol/recursos-para-reciclar-dispositivos-electronicos",
|
107 |
+
"https://www.epa.gov/recycle/electronics-donation-and-recycling",
|
108 |
+
"https://reducereutilizarecicla.org/que-es-el-reciclaje/",
|
109 |
+
"https://reducereutilizarecicla.org/contenedores-de-reciclaje/",
|
110 |
+
"https://reducereutilizarecicla.org/contenedores-de-reciclaje/contenedor-amarillo/",
|
111 |
+
"https://reducereutilizarecicla.org/contenedores-de-reciclaje/contenedor-azul/",
|
112 |
+
"https://reducereutilizarecicla.org/contenedores-de-reciclaje/contenedor-verde/",
|
113 |
+
"https://reducereutilizarecicla.org/contenedores-de-reciclaje/contenedor-marron-organico/",
|
114 |
+
"https://reducereutilizarecicla.org/contenedores-de-reciclaje/contenedor-gris-restos/",
|
115 |
+
"https://reducereutilizarecicla.org/contenedores-de-reciclaje/punto-limpio/",
|
116 |
+
"https://reducereutilizarecicla.org/donde-tirar-auriculares/",
|
117 |
+
"https://reducereutilizarecicla.org/donde-tirar-sartenes/",
|
118 |
+
"https://reducereutilizarecicla.org/donde-tirar-aceite-usado/",
|
119 |
+
"https://reducereutilizarecicla.org/como-se-reciclan-los-envases-tipo-brik/",
|
120 |
+
"https://reducereutilizarecicla.org/los-envases-del-verano/",
|
121 |
+
"https://reducereutilizarecicla.org/donde-tirar-radiografias/",
|
122 |
+
"https://reducereutilizarecicla.org/envases-ecologicos/",
|
123 |
+
"https://reducereutilizarecicla.org/donde-tirar-los-restos-de-pintura/",
|
124 |
+
"https://reducereutilizarecicla.org/valorizacion-de-residuos/",
|
125 |
+
"https://reducereutilizarecicla.org/como-reciclar-pilas/",
|
126 |
+
"https://reducereutilizarecicla.org/como-reciclar-capsulas-de-cafe/",
|
127 |
+
"https://reducereutilizarecicla.org/reciclando-cd/",
|
128 |
+
"https://reducereutilizarecicla.org/donde-tirar-neumaticos/",
|
129 |
+
"https://reducereutilizarecicla.org/como-reciclar-una-canasta-de-mimbre/",
|
130 |
+
"https://reducereutilizarecicla.org/como-funciona-el-contenedor-amarillo/",
|
131 |
+
"https://reducereutilizarecicla.org/donde-se-tiran-los-vapers/",
|
132 |
+
"https://reducereutilizarecicla.org/cuanto-tarda-una-bolsa-biodegradable-en-degradarse/",
|
133 |
+
"https://reducereutilizarecicla.org/donde-se-reciclan-los-juguetes/",
|
134 |
+
"https://reducereutilizarecicla.org/objetos-que-se-pueden-reutilizar/",
|
135 |
+
"https://reducereutilizarecicla.org/la-parafina-se-puede-reutilizar/",
|
136 |
+
"https://reducereutilizarecicla.org/planta-de-reciclaje-de-papel/",
|
137 |
+
"https://reducereutilizarecicla.org/como-saber-si-un-envase-es-reciclable/",
|
138 |
+
"https://reducereutilizarecicla.org/reutilizar-vasos-de-vela/",
|
139 |
+
"https://reducereutilizarecicla.org/bolsas-frio-calor/",
|
140 |
+
"https://reducereutilizarecicla.org/reciclar-y-reutilizar-materiales-de-construccion/",
|
141 |
+
"https://reducereutilizarecicla.org/que-es-exactamente-el-pet/",
|
142 |
+
"https://reducereutilizarecicla.org/tipos-de-reciclaje/",
|
143 |
+
"https://reducereutilizarecicla.org/que-hacer-con-palets-reciclados/",
|
144 |
+
"https://reducereutilizarecicla.org/vertederos-controlados/",
|
145 |
+
"https://reducereutilizarecicla.org/donde-tirar-escombros/",
|
146 |
+
"https://reducereutilizarecicla.org/como-reciclar-los-residuos-de-ps-poliestireno/",
|
147 |
+
"https://reducereutilizarecicla.org/tirar-la-basura-sin-bolsas/",
|
148 |
+
"https://reducereutilizarecicla.org/tirar-el-palo-de-la-fregona/",
|
149 |
+
"https://reducereutilizarecicla.org/la-mejor-manera-de-reciclar-una-pala-de-padel/",
|
150 |
+
"https://reducereutilizarecicla.org/sabes-donde-tirar-las-llantas-viejas-de-un-coche/",
|
151 |
+
"https://reducereutilizarecicla.org/sabes-donde-tirar-el-arbol-de-navidad/",
|
152 |
+
"https://reducereutilizarecicla.org/clavos-tornillos-herramientas-donde-tirar-hierro/",
|
153 |
+
"https://reducereutilizarecicla.org/donde-tirar-un-secador-de-pelo-contenedor-o-punto-limpio/",
|
154 |
+
"https://reducereutilizarecicla.org/donde-tirar-electrodomesticos/",
|
155 |
+
"https://reducereutilizarecicla.org/donde-puedo-tirar-ramas-de-arboles/",
|
156 |
+
"https://reducereutilizarecicla.org/donde-tirar-escombros/",
|
157 |
+
"https://reducereutilizarecicla.org/donde-se-tira-el-muerdago-quemado/",
|
158 |
+
"https://reducereutilizarecicla.org/sandalias-caucho-reciclado-neumaticos/",
|
159 |
+
"https://reducereutilizarecicla.org/ideas-para-reciclar-aspas-de-ventilador-de-techo/",
|
160 |
+
"https://reducereutilizarecicla.org/reciclar-sacos-dormir/",
|
161 |
+
"https://reducereutilizarecicla.org/reciclar-sillas-playa/",
|
162 |
+
"https://reducereutilizarecicla.org/donde-tirar-antipolillas/",
|
163 |
+
"https://reducereutilizarecicla.org/que-hacer-con-los-juguetes-viejos/",
|
164 |
+
"https://reducereutilizarecicla.org/como-utilizar-las-mascarillas-y-el-gel-hidroalcoholico-en-la-playa/",
|
165 |
+
"https://reducereutilizarecicla.org/ideas-para-reciclar-un-ventilador-de-pie/",
|
166 |
+
"https://reducereutilizarecicla.org/donde-tirar-gasoil/",
|
167 |
+
"https://reducereutilizarecicla.org/donde-puedo-tirar-basura-electronica/",
|
168 |
+
"https://reducereutilizarecicla.org/donde-tirar-agujas/",
|
169 |
+
"https://reducereutilizarecicla.org/donde-tirar-residuos-peligrosos/",
|
170 |
+
"https://reducereutilizarecicla.org/donde-tirar-los-cables/",
|
171 |
+
"https://reducereutilizarecicla.org/donde-tirar-bicicletas/",
|
172 |
+
"https://reducereutilizarecicla.org/donde-tirar-maletas/",
|
173 |
+
"https://reducereutilizarecicla.org/como-reciclar-una-pantalla/",
|
174 |
+
"https://reducereutilizarecicla.org/donde-tirar-ropa-usada/"
|
175 |
+
]
|
176 |
+
|
177 |
+
|
178 |
+
@tenacity.retry(
    stop=tenacity.stop_after_attempt(3),  # give up after the third attempt
    wait=tenacity.wait_fixed(3),  # pause 3 seconds between attempts
    reraise=True,
)
def load_url(url):
    """
    Load a single URL with ``WebBaseLoader`` and return what it loads.

    The request is sent with the module-level ``header_template``.  The
    tenacity decorator calls this function again when it raises: at most
    three attempts, with a fixed 3-second wait in between.  With
    ``reraise=True`` the last underlying exception is what the caller sees
    once the attempts are exhausted.
    """
    return WebBaseLoader(
        web_paths=[url],
        header_template=header_template,
    ).load()
|
193 |
+
|
194 |
+
|
195 |
+
def safe_load_all_urls(urls):
    """
    Load documents from every URL in ``urls``, skipping the ones that fail.

    Each distinct URL is passed to ``load_url`` once, in first-seen order.
    Repeated entries are ignored so the same page is not fetched (and later
    split and indexed) twice.  A URL whose load raises is reported on stdout
    and skipped, so one bad link does not abort the whole run.

    Args:
        urls: Iterable of URL strings.  May be empty.

    Returns:
        A flat list with the documents of every URL that loaded
        successfully; an empty list if none did.
    """
    all_docs = []
    # dict.fromkeys drops duplicates while keeping insertion order.
    for link in dict.fromkeys(urls):
        try:
            all_docs.extend(load_url(link))
        except Exception as e:
            # Deliberate best-effort: load_url is already wrapped in a retry
            # policy, so report the failure and carry on with the rest.
            print(f"Skipping URL due to error: {link}\nError: {e}\n")
    return all_docs
|
209 |
+
|
210 |
+
|
211 |
+
# 3.3) Actually load the data from all URLs
|
212 |
+
all_loaded_docs = safe_load_all_urls(URLS)
|
213 |
+
|
214 |
+
# 3.4) Split the documents into manageable chunks
|
215 |
text_splitter = RecursiveCharacterTextSplitter(
|
216 |
chunk_size=1024,
|
217 |
chunk_overlap=150,
|
218 |
length_function=len
|
219 |
)
|
220 |
+
docs = text_splitter.split_documents(all_loaded_docs)
|
221 |
+
|
222 |
+
# 3.5) Create embeddings
|
223 |
embeddings = HuggingFaceEmbeddings(model_name='thenlper/gte-small')
|
224 |
+
|
225 |
+
# 3.6) Create a persistent directory to store vector DB
|
226 |
persist_directory = 'docs/chroma/'
|
227 |
+
shutil.rmtree(persist_directory, ignore_errors=True) # remove old DB files
|
228 |
|
229 |
+
# 3.7) Build Chroma vector store
|
|
|
230 |
vectordb = Chroma.from_documents(
|
231 |
documents=docs,
|
232 |
embedding=embeddings,
|
233 |
persist_directory=persist_directory
|
234 |
)
|
|
|
|
|
235 |
|
236 |
+
# 3.8) Create a retriever
|
237 |
+
retriever = vectordb.as_retriever(
|
238 |
+
search_kwargs={"k": 2},
|
239 |
+
search_type="mmr"
|
240 |
+
)
|
241 |
+
|
242 |
+
|
243 |
+
"""
|
244 |
+
=========================================================
|
245 |
+
4) PROMPT & CHAIN SETUP
|
246 |
+
=========================================================
|
247 |
+
"""
|
248 |
+
# 4.1) Define the schema for final chatbot answers
|
249 |
class FinalAnswer(BaseModel):
|
250 |
question: str = Field()
|
251 |
answer: str = Field()
|
252 |
|
|
|
253 |
parser = PydanticOutputParser(pydantic_object=FinalAnswer)
|
254 |
|
255 |
+
# 4.2) Prompt template: system instructions
|
256 |
template = """
|
257 |
Your name is Greta and you are a recycling chatbot with the objective to anwer questions from user in English or Spanish /
|
258 |
Has sido diseñado y creado por el Grupo 1 del Máster en Data Science & Big Data de la promoción 2023/2024 de la Universidad Complutense de Madrid. Este grupo está fromado por Rocío, María Guillermo, Alejandra, Paloma y Álvaro /
|
|
|
266 |
{format_instructions}
|
267 |
"""
|
268 |
|
|
|
269 |
sys_prompt = SystemMessagePromptTemplate.from_template(template)
|
270 |
qa_prompt = ChatPromptTemplate(
|
271 |
messages=[
|
272 |
sys_prompt,
|
273 |
+
HumanMessagePromptTemplate.from_template("{question}")
|
274 |
+
],
|
275 |
partial_variables={"format_instructions": parser.get_format_instructions()}
|
276 |
)
|
277 |
+
|
278 |
+
# 4.3) Define the LLM from HuggingFace
|
279 |
llm = HuggingFaceHub(
|
280 |
repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
|
281 |
task="text-generation",
|
|
|
287 |
},
|
288 |
)
|
289 |
|
290 |
+
# 4.4) Create a ConversationalRetrievalChain that uses the above LLM
|
291 |
qa_chain = ConversationalRetrievalChain.from_llm(
|
292 |
+
llm=llm,
|
293 |
+
memory=ConversationBufferMemory(
|
294 |
+
llm=llm,
|
295 |
+
memory_key="chat_history",
|
296 |
+
input_key='question',
|
297 |
+
output_key='output'
|
298 |
+
),
|
299 |
+
retriever=retriever,
|
300 |
+
verbose=True,
|
301 |
combine_docs_chain_kwargs={'prompt': qa_prompt},
|
302 |
+
get_chat_history=lambda h : h, # pass memory directly
|
303 |
+
rephrase_question=False,
|
304 |
+
output_key='output'
|
305 |
)
|
306 |
|
307 |
+
|
308 |
+
def _extract_answer(output_string):
    """Return the value of the last ``"answer":`` field found in the text."""
    import json  # local import: only this helper needs it

    marker = '"answer":'
    marker_index = output_string.rfind(marker)
    if marker_index == -1:
        # No structured answer at all: show the raw text rather than an
        # arbitrary slice of it.
        return output_string.strip()

    answer_part = output_string[marker_index + len(marker):].strip()
    quote_index = answer_part.find('"')
    if quote_index == -1:
        # The value is not a quoted string; return it untouched.
        return answer_part

    try:
        # Decode the JSON string literal properly so escaped quotes do not
        # cut the answer short and escapes such as \n or \u00f1 are
        # resolved.  strict=False tolerates raw control characters (for
        # example literal newlines) inside the string.
        value, _ = json.JSONDecoder(strict=False).raw_decode(
            answer_part[quote_index:]
        )
        return value
    except ValueError:
        # Malformed literal (unterminated, invalid escape): fall back to the
        # text up to the next quote, or to the end if there is none.
        closing = answer_part.find('"', quote_index + 1)
        if closing == -1:
            return answer_part[quote_index + 1:]
        return answer_part[quote_index + 1:closing]


def chat_interface(question, history):
    """
    Answer a chat message by running it through ``qa_chain``.

    Args:
        question: The user's message; sent to the chain under the
            ``'question'`` key.
        history: Unused here.  It is accepted because this function is
            registered as the ``fn`` of ``gr.ChatInterface``; ``qa_chain``
            is built with its own conversation memory.

    Returns:
        The text of the last ``"answer"`` field in the chain's ``'output'``
        string, or the stripped raw output when no such field is present.
    """
    result = qa_chain.invoke({'question': question})
    return _extract_answer(result['output'])
|
325 |
|
326 |
|
327 |
+
# Gradio chat interface for the chatbot
|
328 |
chatbot_gradio_app = gr.ChatInterface(
|
329 |
fn=chat_interface,
|
330 |
+
title="<span style='color: rgb(243, 239, 224);'>Green Greta</span>"
|
331 |
)
|
332 |
|
333 |
+
|
334 |
+
"""
|
335 |
+
=========================================================
|
336 |
+
5) BANNER / WELCOME TAB
|
337 |
+
=========================================================
|
338 |
+
"""
|
339 |
banner_tab_content = """
|
340 |
<div style="background-color: #d3e3c3; text-align: center; padding: 20px; display: flex; flex-direction: column; align-items: center;">
|
341 |
<img src="https://huggingface.co/spaces/ALVHB95/TFM_DataScience_APP/resolve/main/front_4.jpg" alt="Banner Image" style="width: 50%; max-width: 500px; margin: 0 auto;">
|
|
|
359 |
"""
|
360 |
banner_tab = gr.Markdown(banner_tab_content)
|
361 |
|
362 |
+
|
363 |
+
"""
|
364 |
+
=========================================================
|
365 |
+
6) GRADIO FINAL APP: TABS
|
366 |
+
=========================================================
|
367 |
+
"""
|
368 |
app = gr.TabbedInterface(
|
369 |
[banner_tab, image_gradio_app, chatbot_gradio_app],
|
370 |
tab_names=["Welcome to Green Greta", "Green Greta Image Classification", "Green Greta Chat"],
|
371 |
theme=theme
|
372 |
)
|
373 |
|
374 |
+
# Enable queue() for concurrency and launch the Gradio app
|
375 |
app.queue()
|
376 |
+
app.launch()
|