IProject-10 committed
Commit 8af136f · verified · 1 Parent(s): 32dad47

Upload 2 files

Files changed (2)
  1. app.py +283 -0
  2. requirements.txt +0 -0
app.py ADDED
@@ -0,0 +1,283 @@
import os  # used below to read the Together API key from the environment
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

# SECTIONED URL LIST (in case we want to tag later)
url_dict = {
    "Website Designing": [
        "https://www.imageonline.co.in/website-designing-mumbai.html",
        "https://www.imageonline.co.in/domain-hosting-services-india.html",
        "https://www.imageonline.co.in/best-seo-company-mumbai.html",
        "https://www.imageonline.co.in/wordpress-blog-designing-india.html",
        "https://www.imageonline.co.in/social-media-marketing-company-mumbai.html",
        "https://www.imageonline.co.in/website-template-customization-india.html",
        "https://www.imageonline.co.in/regular-website-maintanence-services.html",
        "https://www.imageonline.co.in/mobile-app-designing-mumbai.html",
        "https://www.imageonline.co.in/web-application-screen-designing.html"
    ],
    "Website Development": [
        "https://www.imageonline.co.in/website-development-mumbai.html",
        "https://www.imageonline.co.in/open-source-customization.html",
        "https://www.imageonline.co.in/ecommerce-development-company-mumbai.html",
        "https://www.imageonline.co.in/website-with-content-management-system.html",
        "https://www.imageonline.co.in/web-application-development-india.html"
    ],
    "Mobile App Development": [
        "https://www.imageonline.co.in/mobile-app-development-company-mumbai.html"
    ],
    "About Us": [
        "https://www.imageonline.co.in/about-us.html",
        "https://www.imageonline.co.in/vision.html",
        "https://www.imageonline.co.in/team.html"
    ],
    "Testimonials": [
        "https://www.imageonline.co.in/testimonial.html"
    ]
}
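
# Editor's sketch (illustrative, not part of the original upload): a cheap sanity
# check that every section contributes at least one URL before scraping starts.
for _section, _urls in url_dict.items():
    print(f"{_section}: {len(_urls)} page(s)")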

import trafilatura
import requests

# Function to extract clean text using trafilatura
def extract_clean_text(url):
    """
    Fetch and extract clean main content from a URL using trafilatura.
    Returns None if content couldn't be extracted.
    """
    try:
        downloaded = trafilatura.fetch_url(url)
        if downloaded:
            content = trafilatura.extract(downloaded, include_comments=False, include_tables=False)
            return content
    except Exception as e:
        print(f"Error fetching {url}: {e}")
    return None
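
# Editor's note (hedged sketch, not in the original commit): the extractor can be
# smoke-tested on a single known page before the full crawl below; commented out so
# the script does not fetch the same page twice.
# _preview = extract_clean_text("https://www.imageonline.co.in/about-us.html")
# print(_preview[:200] if _preview else "no content extracted")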

# Scrape data and prepare for RAG with metadata
scraped_data = []

for section, urls in url_dict.items():
    for url in urls:
        print(f"🟩 Scraping: {url}")
        text = extract_clean_text(url)
        if text:
            print(f"✅ Extracted {len(text)} characters.\n")
            scraped_data.append({
                "content": text,
                "metadata": {
                    "source": url,
                    "section": section
                }
            })
        else:
            print(f"❌ Failed to extract content from {url}.\n")

print(f"Total pages scraped: {len(scraped_data)}")

import tiktoken
from nltk.tokenize import sent_tokenize

# Initialize GPT tokenizer (cl100k_base works with Together.ai and OpenAI APIs)
tokenizer = tiktoken.get_encoding("cl100k_base")

def chunk_text(text, max_tokens=400):
    """
    Chunk text into segments based on sentence boundaries and a token limit;
    the sentence that overflows a chunk starts the next one.
    """
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []

    for sentence in sentences:
        current_chunk.append(sentence)
        tokens = tokenizer.encode(" ".join(current_chunk))
        if len(tokens) > max_tokens:
            # Finalize current chunk without last sentence
            current_chunk.pop()
            chunks.append(" ".join(current_chunk).strip())
            current_chunk = [sentence]  # Start new chunk with overflow sentence

    # Append final chunk
    if current_chunk:
        chunks.append(" ".join(current_chunk).strip())

    return chunks
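
# Editor's sketch (not part of the original commit): a toy call; both sentences sit
# far below the 400-token limit, so a single chunk comes back.
# assert len(chunk_text("ImageOnline builds websites. It also offers SEO services.")) == 1
#
# Hedged observation: a sentence longer than max_tokens that begins a chunk leaves an
# empty string in the output (the chunk finalized just before it); callers can filter
# those out, e.g.:
# page_chunks = [c for c in chunk_text(page_text) if c]  # page_text: any scraped string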

chunked_data = []

for item in scraped_data:
    text = item["content"]
    metadata = item["metadata"]

    chunks = chunk_text(text, max_tokens=400)

    for chunk in chunks:
        chunked_data.append({
            "content": chunk,
            "metadata": metadata  # Keep the same URL + section for each chunk
        })

# Extract text chunks from chunked_data for embedding
texts_to_embed = [item["content"] for item in chunked_data]

from sentence_transformers import SentenceTransformer

# Load the embedding model
embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5")

def embed_chunks(text_list, model):
    """
    Generate embeddings for a list of text chunks.
    """
    return model.encode(text_list, convert_to_numpy=True)

# Generate embeddings
embeddings = embed_chunks(texts_to_embed, embedding_model)

print(f"✅ Generated {len(embeddings)} embeddings")
print(f"🔹 Shape of first embedding: {embeddings[0].shape}")
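
# Editor's note (assumption, not in the original commit): BGE models are commonly
# used with L2-normalized vectors so that cosine / inner-product scores are
# comparable; sentence-transformers exposes this directly if wanted:
# embeddings = embedding_model.encode(texts_to_embed, convert_to_numpy=True, normalize_embeddings=True)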

import chromadb
import uuid

# Initialize ChromaDB client (persistent storage)
chroma_client = chromadb.PersistentClient(path="./chroma_store")

# Create or get collection
collection = chroma_client.get_or_create_collection(name="imageonline_chunks")

# Extract documents, embeddings, metadatas
documents = [item["content"] for item in chunked_data]
metadatas = [item["metadata"] for item in chunked_data]
ids = [str(uuid.uuid4()) for _ in documents]

# Safety check
assert len(documents) == len(embeddings) == len(metadatas), "Data length mismatch!"

# Add to ChromaDB
collection.add(
    documents=documents,
    embeddings=embeddings.tolist(),
    metadatas=metadatas,
    ids=ids
)
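
# Editor's note (hedged): the uuid4 ids above are new on every run, so re-running the
# script against the same ./chroma_store appends duplicate chunks. One sketch of a
# stable alternative (assumes the scrape order is deterministic):
# ids = [f"{item['metadata']['source']}#{i}" for i, item in enumerate(chunked_data)]
print(f"Collection now holds {collection.count()} chunks")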

# Sample query
query = "web design company"
query_embedding = embedding_model.encode([query])[0]

# Query ChromaDB
results = collection.query(
    query_embeddings=[query_embedding.tolist()],
    n_results=3
)

# Display results
for i in range(len(results['documents'][0])):
    print(f"\n🔍 Match {i+1}:")
    print(f"Content: {results['documents'][0][i][:200]}...")
    print(f"📎 Metadata: {results['metadatas'][0][i]}")
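
# Editor's sketch (assumption): the metadata stored with each chunk also supports
# filtered queries, e.g. restricting matches to a single section:
# filtered = collection.query(
#     query_embeddings=[query_embedding.tolist()],
#     n_results=3,
#     where={"section": "Website Development"},
# )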

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_together import ChatTogether

from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

# Initialize vectorstore
embedding_function = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

vectorstore = Chroma(
    client=chroma_client,  # from your previous chroma setup
    collection_name="imageonline_chunks",
    embedding_function=embedding_function
)

# Create retriever
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

def retrieve_and_format(query):
    docs = retriever.get_relevant_documents(query)

    context_strings = []
    for doc in docs:
        content = doc.page_content
        metadata = doc.metadata
        source = metadata.get("source", "")
        section = metadata.get("section", "")
        context_strings.append(f"[{section}] {content}\n(Source: {source})")

    return "\n\n".join(context_strings)
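
# Editor's note (hedged, not in the original commit): in recent langchain-core
# releases retriever.invoke(query) is the preferred spelling of
# get_relevant_documents(). The formatter can be smoke-tested on its own:
# print(retrieve_and_format("ecommerce development")[:300])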

llm = ChatTogether(
    model="meta-llama/Llama-3-8b-chat-hf",
    temperature=0.3,
    max_tokens=1024,
    top_p=0.7,
    together_api_key=os.getenv("TOGETHER_API_KEY")  # set TOGETHER_API_KEY in the environment instead of hardcoding the key
)

prompt = ChatPromptTemplate.from_template("""
You are an expert assistant for ImageOnline Web Solutions.

Answer the user's query based ONLY on the following context:

{context}

Query: {question}
""")

rag_chain = (
    {"context": RunnableLambda(retrieve_and_format), "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)
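
# Editor's sketch (not in the original commit): the chain maps a question string to an
# answer string; commented out so importing the module does not spend LLM tokens.
# print(rag_chain.invoke("Which ecommerce development services does ImageOnline offer?"))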

import gradio as gr

def chat_interface(message, history):
    history = history or []

    # Display user message
    history.append(("🧑 You: " + message, "⏳ Generating response..."))

    try:
        # Call RAG pipeline
        answer = rag_chain.invoke(message)

        # Replace placeholder with actual response
        history[-1] = ("🧑 You: " + message, "🤖 Bot: " + answer)

    except Exception as e:
        error_msg = f"⚠️ Error: {str(e)}"
        history[-1] = ("🧑 You: " + message, f"🤖 Bot: {error_msg}")

    return history, history

def launch_gradio():
    with gr.Blocks() as demo:
        gr.Markdown("# 💬 ImageOnline RAG Chatbot")
        gr.Markdown("Ask about Website Designing, App Development, SEO, Hosting, etc.")

        chatbot = gr.Chatbot()
        state = gr.State([])

        with gr.Row():
            msg = gr.Textbox(placeholder="Ask your question here...", show_label=False, scale=8)
            send_btn = gr.Button("📨 Send", scale=1)

        msg.submit(chat_interface, inputs=[msg, state], outputs=[chatbot, state])
        send_btn.click(chat_interface, inputs=[msg, state], outputs=[chatbot, state])
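
        # Editor's sketch (hedged, not in the original commit): a common Gradio pattern
        # is to chain .then() onto the submit wiring above so the textbox clears after
        # each question, e.g. (would replace the msg.submit line above):
        # msg.submit(chat_interface, inputs=[msg, state], outputs=[chatbot, state]).then(
        #     lambda: "", inputs=None, outputs=msg
        # )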

        with gr.Row():
            clear_btn = gr.Button("🧹 Clear Chat")
            clear_btn.click(fn=lambda: ([], []), outputs=[chatbot, state])

    return demo

if __name__ == "__main__":
    demo = launch_gradio()
    demo.launch()
requirements.txt ADDED
File without changes