Spaces:

IProject-10
/

IOPL-Chatbot-2

Running

App Files Files Community

IProject-10 committed 6 days ago

Commit

8af136f

verified ·

1 Parent(s): 32dad47

Upload 2 files

Browse files

Files changed (2) hide show

app.py +283 -0
requirements.txt +0 -0

app.py ADDED Viewed

	@@ -0,0 +1,283 @@

+import nltk
+nltk.download('punkt')
+nltk.download('punkt_tab')
+# SECTIONED URL LIST (in case we want to tag later)
+# Maps a section label to the list of page URLs scraped for that section; the
+# scraping loop stores the label as the "section" metadata of each scraped page.
+url_dict = {
+    "Website Designing": [
+        "https://www.imageonline.co.in/website-designing-mumbai.html",
+        "https://www.imageonline.co.in/domain-hosting-services-india.html",
+        "https://www.imageonline.co.in/best-seo-company-mumbai.html",
+        "https://www.imageonline.co.in/wordpress-blog-designing-india.html",
+        "https://www.imageonline.co.in/social-media-marketing-company-mumbai.html",
+        "https://www.imageonline.co.in/website-template-customization-india.html",
+        "https://www.imageonline.co.in/regular-website-maintanence-services.html",
+        "https://www.imageonline.co.in/mobile-app-designing-mumbai.html",
+        "https://www.imageonline.co.in/web-application-screen-designing.html"
+    ],
+    "Website Development": [
+        "https://www.imageonline.co.in/website-development-mumbai.html",
+        "https://www.imageonline.co.in/open-source-customization.html",
+        "https://www.imageonline.co.in/ecommerce-development-company-mumbai.html",
+        "https://www.imageonline.co.in/website-with-content-management-system.html",
+        "https://www.imageonline.co.in/web-application-development-india.html"
+    ],
+    "Mobile App Development": [
+        "https://www.imageonline.co.in/mobile-app-development-company-mumbai.html"
+    ],
+    "About Us": [
+        "https://www.imageonline.co.in/about-us.html",
+        "https://www.imageonline.co.in/vision.html",
+        "https://www.imageonline.co.in/team.html"
+    ],
+    "Testimonials": [
+        "https://www.imageonline.co.in/testimonial.html"
+    ]
+}
+import trafilatura
+import requests
+# Function to extract clean text using trafilatura
+def extract_clean_text(url):
+    """
+    Fetch and extract clean main content from a URL using trafilatura.
+    Returns None if content couldn't be extracted.
+    """
+    try:
+        downloaded = trafilatura.fetch_url(url)
+        if downloaded:
+            content = trafilatura.extract(downloaded, include_comments=False, include_tables=False)
+            return content
+    except Exception as e:
+        print(f"Error fetching {url}: {e}")
+    return None
+# Scrape data and prepare for RAG with metadata
+scraped_data = []
+for section, urls in url_dict.items():
+    for url in urls:
+        print(f"🟩 Scraping: {url}")
+        text = extract_clean_text(url)
+        if text:
+            print(f"✅ Extracted {len(text)} characters.\n")
+            scraped_data.append({
+                "content": text,
+                "metadata": {
+                    "source": url,
+                    "section": section
+                }
+            })
+        else:
+            print(f"❌ Failed to extract content from {url}.\n")
+print(f"Total pages scraped: {len(scraped_data)}")
+import tiktoken
+from nltk.tokenize import sent_tokenize
+# Initialize the tiktoken "cl100k_base" encoding; chunk_text uses it to count tokens when sizing chunks.
+# NOTE(review): cl100k_base is an OpenAI encoding, so token counts for the BGE embedding model and the
+# Llama model used later in this file are presumably only approximate — confirm the 400-token budget.
+tokenizer = tiktoken.get_encoding("cl100k_base")
+def chunk_text(text, max_tokens=400):
+    """
+    Chunk text into overlapping segments based on sentence boundaries and token limits.
+    """
+    sentences = sent_tokenize(text)
+    chunks = []
+    current_chunk = []
+    for sentence in sentences:
+        current_chunk.append(sentence)
+        tokens = tokenizer.encode(" ".join(current_chunk))
+        if len(tokens) > max_tokens:
+            # Finalize current chunk without last sentence
+            current_chunk.pop()
+            chunks.append(" ".join(current_chunk).strip())
+            current_chunk = [sentence]  # Start new chunk with overflow sentence
+    # Append final chunk
+    if current_chunk:
+        chunks.append(" ".join(current_chunk).strip())
+    return chunks
+chunked_data = []
+for item in scraped_data:
+    text = item["content"]
+    metadata = item["metadata"]
+    chunks = chunk_text(text, max_tokens=400)
+    for chunk in chunks:
+        chunked_data.append({
+            "content": chunk,
+            "metadata": metadata  # Keep the same URL + section for each chunk
+        })
+# Extract text chunks from chunked_data for embedding
+texts_to_embed = [item["content"] for item in chunked_data]
+from sentence_transformers import SentenceTransformer
+# Load the sentence-transformers model used to embed the text chunks and the sample query
+embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5")
+def embed_chunks(text_list, model):
+    """
+    Generate embeddings for a list of text chunks.
+    """
+    return model.encode(text_list, convert_to_numpy=True)
+# Generate embeddings
+embeddings = embed_chunks(texts_to_embed, embedding_model)
+print(f"✅ Generated {len(embeddings)} embeddings")
+print(f"🔹 Shape of first embedding: {embeddings[0].shape}")
+import chromadb
+import uuid
+# Initialize ChromaDB client (persistent storage)
+chroma_client = chromadb.PersistentClient(path="./chroma_store")
+# Create or get collection
+collection = chroma_client.get_or_create_collection(name="imageonline_chunks")
+# Extract documents, embeddings, metadatas
+documents = [item["content"] for item in chunked_data]
+metadatas = [item["metadata"] for item in chunked_data]
+ids = [str(uuid.uuid4()) for _ in documents]
+# Safety check
+assert len(documents) == len(embeddings) == len(metadatas), "Data length mismatch!"
+# Add to ChromaDB
+collection.add(
+    documents=documents,
+    embeddings=embeddings.tolist(),
+    metadatas=metadatas,
+    ids=ids
+)
+# Sample query
+query = "web design company"
+query_embedding = embedding_model.encode([query])[0]
+# Query ChromaDB
+results = collection.query(
+    query_embeddings=[query_embedding.tolist()],
+    n_results=3
+)
+# Display results
+for i in range(len(results['documents'][0])):
+    print(f"\n🔍 Match {i+1}:")
+    print(f"Content: {results['documents'][0][i][:200]}...")
+    print(f"📎 Metadata: {results['metadatas'][0][i]}")
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.runnables import RunnableLambda, RunnablePassthrough
+from langchain_core.output_parsers import StrOutputParser
+from langchain_together import ChatTogether
+from langchain_community.vectorstores import Chroma
+from langchain_community.embeddings import HuggingFaceEmbeddings
+# Initialize vectorstore
+embedding_function = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
+vectorstore = Chroma(
+    client=chroma_client,  # from your previous chroma setup
+    collection_name="imageonline_chunks",
+    embedding_function=embedding_function
+)
+# Create retriever
+retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
+def retrieve_and_format(query):
+    docs = retriever.get_relevant_documents(query)
+    context_strings = []
+    for doc in docs:
+        content = doc.page_content
+        metadata = doc.metadata
+        source = metadata.get("source", "")
+        section = metadata.get("section", "")
+        context_strings.append(f"[{section}] {content}\n(Source: {source})")
+    return "\n\n".join(context_strings)
+llm = ChatTogether(
+    model="meta-llama/Llama-3-8b-chat-hf",
+    temperature=0.3,
+    max_tokens=1024,
+    top_p=0.7,
+    together_api_key="a36246d65d8290f43667350b364c5b6bb8562eb50a4b947eec5bd7e79f2dffc6"  # Replace before deployment or use os.getenv
+)
+prompt = ChatPromptTemplate.from_template("""
+You are an expert assistant for ImageOnline Web Solutions.
+Answer the user's query based ONLY on the following context:
+{context}
+Query: {question}
+""")
+rag_chain = (
+    {"context": RunnableLambda(retrieve_and_format), "question": RunnablePassthrough()}
+    | prompt
+    | llm
+    | StrOutputParser()
+)
+import gradio as gr
+def chat_interface(message, history):
+    history = history or []
+    # Display user message
+    history.append(("🧑 You: " + message, "⏳ Generating response..."))
+    try:
+        # Call RAG pipeline
+        answer = rag_chain.invoke(message)
+        # Replace placeholder with actual response
+        history[-1] = ("🧑 You: " + message, "🤖 Bot: " + answer)
+    except Exception as e:
+        error_msg = f"⚠️ Error: {str(e)}"
+        history[-1] = ("🧑 You: " + message, f"🤖 Bot: {error_msg}")
+    return history, history
+def launch_gradio():
+    with gr.Blocks() as demo:
+        gr.Markdown("# 💬 ImageOnline RAG Chatbot")
+        gr.Markdown("Ask about Website Designing, App Development, SEO, Hosting, etc.")
+        chatbot = gr.Chatbot()
+        state = gr.State([])
+        with gr.Row():
+            msg = gr.Textbox(placeholder="Ask your question here...", show_label=False, scale=8)
+            send_btn = gr.Button("📨 Send", scale=1)
+        msg.submit(chat_interface, inputs=[msg, state], outputs=[chatbot, state])
+        send_btn.click(chat_interface, inputs=[msg, state], outputs=[chatbot, state])
+        with gr.Row():
+            clear_btn = gr.Button("🧹 Clear Chat")
+            clear_btn.click(fn=lambda: ([], []), outputs=[chatbot, state])
+    return demo
+# Script entry point: build the UI and start the Gradio app only when this file is run directly
+if __name__ == "__main__":
+    demo = launch_gradio()
+    demo.launch()

requirements.txt ADDED Viewed

File without changes