Spaces:

IProject-10
/

IOPL-Chatbot

Sleeping

App Files Files Community

IOPL-Chatbot / app.py

IProject-10

Update app.py

9344624 verified 6 days ago

raw

history blame contribute delete

7.3 kB

	# app.py

	import os
	import uuid
	import nltk
	import trafilatura
	import chromadb
	import tiktoken
	import gradio as gr

	from langchain_core.prompts import ChatPromptTemplate
	from langchain_core.runnables import RunnableLambda, RunnablePassthrough
	from langchain_core.output_parsers import StrOutputParser
	from langchain_together import ChatTogether
	from langchain_community.vectorstores import Chroma
	from sentence_transformers import SentenceTransformer
	from nltk.tokenize import sent_tokenize
	from langchain_huggingface import HuggingFaceEmbeddings


	# Fetch the NLTK sentence-tokenizer data that sent_tokenize relies on.
	for _nltk_resource in ('punkt', 'punkt_tab'):
	    nltk.download(_nltk_resource)

	# Token encoder used to measure chunk sizes.
	tokenizer = tiktoken.get_encoding("cl100k_base")

	# The same checkpoint is loaded twice: as a raw SentenceTransformer (used to
	# embed chunks when documents are loaded) and wrapped for LangChain (handed
	# to the vector store).
	_EMBEDDING_MODEL_NAME = "BAAI/bge-base-en-v1.5"
	embedding_model = SentenceTransformer(_EMBEDDING_MODEL_NAME)
	embedding_function = HuggingFaceEmbeddings(model_name=_EMBEDDING_MODEL_NAME)

	# On-disk Chroma client and the collection that stores the page chunks.
	chroma_client = chromadb.PersistentClient(path="./chroma_store")
	collection = chroma_client.get_or_create_collection(name="imageonline_chunks")

	# Pages to index, grouped by site section. All pages share one host, so only
	# the page file names are listed here and the full URLs are assembled below.
	_SITE_ROOT = "https://www.imageonline.co.in/"
	_SECTION_PAGES = (
	    ("Website Designing", (
	        "website-designing-mumbai.html",
	        "domain-hosting-services-india.html",
	        "best-seo-company-mumbai.html",
	        "wordpress-blog-designing-india.html",
	        "social-media-marketing-company-mumbai.html",
	        "website-template-customization-india.html",
	        "regular-website-maintanence-services.html",
	        "mobile-app-designing-mumbai.html",
	        "web-application-screen-designing.html",
	    )),
	    ("Website Development", (
	        "website-development-mumbai.html",
	        "open-source-customization.html",
	        "ecommerce-development-company-mumbai.html",
	        "website-with-content-management-system.html",
	        "web-application-development-india.html",
	    )),
	    ("Mobile App Development", (
	        "mobile-app-development-company-mumbai.html",
	    )),
	    ("About Us", (
	        "about-us.html",
	        "vision.html",
	        "team.html",
	    )),
	    ("Testimonials", (
	        "testimonial.html",
	    )),
	)

	# Maps section name -> list of absolute page URLs, in declaration order.
	url_dict = {
	    section: [_SITE_ROOT + page for page in pages]
	    for section, pages in _SECTION_PAGES
	}

	# Helper functions
	def extract_clean_text(url):
	    """Download *url* and return its extracted text content.

	    Progress and failures are reported with ``print``. This is a best-effort
	    helper: it never raises, so one bad page does not abort a whole run.

	    Args:
	        url: Address of the page to fetch.

	    Returns:
	        The extracted text (comments and tables excluded), or ``None`` when
	        the download fails, an error occurs, or no text could be extracted.
	    """
	    print(f"🔗 Fetching URL: {url}")
	    try:
	        downloaded = trafilatura.fetch_url(url)
	        if not downloaded:
	            print(f"⚠️ Failed to fetch content from {url}")
	            return None
	        content = trafilatura.extract(downloaded, include_comments=False, include_tables=False)
	    except Exception as e:
	        # Deliberately broad: any fetch/parse failure just skips this page.
	        print(f"❌ Error fetching {url}: {e}")
	        return None

	    # Extraction can come back empty even though the download succeeded;
	    # report that as a failure instead of announcing success.
	    if not content:
	        print(f"⚠️ No extractable text found at {url}")
	        return None

	    print(f"✅ Extracted text from {url}")
	    return content

	def chunk_text(text, max_tokens=400):
	    """Split *text* into chunks of whole sentences of at most *max_tokens* tokens.

	    Sentences are accumulated until adding the next one would push the
	    space-joined chunk over the limit (measured with the module-level
	    ``tokenizer``); the chunk is then closed and the sentence starts a new one.
	    A single sentence that is longer than the limit becomes a chunk of its own.

	    Args:
	        text: Text to split. Empty or ``None`` yields no chunks.
	        max_tokens: Maximum number of tokens per chunk.

	    Returns:
	        A list of non-empty chunk strings, in document order.
	    """
	    sentences = sent_tokenize(text) if text else []
	    chunks = []
	    current_chunk = []

	    for sentence in sentences:
	        candidate = " ".join(current_chunk + [sentence])
	        # Only close the chunk when it already holds something; otherwise an
	        # over-long first sentence would emit an empty chunk.
	        if current_chunk and len(tokenizer.encode(candidate)) > max_tokens:
	            chunks.append(" ".join(current_chunk).strip())
	            current_chunk = [sentence]
	        else:
	            current_chunk.append(sentence)

	    if current_chunk:
	        chunks.append(" ".join(current_chunk).strip())

	    # Drop anything that ended up blank after stripping.
	    chunks = [chunk for chunk in chunks if chunk]

	    print(f"📄 Text split into {len(chunks)} chunks.")
	    return chunks

	# Set FORCE_REFRESH=true in the environment to rebuild the index from scratch.
	force_refresh = os.getenv("FORCE_REFRESH", "false").lower() == "true"

	# Populate ChromaDB on first run, or rebuild it when a refresh is forced.
	if collection.count() == 0 or force_refresh:
	    if force_refresh and collection.count() > 0:
	        # Chunks are stored under fresh random ids, so adding them on top of
	        # existing data would duplicate every document. Start from an empty
	        # collection instead.
	        chroma_client.delete_collection(name="imageonline_chunks")
	        collection = chroma_client.get_or_create_collection(name="imageonline_chunks")

	    print("🔄 Loading documents into ChromaDB...")
	    for section, urls in url_dict.items():
	        for url in urls:
	            text = extract_clean_text(url)
	            if not text:
	                continue

	            # Skip pages that produce no usable chunks; adding empty lists
	            # or blank documents to the collection is pointless at best.
	            chunks = [chunk for chunk in chunk_text(text) if chunk]
	            if not chunks:
	                continue

	            embeddings = embedding_model.encode(chunks, convert_to_numpy=True)
	            metadatas = [{"source": url, "section": section} for _ in chunks]
	            ids = [str(uuid.uuid4()) for _ in chunks]

	            collection.add(
	                documents=chunks,
	                embeddings=embeddings.tolist(),
	                metadatas=metadatas,
	                ids=ids,
	            )
	    print("✅ Document loading complete.")
	else:
	    print("✅ Using existing ChromaDB collection.")

	# LangChain wrapper around the persisted "imageonline_chunks" collection,
	# using embedding_function for its embeddings.
	vectorstore = Chroma(
	    embedding_function=embedding_function,
	    collection_name="imageonline_chunks",
	    client=chroma_client,
	)

	# Retriever configured with k=3 results per query.
	_search_kwargs = {"k": 3}
	retriever = vectorstore.as_retriever(search_kwargs=_search_kwargs)

	# Together.ai chat model; the API key is read from TOGETHER_API_KEY.
	_llm_options = {
	    "model": "meta-llama/Llama-3-8b-chat-hf",
	    "temperature": 0.3,
	    "max_tokens": 1024,
	    "top_p": 0.7,
	    "together_api_key": os.getenv("TOGETHER_API_KEY"),
	}
	llm = ChatTogether(**_llm_options)

	# Prompt that limits the assistant to the retrieved context and gives it a
	# fixed fallback sentence for questions the context does not cover.
	# Placeholders: {context} and {question}.
	_PROMPT_TEMPLATE = """
	You are a helpful assistant for ImageOnline Web Solutions.

	Use ONLY the information provided in the context to answer the user's query.

	Context:
	{context}

	Question:
	{question}

	If the answer is not found in the context, say "I'm sorry, I don't have enough information to answer that."
	"""
	prompt = ChatPromptTemplate.from_template(_PROMPT_TEMPLATE)

	# Context retrieval
	def retrieve_and_format(query):
	    """Fetch the chunks retrieved for *query* and render them as one string.

	    Each chunk is rendered as ``[section] text`` followed on the next line by
	    ``(Source: url)``; chunks are separated by a blank line. Missing
	    ``section``/``source`` metadata renders as an empty string.

	    Args:
	        query: The user's question.

	    Returns:
	        The formatted context string (empty when nothing is retrieved).
	    """
	    # ``invoke`` is the Runnable entry point for retrievers and replaces the
	    # deprecated ``get_relevant_documents``.
	    docs = retriever.invoke(query)
	    return "\n\n".join(
	        f"[{doc.metadata.get('section', '')}] {doc.page_content}\n"
	        f"(Source: {doc.metadata.get('source', '')})"
	        for doc in docs
	    )

	# RAG chain: the incoming question is passed through unchanged as "question"
	# and also fed to retrieve_and_format to build "context"; both fill the
	# prompt, which goes to the LLM, whose reply is parsed to a plain string.
	rag_chain = (
	    {"context": RunnableLambda(retrieve_and_format), "question": RunnablePassthrough()}
	    | prompt
	    | llm
	    | StrOutputParser()
	)

	# Gradio Interface
	def chat_interface(message, history):
	    """Answer *message* with the RAG chain and append the exchange to the chat.

	    Args:
	        message: Text typed by the user. Blank or whitespace-only input is
	            ignored and the history is returned unchanged.
	        history: List of ``(user_text, bot_text)`` tuples from earlier turns,
	            or ``None``/empty for a new conversation. It is not modified.

	    Returns:
	        A ``(history, history)`` pair holding the updated history twice: once
	        for the chatbot display and once for the session state.
	    """
	    # Work on a copy so the caller's list is never mutated in place.
	    history = list(history or [])

	    # Nothing to ask: skip the retrieval/LLM round trip.
	    if not message or not message.strip():
	        return history, history

	    user_turn = "🧑 You: " + message
	    try:
	        bot_turn = "🤖 Bot: " + rag_chain.invoke(message)
	    except Exception as e:
	        # Show the failure in the chat instead of breaking the UI callback.
	        error_msg = f"⚠️ Error: {str(e)}"
	        bot_turn = f"🤖 Bot: {error_msg}"

	    history.append((user_turn, bot_turn))
	    return history, history

	def launch_gradio():
	    """Assemble the chat UI and return the Gradio Blocks app without launching it.

	    The layout is a title and hint, the chat window, a row with the question
	    box and send button, and a row with the clear button. Pressing Enter in
	    the box or clicking send both call ``chat_interface``; the clear button
	    empties the chat window and the stored history.
	    """
	    with gr.Blocks() as demo:
	        for text in (
	            "# 💬 ImageOnline RAG Chatbot",
	            "Ask about Website Designing, App Development, SEO, Hosting, etc.",
	        ):
	            gr.Markdown(text)

	        chat_window = gr.Chatbot()
	        session_history = gr.State([])

	        with gr.Row():
	            question_box = gr.Textbox(placeholder="Ask your question here...", show_label=False, scale=8)
	            send_button = gr.Button("📨 Send", scale=1)

	        # Same handler for both triggers, registered in the original order.
	        for register in (question_box.submit, send_button.click):
	            register(
	                chat_interface,
	                inputs=[question_box, session_history],
	                outputs=[chat_window, session_history],
	            )

	        with gr.Row():
	            clear_button = gr.Button("🧹 Clear Chat")
	            clear_button.click(fn=lambda: ([], []), outputs=[chat_window, session_history])

	    return demo

	if __name__ == "__main__":
	    # Run as a script: build the interface, then start the Gradio server.
	    demo = launch_gradio()
	    demo.launch()