Spaces:

Rohitface
/

Charbot

Sleeping

App Files Files Community

Charbot / app.py

Rohitface

Update app.py

817e0ff verified 17 days ago

raw

history blame contribute delete

4.05 kB

	import gradio as gr
	import chromadb
	from sentence_transformers import SentenceTransformer
	from transformers import pipeline
	import re

	# --- 1. Load Models ---
	print("Loading sentence-transformer model for retrieval...")
	retriever_model = SentenceTransformer('all-MiniLM-L6-v2')
	print("Retriever model loaded.")

	# --- THIS IS THE UPDATED LINE ---
	print("Loading generative model for answering (google/flan-t5-base)...")
	# Using the balanced 'base' model for better performance and reliability.
	generator_pipe = pipeline("text2text-generation", model="google/flan-t5-base", device=-1)
	print("Generative model loaded.")
	# --- END OF UPDATE ---


	# --- 2. Setup ChromaDB ---
	client = chromadb.Client()

	try:
	collection = client.create_collection("whatsapp_chat_v2")
	print("ChromaDB collection created.")

	# --- Data Loading and Cleaning ---
	try:
	print("Loading data from my_data.txt...")
	with open('my_data.txt', 'r', encoding='utf-8') as f:
	lines = [line.strip() for line in f if line.strip()]

	message_pattern = re.compile(r'^\[?.?\]?\s.?:\s(.*)')

	cleaned_documents = []
	for line in lines:
	match = message_pattern.match(line)
	if match and match.group(1):
	cleaned_documents.append(match.group(1).strip())

	if not cleaned_documents:
	print("ERROR: Could not extract any valid messages from my_data.txt.")
	cleaned_documents = ["Error: The data file 'my_data.txt' could not be processed."]
	else:
	print(f"Successfully loaded and cleaned {len(cleaned_documents)} messages.")

	documents = cleaned_documents

	except FileNotFoundError:
	print("Error: my_data.txt not found.")
	documents = ["Error: my_data.txt not found. Please make sure the file is uploaded."]

	# --- Batch Processing ---
	batch_size = 5000
	print("Starting to process and add documents in batches...")
	for i in range(0, len(documents), batch_size):
	end_i = min(i + batch_size, len(documents))
	batch_docs = documents[i:end_i]
	print(f"Processing batch of {len(batch_docs)} documents...")
	batch_embeddings = retriever_model.encode(batch_docs)
	batch_ids = [f"id_{j}" for j in range(i, end_i)]
	collection.add(
	embeddings=batch_embeddings.tolist(),
	documents=batch_docs,
	ids=batch_ids
	)
	print("All documents have been successfully added to ChromaDB.")

	except ValueError:
	collection = client.get_collection("whatsapp_chat_v2")
	print("ChromaDB collection loaded.")


	# --- 3. Define Chatbot Logic ---
	def chatbot_response(message, history):
	query_embedding = retriever_model.encode([message]).tolist()
	results = collection.query(
	query_embeddings=query_embedding,
	n_results=5 # Using 5 results is a good balance for the base model
	)
	retrieved_documents = results['documents'][0]

	if not retrieved_documents or "Error:" in retrieved_documents[0]:
	return "I'm sorry, I couldn't find any relevant information in the chat history. 🤔"

	context = "\n- ".join(retrieved_documents)
	prompt = f"""
	Based on the following excerpts from a WhatsApp chat, provide a helpful and accurate answer to the user's question.

	Chat Context:
	- {context}

	Question:
	{message}

	Answer:
	"""

	generated_text = generator_pipe(prompt, max_length=150, num_beams=5, early_stopping=True)
	response = generated_text[0]['generated_text']

	return response

	# --- 4. Create the Gradio Interface ---
	iface = gr.ChatInterface(
	fn=chatbot_response,
	title="WhatsApp Chat Bot ⚡️",
	description="Ask me anything about this WhatsApp chat history. (Powered by flan-t5-base)",
	theme="soft",
	examples=["What was the final decision on the project deadline?", "Summarize the conversation about the event."],
	cache_examples=False
	)

	# Launch the app
	iface.launch()