# Charbot / app.py
# Rohitface's picture
# Update app.py
# 817e0ff verified
import gradio as gr
import chromadb
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import re
# --- 1. Load Models ---
# Embedding model: its encode() output is what gets stored in, and queried
# against, the vector collection further down in this file.
RETRIEVER_MODEL_NAME = 'all-MiniLM-L6-v2'
# Text-to-text model that writes the final answer from the retrieved excerpts.
# The balanced 'base' size is used for better performance and reliability.
GENERATOR_MODEL_NAME = "google/flan-t5-base"

print("Loading sentence-transformer model for retrieval...")
retriever_model = SentenceTransformer(RETRIEVER_MODEL_NAME)
print("Retriever model loaded.")

print(f"Loading generative model for answering ({GENERATOR_MODEL_NAME})...")
generator_pipe = pipeline(
    task="text2text-generation",
    model=GENERATOR_MODEL_NAME,
    device=-1,  # -1 is the transformers pipeline convention for "run on CPU"
)
print("Generative model loaded.")
# --- 2. Setup ChromaDB ---
COLLECTION_NAME = "whatsapp_chat_v2"
BATCH_SIZE = 5000

# Leading date/time stamp of a WhatsApp export line, e.g.
#   "[12/05/2023, 10:15:30] "   or   "12/05/2023, 10:15 PM - "
# It has to be removed before looking for the "sender:" separator, because the
# time itself contains colons and would otherwise be taken for that separator.
timestamp_prefix = re.compile(
    r'^\u200e?\[?'
    r'\d{1,4}[/.\-]\d{1,2}[/.\-]\d{1,4},?\s+'   # date
    r'\d{1,2}[:.]\d{2}(?:[:.]\d{2})?'           # time, seconds optional
    r'(?:\s*[AaPp]\.?\s?[Mm]\.?)?'              # optional am/pm marker
    r'\]?\s*(?:-\s*)?'                          # closing "]" or " - "
)
# Captures everything after the first colon ("sender: text" -> "text").
message_pattern = re.compile(r'^\[?.*?\]?\s*.*?:\s*(.*)')

client = chromadb.Client()
# get_or_create_collection avoids relying on which exception type
# create_collection raises for a duplicate name, and keeps errors raised while
# loading/embedding from being mistaken for "collection already exists".
collection = client.get_or_create_collection(COLLECTION_NAME)

if collection.count() > 0:
    # Already populated: reuse it as-is.
    print("ChromaDB collection loaded.")
else:
    print("ChromaDB collection created.")

    # --- Data Loading and Cleaning ---
    try:
        print("Loading data from my_data.txt...")
        # utf-8-sig reads plain UTF-8 too, and drops a BOM if the file has one
        # so it cannot stick to the first line.
        with open('my_data.txt', 'r', encoding='utf-8-sig') as f:
            lines = [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
        print("Error: my_data.txt not found.")
        # Sentinel document; chatbot_response looks for the "Error:" prefix.
        documents = ["Error: my_data.txt not found. Please make sure the file is uploaded."]
    else:
        cleaned_documents = []
        for line in lines:
            # Lines without a timestamp pass through sub() unchanged.
            without_stamp = timestamp_prefix.sub('', line, count=1)
            match = message_pattern.match(without_stamp)
            if match and match.group(1):
                cleaned_documents.append(match.group(1).strip())

        if not cleaned_documents:
            print("ERROR: Could not extract any valid messages from my_data.txt.")
            cleaned_documents = ["Error: The data file 'my_data.txt' could not be processed."]
        else:
            print(f"Successfully loaded and cleaned {len(cleaned_documents)} messages.")
        documents = cleaned_documents

    # --- Batch Processing ---
    print("Starting to process and add documents in batches...")
    for start in range(0, len(documents), BATCH_SIZE):
        stop = min(start + BATCH_SIZE, len(documents))
        batch_docs = documents[start:stop]
        print(f"Processing batch of {len(batch_docs)} documents...")
        batch_embeddings = retriever_model.encode(batch_docs)
        collection.add(
            embeddings=batch_embeddings.tolist(),
            documents=batch_docs,
            # Ids follow the global document index so they stay unique across batches.
            ids=[f"id_{j}" for j in range(start, stop)],
        )
    print("All documents have been successfully added to ChromaDB.")
# --- 3. Define Chatbot Logic ---
def chatbot_response(message, history):
    """Answer a user question from the indexed chat messages.

    The question is embedded with ``retriever_model``, the closest stored
    messages are fetched from ``collection``, and ``generator_pipe`` is asked
    to produce an answer from a prompt containing those messages.

    Args:
        message: The user's question text.
        history: Previous chat turns passed in by ``gr.ChatInterface``; unused.

    Returns:
        The generated answer text, or a fixed apology string when the
        collection is empty, nothing is retrieved, or the top hit is one of
        the "Error: ..." sentinel documents stored when data loading failed.
    """
    no_info_reply = "I'm sorry, I couldn't find any relevant information in the chat history. 🤔"

    # Never request more neighbours than the collection holds: depending on
    # the Chroma release that either raises or only logs a warning.
    available = collection.count()
    if available == 0:
        return no_info_reply

    query_embedding = retriever_model.encode([message]).tolist()
    results = collection.query(
        query_embeddings=query_embedding,
        n_results=min(5, available)  # Using 5 results is a good balance for the base model
    )
    # One query embedding was sent, so the matches are in the first result list.
    retrieved_documents = results['documents'][0]

    # Prefix check (rather than substring) so a real chat message that merely
    # mentions "Error:" somewhere does not suppress the answer.
    if not retrieved_documents or retrieved_documents[0].startswith("Error:"):
        return no_info_reply

    context = "\n- ".join(retrieved_documents)
    prompt = f"""
Based on the following excerpts from a WhatsApp chat, provide a helpful and accurate answer to the user's question.
Chat Context:
- {context}
Question:
{message}
Answer:
"""
    generated_text = generator_pipe(prompt, max_length=150, num_beams=5, early_stopping=True)
    return generated_text[0]['generated_text']
# --- 4. Create the Gradio Interface ---
# Sample questions offered in the UI.
example_questions = [
    "What was the final decision on the project deadline?",
    "Summarize the conversation about the event.",
]

# Settings for the chat UI, gathered in one place and expanded into the call.
interface_options = {
    "fn": chatbot_response,
    "title": "WhatsApp Chat Bot ⚡️",
    "description": "Ask me anything about this WhatsApp chat history. (Powered by flan-t5-base)",
    "theme": "soft",
    "examples": example_questions,
    "cache_examples": False,
}
iface = gr.ChatInterface(**interface_options)

# Launch the app
iface.launch()