File size: 4,047 Bytes
030fde7
 
 
24ee0b7
c90e25b
24ee0b7
c90e25b
24ee0b7
 
 
 
817e0ff
c90e25b
817e0ff
c90e25b
24ee0b7
817e0ff
030fde7
 
b2a7d5a
030fde7
 
 
0a92d7a
030fde7
 
c90e25b
b2a7d5a
140fca2
 
24ee0b7
140fca2
0a92d7a
 
24ee0b7
 
0a92d7a
 
 
24ee0b7
 
c90e25b
0a92d7a
140fca2
24ee0b7
b2a7d5a
24ee0b7
 
140fca2
24ee0b7
140fca2
030fde7
c90e25b
2927b93
 
 
 
 
24ee0b7
 
2927b93
 
 
 
 
 
 
030fde7
 
0a92d7a
030fde7
 
 
817e0ff
030fde7
24ee0b7
030fde7
 
817e0ff
030fde7
 
 
b2a7d5a
24ee0b7
 
030fde7
24ee0b7
817e0ff
24ee0b7
c90e25b
24ee0b7
 
 
 
 
817e0ff
24ee0b7
 
817e0ff
24ee0b7
030fde7
 
 
c90e25b
030fde7
 
817e0ff
 
ff3e9d9
817e0ff
ff3e9d9
030fde7
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import gradio as gr
import chromadb
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import re

# --- 1. Load Models ---
# Model identifiers, named once so the retriever/generator choice is obvious.
RETRIEVER_MODEL_NAME = 'all-MiniLM-L6-v2'
GENERATOR_MODEL_NAME = "google/flan-t5-base"

print("Loading sentence-transformer model for retrieval...")
retriever_model = SentenceTransformer(RETRIEVER_MODEL_NAME)
print("Retriever model loaded.")

# flan-t5-base: a balance between answer quality and CPU-friendly size.
# device=-1 forces CPU inference.
print(f"Loading generative model for answering ({GENERATOR_MODEL_NAME})...")
generator_pipe = pipeline("text2text-generation", model=GENERATOR_MODEL_NAME, device=-1)
print("Generative model loaded.")


# --- 2. Setup ChromaDB ---
client = chromadb.Client()


def _load_documents(path='my_data.txt'):
    """Load the chat export at *path* and return one cleaned message per line.

    Each exported line looks like "[timestamp] sender: message"; the regex
    strips the bracketed timestamp and sender prefix, keeping only the
    message text.  Always returns a non-empty list: on failure a one-element
    error placeholder is returned so the collection is still populated and
    the chatbot can surface the problem instead of crashing on an empty index.
    """
    try:
        print(f"Loading data from {path}...")
        with open(path, 'r', encoding='utf-8') as f:
            lines = [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
        print(f"Error: {path} not found.")
        return ["Error: my_data.txt not found. Please make sure the file is uploaded."]

    # "[optional timestamp] sender: text" -> capture just the message text.
    message_pattern = re.compile(r'^\[?.*?\]?\s*.*?:\s*(.*)')
    cleaned_documents = []
    for line in lines:
        match = message_pattern.match(line)
        if match and match.group(1):
            cleaned_documents.append(match.group(1).strip())

    if not cleaned_documents:
        print(f"ERROR: Could not extract any valid messages from {path}.")
        return ["Error: The data file 'my_data.txt' could not be processed."]

    print(f"Successfully loaded and cleaned {len(cleaned_documents)} messages.")
    return cleaned_documents


def _index_documents(target_collection, documents, batch_size=5000):
    """Embed *documents* with the retriever model and add them to
    *target_collection* in batches of *batch_size* (ChromaDB limits the
    number of records per add() call)."""
    print("Starting to process and add documents in batches...")
    for start in range(0, len(documents), batch_size):
        batch_docs = documents[start:start + batch_size]
        print(f"Processing batch of {len(batch_docs)} documents...")
        batch_embeddings = retriever_model.encode(batch_docs)
        batch_ids = [f"id_{j}" for j in range(start, start + len(batch_docs))]
        target_collection.add(
            embeddings=batch_embeddings.tolist(),
            documents=batch_docs,
            ids=batch_ids,
        )
    print("All documents have been successfully added to ChromaDB.")


try:
    # Keep the try body minimal: previously the whole ingestion pipeline
    # lived inside this try, so a stray ValueError raised while cleaning,
    # encoding, or adding documents silently fell through to
    # get_collection and masked a half-indexed dataset.
    # NOTE(review): create_collection raises ValueError on a duplicate name
    # in older chromadb releases; newer releases use chromadb-specific
    # exceptions — confirm against the pinned chromadb version.
    collection = client.create_collection("whatsapp_chat_v2")
except ValueError:
    # Collection already exists (e.g. app re-run in the same process):
    # reuse it and skip re-ingestion.
    collection = client.get_collection("whatsapp_chat_v2")
    print("ChromaDB collection loaded.")
else:
    print("ChromaDB collection created.")
    _index_documents(collection, _load_documents())


# --- 3. Define Chatbot Logic ---
def chatbot_response(message, history):
    """Answer *message* using snippets retrieved from the indexed chat.

    *history* is supplied by gr.ChatInterface but is not used here: every
    turn is answered independently from the indexed corpus.
    """
    # Embed the question and retrieve the closest stored messages.
    # Five results is a good balance of context size for the base model.
    query_vector = retriever_model.encode([message]).tolist()
    hits = collection.query(query_embeddings=query_vector, n_results=5)
    snippets = hits['documents'][0]

    # Nothing retrieved, or only the ingestion-error placeholder document.
    if not snippets or "Error:" in snippets[0]:
        return "I'm sorry, I couldn't find any relevant information in the chat history. 🤔"

    context = "\n- ".join(snippets)
    prompt = f"""
    Based on the following excerpts from a WhatsApp chat, provide a helpful and accurate answer to the user's question.

    Chat Context:
    - {context}

    Question:
    {message}

    Answer:
    """

    # Beam search with early stopping keeps answers short and coherent.
    outputs = generator_pipe(prompt, max_length=150, num_beams=5, early_stopping=True)
    return outputs[0]['generated_text']

# --- 4. Create the Gradio Interface ---
# Clickable sample questions shown under the chat box.
EXAMPLE_QUESTIONS = [
    "What was the final decision on the project deadline?",
    "Summarize the conversation about the event.",
]

iface = gr.ChatInterface(
    fn=chatbot_response,
    title="WhatsApp Chat Bot ⚡️",
    description="Ask me anything about this WhatsApp chat history. (Powered by flan-t5-base)",
    theme="soft",
    examples=EXAMPLE_QUESTIONS,
    cache_examples=False,  # examples would otherwise be pre-run at startup
)

# Launch the app (blocks until the server is stopped).
iface.launch()