Spaces:
Runtime error
Runtime error
File size: 3,802 Bytes
352d09d dd80727 352d09d d216b93 18f84bc 352d09d d216b93 352d09d d216b93 7ac2ceb 7c6f596 d216b93 18f84bc 3f6356f 18f84bc 3f6356f 18f84bc 3f6356f 26e4727 352d09d d216b93 dd80727 d216b93 224354c d216b93 224354c c4739f9 224354c 352d09d 224354c 352d09d 224354c 352d09d bbf555f 352d09d 224354c 352d09d 224354c 352d09d dd80727 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 |
import os
import shutil
import tempfile
import uuid
import zipfile

import gradio as gr
from dotenv import load_dotenv
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import WhatsAppChatLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone import Pinecone, ServerlessSpec
# Load environment variables (PINECONE_API_KEY) from a local .env file.
load_dotenv()

# Initialize the Pinecone client and index once at import time so every
# Gradio request reuses the same connection.
pinecone_key = os.getenv("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_key)

index_name = "whatsapp-chat-index-1"
# Fix: the original compared against the literal string 'index_name' instead
# of the variable, so create_index() was attempted on every startup and
# raised once the index already existed.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # must match the embedding model's output size (all-MiniLM-L6-v2 -> 384)
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )
index = pc.Index(index_name)

# Embedding model used for both indexing; produces 384-dim vectors.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Maximum allowed batch payload in bytes (4MB) — presumably chosen to stay
# under Pinecone's upsert request size limit; verify against current docs.
MAX_CHUNK_SIZE = 4 * 1024 * 1024
def _chunk_chat_file(chat_file_path):
    """Load one WhatsApp chat .txt export and split it into overlapping chunks."""
    loader = WhatsAppChatLoader(path=chat_file_path)
    messages = list(loader.lazy_load())
    chat_content = "\n".join(doc.page_content for doc in messages)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )
    return text_splitter.create_documents([chat_content])


def _upsert_chunks(chunks):
    """Embed each chunk and upsert the vectors to Pinecone in size-bounded batches."""
    vectors_to_upsert = []
    for chunk in chunks:
        vector = embeddings.embed_documents([chunk.page_content])[0]
        # UUID4 IDs avoid collisions across repeated uploads of the same chat.
        unique_id = str(uuid.uuid4())
        vectors_to_upsert.append((unique_id, vector, {"text": chunk.page_content}))
    for start in range(0, len(vectors_to_upsert), 100):  # batch size of 100
        batch = vectors_to_upsert[start:start + 100]
        # Estimate the batch payload from its text metadata and split any
        # batch that would exceed the 4MB cap into smaller upserts.
        batch_bytes = sum(len(item[2]["text"].encode('utf-8')) for item in batch)
        if batch_bytes > MAX_CHUNK_SIZE:
            for sub_start in range(0, len(batch), 10):
                index.upsert(batch[sub_start:sub_start + 10])
        else:
            index.upsert(batch)


def load_chat_content(file) -> str:
    """Load chat content from the uploaded zip file and store it in Pinecone.

    Args:
        file: Gradio file object for the uploaded archive; ``file.name`` is
            the on-disk path of the temporary upload. ``None`` when the user
            submits without selecting a file.

    Returns:
        A human-readable status message (success or the failure reason).
    """
    if file is None:
        return "No file uploaded. Please upload a valid ZIP file to process."
    # Ensure the uploaded file is a ZIP file. Use file.name consistently:
    # the original mixed file.name here with the raw wrapper object below.
    if not zipfile.is_zipfile(file.name):
        return "Uploaded file is not a valid ZIP file. Please upload a ZIP file."
    # Extract into a fresh per-call directory. The original reused a fixed
    # 'temp_extracted_files' dir that was never cleaned up, so a stale .txt
    # from a previous upload could be processed instead of the new one.
    temp_dir = tempfile.mkdtemp(prefix="whatsapp_chat_")
    try:
        try:
            with zipfile.ZipFile(file.name, 'r') as z:
                z.extractall(temp_dir)
        except zipfile.BadZipFile:
            return "Error reading ZIP file. The file may be corrupted."
        chat_files = [f for f in os.listdir(temp_dir) if f.endswith('.txt')]
        if not chat_files:
            return "No chat files found in the zip archive."
        chunks = _chunk_chat_file(os.path.join(temp_dir, chat_files[0]))
        _upsert_chunks(chunks)
        return "All chat content has been successfully upserted to Pinecone."
    finally:
        # Always remove the extracted files, even on early error returns.
        shutil.rmtree(temp_dir, ignore_errors=True)
# Build the Gradio UI: a single ZIP-upload widget feeding load_chat_content,
# whose returned status message is rendered as plain text.
zip_upload = gr.File(label="Upload WhatsApp Chat Zip File")

interface = gr.Interface(
    fn=load_chat_content,
    inputs=[zip_upload],
    outputs="text",
    title="WhatsApp Chat Upsert to Pinecone",
    description="Upload a zip file containing a WhatsApp chat file and upsert its content to Pinecone.",
)

if __name__ == "__main__":
    # Start the web server only when executed as a script, not on import.
    interface.launch()
|