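# Spaces: Sleeping
# (The line above appears to be page-header residue captured with the file; it is not part of the program.)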
import os
import shutil

from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from llama_index.core import (
    Document,
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
)
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.core.settings import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.groq import Groq
from llama_index.readers.file import CSVReader
from llama_index.vector_stores.pinecone import PineconeVectorStore
# Load environment variables (API keys, index name) via python-dotenv before
# anything below reads them.
load_dotenv()

# Embedding model shared by the semantic splitter and the global LlamaIndex
# settings.
embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Groq-hosted LLM; the key is read from GROQ_API_KEY (None when unset).
llm = Groq(
    model="llama-3.1-8b-instant",
    api_key=os.environ.get("GROQ_API_KEY"),
    max_tokens=500,
    temperature=0.1,
)

# Register both models as the LlamaIndex-wide defaults.
Settings.embed_model = embed_model
Settings.llm = llm

# Pinecone client and the name of the index every helper in this module uses.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
index_name = os.environ.get("PINECONE_INDEX")
def get_vector_store():
    """Return a PineconeVectorStore wrapping the configured Pinecone index.

    The index handle is obtained from the module-level client ``pc`` using
    the module-level ``index_name``.
    """
    return PineconeVectorStore(pinecone_index=pc.Index(index_name))
def get_storage_context(for_rebuild=False):
    """Build a StorageContext backed by the Pinecone vector store.

    Args:
        for_rebuild: When true, the persisted ``./storage`` directory is
            ignored (and not even checked), yielding a fresh context.

    Returns:
        A StorageContext that also loads from ``./storage`` when that
        directory exists and ``for_rebuild`` is false; otherwise one created
        from the vector store alone.
    """
    store = get_vector_store()
    storage_dir = "./storage"

    context_kwargs = {"vector_store": store}
    # Only attach the persisted directory when we are allowed to reuse it.
    if not for_rebuild and os.path.exists(storage_dir):
        context_kwargs["persist_dir"] = storage_dir

    return StorageContext.from_defaults(**context_kwargs)
def get_and_chunk_documents():
    """Load everything under ``../knowledge_base`` and split it into nodes.

    ``.csv`` files are read with CSVReader; other files use the reader's
    defaults. Documents are split with SemanticSplitterNodeParser using the
    module-level ``embed_model``.

    Returns:
        The list of nodes produced by the splitter, or an empty list if any
        step raises (the error is printed, not propagated).
    """
    try:
        reader = SimpleDirectoryReader(
            "../knowledge_base",
            file_extractor={".csv": CSVReader()},
        )
        docs = reader.load_data()
        print(f"π Loaded {len(docs)} documents")

        splitter = SemanticSplitterNodeParser(
            buffer_size=1,
            breakpoint_percentile_threshold=95,
            embed_model=embed_model,
        )
        chunks = splitter.get_nodes_from_documents(docs)
        print(f"π Created {len(chunks)} document chunks")
        return chunks
    except Exception as err:
        print(f"β Error loading documents: {err}")
        return []
def get_index():
    """Return a usable index, preferring the locally persisted one.

    First tries to load the index persisted under ``./storage``. If that
    fails for any reason, falls back to building an index directly on top of
    the vectors already stored in Pinecone.

    Returns:
        The loaded or reconstructed index, or None when both attempts fail
        (errors are printed, not propagated).
    """
    try:
        return load_index_from_storage(get_storage_context())
    except Exception as e:
        print("β οΈ Local storage not found, creating index from existing Pinecone data...")
        # Surface the real cause; the failure is not always a missing directory.
        print(f"   Reason: {e}")

    try:
        # Deliberately do NOT call get_storage_context() again here: it would
        # re-read ./storage and raise the very same error when that directory
        # exists but is incomplete or corrupt, making the fallback useless.
        # The index is built from the vector store alone instead.
        return VectorStoreIndex.from_vector_store(vector_store=get_vector_store())
    except Exception as e2:
        print(f"β Error creating index from vector store: {e2}")
        return None
def check_index_status():
    """Report whether the configured Pinecone index contains any vectors.

    Returns:
        True when the reported ``total_vector_count`` is greater than zero;
        False when it is not, or when querying the index raises (the error
        is printed).
    """
    try:
        index_stats = pc.Index(index_name).describe_index_stats()
        total = index_stats.get('total_vector_count', 0)
        if not total > 0:
            print("β Index exists but is empty")
            return False
        print(f"β Index found with {total} vectors")
        return True
    except Exception as err:
        print(f"β Error checking index: {err}")
        return False
def clear_pinecone_index():
    """Remove every vector from the configured Pinecone index.

    Prints the current vector count, then issues a delete-all only when that
    count is greater than zero.

    Returns:
        True when the index was cleared or was already empty; False if any
        Pinecone call raises (the error is printed).
    """
    try:
        target = pc.Index(index_name)
        current_total = target.describe_index_stats().get('total_vector_count', 0)
        print(f"ποΈ Current vectors in index: {current_total}")

        if not current_total > 0:
            print("βΉοΈ Index is already empty")
            return True

        target.delete(delete_all=True)
        print("β All vectors deleted from Pinecone index")
        return True
    except Exception as err:
        print(f"β Error clearing index: {err}")
        return False
def rebuild_index():
    """Rebuild the index from the knowledge base, replacing all old data.

    Steps, in order:
      1. Load and chunk the source documents.
      2. Delete every vector from the Pinecone index.
      3. Remove the local ``./storage`` directory.
      4. Build a new VectorStoreIndex from the nodes and persist it.

    Documents are loaded BEFORE anything is deleted so that a failed or
    empty load aborts the rebuild while the existing index is still intact.

    Returns:
        The newly built index, or None if any step fails (errors are
        printed, not propagated).
    """
    persist_dir = "./storage"
    try:
        print("π Starting index rebuild process...")

        # Non-destructive step first: nothing is wiped unless we have
        # replacement data in hand.
        nodes = get_and_chunk_documents()
        if not nodes:
            print("β No nodes created, cannot rebuild index")
            return None

        if not clear_pinecone_index():
            print("β Failed to clear index, aborting rebuild")
            return None

        if os.path.exists(persist_dir):
            shutil.rmtree(persist_dir)
            print("ποΈ Cleared local storage")

        # for_rebuild=True guarantees a fresh context that ignores any
        # persisted state.
        storage_context = get_storage_context(for_rebuild=True)
        index = VectorStoreIndex(nodes, storage_context=storage_context)
        index.storage_context.persist(persist_dir=persist_dir)
        print(f"β Index rebuilt successfully with {len(nodes)} nodes")
        return index
    except Exception as e:
        print(f"β Error rebuilding index: {e}")
        return None