import os
import shutil

from dotenv import load_dotenv
from pinecone import Pinecone
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    StorageContext,
    load_index_from_storage,
)
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.core.settings import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.groq import Groq
from llama_index.readers.file import CSVReader
from llama_index.vector_stores.pinecone import PineconeVectorStore

# Pull API keys and the index name from .env before any clients are created.
load_dotenv()
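
# Expected .env contents (key names match the os.getenv() calls in this
# module; the values here are placeholders, not real credentials):
#
#   GROQ_API_KEY=...
#   PINECONE_API_KEY=...
#   PINECONE_INDEX=...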


# Local sentence-transformer for embeddings and a Groq-hosted Llama 3.1 model
# for generation; both are registered globally via Settings below.
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
llm = Groq(
    model="llama-3.1-8b-instant",
    api_key=os.getenv("GROQ_API_KEY"),
    max_tokens=500,
    temperature=0.1,
)


# Register both models so every LlamaIndex component uses them by default.
Settings.embed_model = embed_model
Settings.llm = llm


# Pinecone client and target index name; the index must already exist.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index_name = os.getenv("PINECONE_INDEX")

def get_vector_store():
    """Wrap the Pinecone index in a LlamaIndex-compatible vector store."""
    pinecone_index = pc.Index(index_name)
    return PineconeVectorStore(pinecone_index=pinecone_index)

def get_storage_context(for_rebuild=False):
    """Build a StorageContext backed by Pinecone.

    Reuses the persisted ./storage metadata when it exists; for a rebuild
    (or a fresh checkout) it starts from defaults so no stale docstore or
    index metadata is loaded.
    """
    vector_store = get_vector_store()
    persist_dir = "./storage"

    if for_rebuild or not os.path.exists(persist_dir):
        return StorageContext.from_defaults(vector_store=vector_store)
    else:
        return StorageContext.from_defaults(
            vector_store=vector_store,
            persist_dir=persist_dir,
        )





def get_and_chunk_documents():
    """Load the knowledge base and split it into semantic chunks."""
    try:
        # Route .csv files through the dedicated CSV reader; other file
        # types use SimpleDirectoryReader's default extractors.
        file_extractor = {".csv": CSVReader()}
        documents = SimpleDirectoryReader(
            "../knowledge_base",
            file_extractor=file_extractor,
        ).load_data()
        print(f"πŸ“– Loaded {len(documents)} documents")

        # Split on semantic boundaries: a new chunk starts wherever the
        # embedding distance between adjacent sentence groups exceeds the
        # 95th percentile.
        node_parser = SemanticSplitterNodeParser(
            buffer_size=1,
            breakpoint_percentile_threshold=95,
            embed_model=embed_model,
        )
        nodes = node_parser.get_nodes_from_documents(documents)
        print(f"πŸ“„ Created {len(nodes)} document chunks")
        return nodes

    except Exception as e:
        print(f"❌ Error loading documents: {e}")
        return []


def get_index():
    """Load the index from ./storage, falling back to the raw Pinecone data."""
    try:
        storage_context = get_storage_context()
        return load_index_from_storage(storage_context)
    except Exception:
        print("⚠️ Local storage not found, creating index from existing Pinecone data...")
        try:
            # from_vector_store() builds its own storage context around the
            # Pinecone index, so none is passed in here.
            vector_store = get_vector_store()
            return VectorStoreIndex.from_vector_store(vector_store=vector_store)
        except Exception as e:
            print(f"❌ Error creating index from vector store: {e}")
            return None

def check_index_status():
    """Report whether the Pinecone index holds any vectors."""
    try:
        pinecone_index = pc.Index(index_name)
        stats = pinecone_index.describe_index_stats()
        vector_count = stats.get('total_vector_count', 0)

        if vector_count > 0:
            print(f"βœ… Index found with {vector_count} vectors")
            return True
        else:
            print("❌ Index exists but is empty")
            return False
    except Exception as e:
        print(f"❌ Error checking index: {e}")
        return False
    


def clear_pinecone_index():
    """Delete all vectors from the Pinecone index."""
    try:
        pinecone_index = pc.Index(index_name)
        stats = pinecone_index.describe_index_stats()
        vector_count = stats.get('total_vector_count', 0)
        print(f"πŸ—‘οΈ Current vectors in index: {vector_count}")

        if vector_count > 0:
            pinecone_index.delete(delete_all=True)
            print("βœ… All vectors deleted from Pinecone index")
        else:
            print("ℹ️ Index is already empty")

        return True

    except Exception as e:
        print(f"❌ Error clearing index: {e}")
        return False

def rebuild_index():
    """Clear old data and rebuild the index from the knowledge base."""
    try:
        print("πŸ”„ Starting index rebuild process...")

        # Wipe the remote vectors first; abort if that fails so old and new
        # corpora are never mixed.
        if not clear_pinecone_index():
            print("❌ Failed to clear index, aborting rebuild")
            return None

        # Drop the stale local docstore/index metadata as well.
        if os.path.exists("./storage"):
            shutil.rmtree("./storage")
            print("πŸ—‘οΈ Cleared local storage")

        nodes = get_and_chunk_documents()
        if not nodes:
            print("❌ No nodes created, cannot rebuild index")
            return None

        # Embed the chunks into Pinecone, then persist the docstore and
        # index metadata locally so get_index() can reload them quickly.
        storage_context = get_storage_context(for_rebuild=True)
        index = VectorStoreIndex(nodes, storage_context=storage_context)
        index.storage_context.persist(persist_dir="./storage")

        print(f"βœ… Index rebuilt successfully with {len(nodes)} nodes")
        return index

    except Exception as e:
        print(f"❌ Error rebuilding index: {e}")
        return None
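

# --- Usage sketch ------------------------------------------------------------
# Not part of the original module: a minimal, assumed entry point showing how
# the helpers above compose. The sample question is illustrative only.
if __name__ == "__main__":
    # Rebuild from ../knowledge_base if Pinecone is empty, else reuse it.
    index = rebuild_index() if not check_index_status() else get_index()

    if index is not None:
        # Retrieve the top-3 most similar chunks and let the Groq LLM answer.
        query_engine = index.as_query_engine(similarity_top_k=3)
        print(query_engine.query("What topics does the knowledge base cover?"))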