Spaces:
Sleeping
Sleeping
import os | |
import json | |
import chromadb | |
from llama_index.core import VectorStoreIndex | |
from llama_index.vector_stores.chroma import ChromaVectorStore | |
from llama_index.core import StorageContext | |
from llama_index.core import Document | |
from dotenv import load_dotenv | |
load_dotenv() # Load OPENAI_API_KEY from .env (not included in repo) | |
import gdown | |
class Data: | |
def __init__(self): | |
self.client = None | |
self.collection = None | |
self.index = None | |
self.download_data() | |
self.load_data() | |
def download_data(self): | |
# Download the already indexed data | |
if not os.path.exists("./chroma_db"): | |
try: | |
print("Downloading data...") | |
file_id = "12xLx8J0dhtZuc8G-7xVyldLVnB3eTmxe" | |
url = f"https://drive.google.com/uc?export=download&id={file_id}" | |
output = "chroma_db.zip" | |
gdown.download(url, output, quiet=False) | |
print("Unzipping data...") | |
os.system("unzip chroma_db.zip") | |
except Exception as e: | |
print(f"Error downloading data: {e}") | |
return os.path.exists("./chroma_db") | |
def load_data(self): | |
print("Loading data...") | |
if not os.path.exists("./chroma_db"): | |
# Attempt to generate an index from the raw data | |
with open('data/train-v1.1.json', 'r') as f: | |
raw_data = json.load(f) | |
extracted_question = [] | |
extracted_answer = [] | |
for data in raw_data['data']: | |
for par in data['paragraphs']: | |
for qa in par['qas']: | |
for ans in qa['answers']: | |
extracted_question.append(qa['question']) | |
extracted_answer.append(ans['text']) | |
documents = [] | |
for i in range(len(extracted_question)): | |
documents.append(f"Question: {extracted_question[i]} \nAnswer: {extracted_answer[i]}") | |
self.documents = [Document(text=t) for t in documents] | |
self.extracted_question = extracted_question | |
self.extracted_answer = extracted_answer | |
print("Raw Data loaded") | |
print("Creating Chroma DB...") | |
# initialize client, setting path to save data | |
self.client = chromadb.PersistentClient(path="./chroma_db") | |
# create collection | |
self.collection = self.client.get_or_create_collection("simple_index") | |
# assign chroma as the vector_store to the context | |
vector_store = ChromaVectorStore(chroma_collection=self.collection) | |
storage_context = StorageContext.from_defaults(vector_store=vector_store) | |
# create your index | |
self.index = VectorStoreIndex.from_documents( | |
self.documents, storage_context=storage_context | |
) | |
print("Chroma DB created") | |
else: | |
print("Chroma DB already exists") | |
print("Loading index...") | |
# initialize client | |
self.client = chromadb.PersistentClient(path="./chroma_db") | |
# get collection | |
self.collection = self.client.get_or_create_collection("simple_index") | |
# assign chroma as the vector_store to the context | |
vector_store = ChromaVectorStore(chroma_collection=self.collection) | |
storage_context = StorageContext.from_defaults(vector_store=vector_store) | |
# load your index from stored vectors | |
self.index = VectorStoreIndex.from_vector_store( | |
vector_store, storage_context=storage_context | |
) | |
print("Index loaded") |