import os
import json

import chromadb
import gdown
import pandas as pd
from dotenv import load_dotenv
from llama_index.core import Document, StorageContext, VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore

load_dotenv()  # Load OPENAI_API_KEY from .env (not included in repo)

# Module-level cache so the data and index are only built once per process.
data = None


def get_data(download=False):
    """Return the shared Data instance, creating it (and optionally downloading the index) on first use."""
    global data
    if data is None:
        data = Data(download)
    return data


class Data:
    """Loads the question-answering data (train-v1.1.json) and a Chroma-backed LlamaIndex vector index."""

    def __init__(self, download=False):
        print("Initializing Data...")
        print(f"Download: {download}")
        self.client = None
        self.collection = None
        self.index = None
        if download:
            self.download_data()
        self.load_data()

    def download_data(self):
        # Download the already-indexed data (a zipped Chroma DB) from Google Drive.
        if not os.path.exists("./chroma_db"):
            try:
                print("Downloading data...")
                file_id = "1JvYQ9E5zDBKRCUKkxejDvp7UGwzxDAUW"
                url = f"https://drive.google.com/uc?export=download&id={file_id}"
                output = "chroma_db.zip"
                gdown.download(url, output, quiet=False)
                print("Unzipping data...")
                os.system("unzip chroma_db.zip")
            except Exception as e:
                print(f"Error downloading data: {e}")
        # Report whether the Chroma DB is now available on disk.
        return os.path.exists("./chroma_db")

    def load_data(self):
        print("Loading data...")
        with open('data/train-v1.1.json', 'r') as f:
            raw_data = json.load(f)

        raw_documents = []
        documents = []
        for article in raw_data['data']:
            title = article['title']
            for par in article['paragraphs']:
                context = par['context']
                for qa in par['qas']:
                    question = qa['question']
                    # Collect the distinct answer strings for this question.
                    answers = []
                    for ans in qa['answers']:
                        if ans['text'] not in answers:
                            answers.append(ans['text'])
                    for answer in answers:
                        raw_documents.append([title, context, question, answer])
                    numbered_answers = "\n".join(
                        f"{i + 1}. {ans}" for i, ans in enumerate(answers)
                    )
                    doc = f"""
                    Title: {title}
                    Context: {context}
                    Question: {question}
                    Acceptable Answers:
                    {numbered_answers}
                    """
                    # Remove padding on each line
                    doc = "\n".join([line.strip() for line in doc.split("\n")])
                    documents.append(doc)
        self.df = pd.DataFrame(raw_documents, columns=["Title", "Context", "Question", "Answer"])
        self.documents = [Document(text=t) for t in documents]
        print("Raw Data loaded")

        if not os.path.exists("./chroma_db"):
            # Attempt to generate an index from the raw data
            print("Creating Chroma DB...")
            # initialize client, setting path to save data
            self.client = chromadb.PersistentClient(path="./chroma_db")
            # create collection
            self.collection = self.client.get_or_create_collection("simple_index")
            # assign chroma as the vector_store to the context
            vector_store = ChromaVectorStore(chroma_collection=self.collection)
            storage_context = StorageContext.from_defaults(vector_store=vector_store)
            # create your index
            self.index = VectorStoreIndex.from_documents(
                self.documents, storage_context=storage_context
            )
            print("Chroma DB created")
        else:
            print("Chroma DB already exists")
            print("Loading index...")
            # initialize client
            self.client = chromadb.PersistentClient(path="./chroma_db")
            # get collection
            self.collection = self.client.get_or_create_collection("simple_index")
            # assign chroma as the vector_store to the context
            vector_store = ChromaVectorStore(chroma_collection=self.collection)
            storage_context = StorageContext.from_defaults(vector_store=vector_store)
            # load your index from stored vectors
            self.index = VectorStoreIndex.from_vector_store(
                vector_store, storage_context=storage_context
            )
            print("Index loaded")