AIAgentFinal_Assignment / retriever.py
Celine1026's picture
test
75703a7 verified
# Build a retriever on Supabase:
# - create the project, table, indexes, and functions
# - create a client with the URL and key
# - insert the data with its embeddings
#
# Load metadata.jsonl
import json
import os
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import SupabaseVectorStore
from supabase.client import Client, create_client
from langchain.schema import Document
# Load the metadata.jsonl file: one JSON object per line.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's default locale.
with open('metadata.jsonl', 'r', encoding='utf-8') as jsonl_file:
    json_list = list(jsonl_file)

# Parse every non-blank line; blank lines (e.g. a trailing empty line at the
# end of the file) are skipped instead of crashing json.loads.
json_QA = [json.loads(json_str) for json_str in json_list if json_str.strip()]
### Build a vector database based on metadata.jsonl
# https://python.langchain.com/docs/integrations/vectorstores/supabase/

# Load variables from a .env file (if present) into the process environment
# before the Supabase settings are read below.
load_dotenv()

# Embedding model used for every document (dim=768).
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

# Supabase connection settings come from the environment; either value is
# None when its variable is not set.
supabase_url = os.getenv("SUPABASE_URL")
supabase_key = os.getenv("SUPABASE_SERVICE_KEY")

# Client used later in the script to insert the documents.
supabase: Client = create_client(supabase_url, supabase_key)
# Wrap each question/answer pair from metadata.jsonl into one row for the
# vector table: the text content, its metadata, and its embedding.
contents = [
    f"Question : {sample['Question']}\n\nFinal answer : {sample['Final answer']}"
    for sample in json_QA
]

# Embed all texts with a single batched call instead of one model invocation
# per sample; skip the call entirely when there is nothing to embed.
vectors = embeddings.embed_documents(contents) if contents else []

docs = [
    {
        "content": content,
        # Per the original author's note: the metadata must contain a
        # "source" key, otherwise an error is raised.
        "metadata": {"source": sample['task_id']},
        "embedding": vector,
    }
    for sample, content, vector in zip(json_QA, contents, vectors)
]
# Target table for the upload. TABLE_NAME is honoured when set; otherwise the
# script falls back to the "documents" table it previously hard-coded.
table_name = os.environ.get('TABLE_NAME') or "documents"

# Upload the documents to the vector database in a single insert request.
try:
    response = supabase.table(table_name).insert(docs).execute()
except Exception as exception:
    # Top-level boundary of the script: report the failure rather than
    # aborting with a traceback.
    print("Error inserting data into Supabase:", exception)