In [None]:
import json

def load_data(file_path):
    """Load every record from a JSON Lines file.

    Args:
        file_path: Path to a UTF-8 encoded file holding one JSON document
            per line.

    Returns:
        list: One parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # Whitespace-only lines (e.g. a trailing empty line at the end of the
        # file) are not JSON documents; skip them instead of letting
        # json.loads raise on them.
        return [json.loads(line) for line in f if line.strip()]

def search_and_print_task(task_id, data):
    """Find the first record whose 'task_id' equals ``task_id`` and print it.

    The record is printed as a small tree: the top-level fields first, then
    whichever 'Annotator Metadata' entries are present. Multi-line 'Steps'
    and 'Tools' values are printed one entry per line.

    Args:
        task_id: Identifier to look for.
        data: Iterable of dict records (as returned by ``load_data``).

    Returns:
        None. Everything is written to stdout; a "not found" message is
        printed when no record matches.
    """
    # .get() keeps a record without a 'task_id' key from aborting the search
    # with KeyError; such a record simply does not match.
    found_task = next(
        (sample for sample in data if sample.get('task_id') == task_id),
        None,
    )

    if not found_task:
        print(f"Task with ID '{task_id}' not found.")
        return

    print("=" * 50)
    print(f"Task ID: {found_task.get('task_id', 'N/A')}")
    print(f"Question: {found_task.get('Question', 'N/A')}")
    print(f"Level: {found_task.get('Level', 'N/A')}")
    print(f"Final Answer: {found_task.get('Final answer', 'N/A')}")

    metadata = found_task.get('Annotator Metadata', {})
    if metadata:
        print("Annotator Metadata: ")

        steps = metadata.get('Steps')
        if steps:
            print(" ├── Steps: ")
            # Split on a real newline. The previous '\\n' was a literal
            # backslash followed by 'n', which never occurs in the decoded
            # JSON text, so all steps came out as one entry.
            for step in steps.split('\n'):
                print(f" │ ├── {step}")

        num_steps = metadata.get('Number of steps')
        if num_steps is not None:
            print(f" ├── Number of steps: {num_steps}")

        duration = metadata.get('How long did this take?')
        if duration:
            print(f" ├── How long did this take?: {duration}")

        tools = metadata.get('Tools')
        if tools:
            print(" ├── Tools:")
            # Same newline handling as for the steps above.
            for tool in tools.split('\n'):
                print(f" │ ├── {tool}")

        num_tools = metadata.get('Number of tools')
        if num_tools is not None:
            print(f" └── Number of tools: {num_tools}")

    print("=" * 50)



In [None]:
# 1. Load the data.
# Parses metadata.jsonl (path relative to the notebook's working directory)
# with load_data and binds the resulting list of records to json_QA, which the
# lookup, sampling, upload and tool-tally cells below all read.
json_QA = load_data('metadata.jsonl')


In [None]:

# 2. Choose a task_id to search for.
# The ID is hard-coded; per the original note it is the first record in the
# file (TODO confirm against the current metadata.jsonl if the data changes).
example_task_id = "8e867cd7-cff9-4e6c-867a-ff5ddc2550be"

# 3. Look the record up in json_QA and print it. If the ID is absent,
# search_and_print_task prints a "not found" message instead of raising.
search_and_print_task(example_task_id, json_QA)

In [None]:
import json 
# Read the raw JSONL lines, then decode each one into a record.
# The encoding is pinned to UTF-8 (as load_data does) so the result does not
# depend on the platform's default locale encoding.
with open('metadata.jsonl', 'r', encoding='utf-8') as f:
    json_list = list(f)

json_QA = []
for json_str in json_list:
    # A whitespace-only line (e.g. a trailing empty line) is not a JSON
    # document; skip it rather than letting json.loads raise.
    if not json_str.strip():
        continue
    json_data = json.loads(json_str)
    json_QA.append(json_data)

In [None]:
# TODO: import a specific question (placeholder cell, no code yet)

In [None]:
import random
# Spot-check the dataset: draw two distinct records at random from json_QA and
# print each one as a small tree. Fields are indexed directly, so a record
# that lacks one of them raises KeyError.
random_samples = random.sample(json_QA, 2)
divider = "=" * 50
for sample in random_samples:
    print(divider)
    print(f"Task ID: {sample['task_id']}")
    print(f"Question: {sample['Question']}")
    print(f"Level: {sample['Level']}")
    print(f"Final Answer: {sample['Final answer']}")
    print("Annotator Metadata: ")
    print(" ├── Steps: ")
    annotator = sample['Annotator Metadata']
    # 'Steps' and 'Tools' are newline-separated strings: one entry per line.
    for step in annotator['Steps'].split('\n'):
        print(f" │ ├── {step}")
    print(f" ├── Number of steps: {annotator['Number of steps']}")
    print(f" ├── How long did this take?: {annotator['How long did this take?']}")
    print(" ├── Tools:")
    for tool in annotator['Tools'].split('\n'):
        print(f" │ ├── {tool}")
    print(f" └── Number of tools: {annotator['Number of tools']}")
# Closing rule, printed once after the last record.
print(divider)

In [None]:
import os
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import SupabaseVectorStore
from supabase.client import Client, create_client


# Load environment settings via python-dotenv before reading them below
# (presumably from a local .env file holding the SUPABASE_* values; confirm).
load_dotenv()

# Embedding model shared by the upload and retrieval cells.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",  # dim=768
)

# Both values are None when the variables are unset; create_client receives
# them as-is.
supabase_url = os.getenv("SUPABASE_URL")
supabase_key = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
supabase: Client = create_client(supabase_url, supabase_key)

In [None]:
from langchain.schema import Document
# Build one row per sample: a running integer id, the "question + final answer"
# text, the originating task_id as metadata, and the embedding of that text.
docs = []
cnt = 0
for sample in json_QA:
    content = f"Question : {sample['Question']}\n\nFinal answer : {sample['Final answer']}"
    metadata = {"source": sample['task_id']}
    embedding = embeddings.embed_query(content)
    doc = {
        "id": cnt,
        "content": content,
        "metadata": metadata,
        "embedding": embedding,
    }
    docs.append(doc)
    cnt = cnt + 1

# Upload all rows to the "documents2" table in a single insert.
# A failure is reported on stdout and does not stop the notebook.
try:
    insert_request = supabase.table("documents2").insert(docs)
    response = insert_request.execute()
except Exception as exc:
    print("Error inserting data into Supabase:", exc)

# # Save the documents (a list of dict) into a csv file, and manually upload it to Supabase
# import pandas as pd
# df = pd.DataFrame(docs)
# df.to_csv('supabase_docs.csv',index=False)

In [None]:
# Create a LangChain vector-store handle over the "documents2" table.
# No rows are inserted in this cell; the rows themselves are uploaded by the
# supabase.table("documents2").insert(...) call in the upload cell.
# query_name presumably refers to a database-side similarity-search function
# named "match_documents_2" (TODO confirm it exists in the Supabase project).
vector_store = SupabaseVectorStore(
 client=supabase,
 embedding= embeddings,
 table_name="documents2",
 query_name="match_documents_2",
)
# Retriever built with as_retriever()'s default arguments.
retriever = vector_store.as_retriever()

In [None]:
# Example question used to exercise the retriever.
query = "On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?"
# Alternative kept for reference: query the vector store directly with k=2.
# matched_docs = vector_store.similarity_search(query, k=2)
# NOTE: this rebinds the global `docs`, which the upload cell used for the
# list of rows it inserted; after this line `docs` holds the retriever's
# result instead.
docs = retriever.invoke(query)

In [None]:
# Display the first element of the retriever result bound to `docs` by the
# query cell (assumes at least one match was returned; indexing fails otherwise).
docs[0]

In [None]:
# list of the tools used in all the samples
from collections import Counter, OrderedDict

# Tally how often each tool name appears across all samples' 'Tools' metadata.
tools = []
for sample in json_QA:
    for tool in sample['Annotator Metadata']['Tools'].split('\n'):
        tool = tool.strip()
        # Drop a leading list number such as "1." or "12." by its actual
        # length. A fixed two-character slice mangled anything else:
        # "None" became "ne" and "10. x" became ". x".
        number, dot, remainder = tool.partition(".")
        if dot and number.isdigit() and not remainder[:1].isdigit():
            tool = remainder
        tool = tool.strip().lower()
        # Remove the "(optional)" marker by name rather than by chopping a
        # fixed 11 characters off anything that starts with "(".
        if tool.startswith("(optional)"):
            tool = tool[len("(optional)"):].strip()
        # Empty lines are not tools; do not count them.
        if tool:
            tools.append(tool)
# Counter keeps first-seen order; the OrderedDict wrapper is retained so
# tools_counter keeps the same type as before.
tools_counter = OrderedDict(Counter(tools))
print("List of tools used in all samples:")
print("Total number of tools used:", len(tools_counter))
for tool, count in tools_counter.items():
    print(f" ├── {tool}: {count}")