{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "def load_data(file_path):\n",
    "    \"\"\"Loads data from a JSONL file.\"\"\"\n",
    "    data = []\n",
    "    with open(file_path, 'r', encoding='utf-8') as f:\n",
    "        for line in f:\n",
    "            data.append(json.loads(line))\n",
    "    return data\n",
    "\n",
    "def search_and_print_task(task_id, data):\n",
    "    \"\"\"\n",
    "    Searches for a task by its ID and prints it in a formatted way.\n",
    "    \"\"\"\n",
    "    found_task = None\n",
    "    for sample in data:\n",
    "        if sample['task_id'] == task_id:\n",
    "            found_task = sample\n",
    "            break\n",
    "\n",
    "    if not found_task:\n",
    "        print(f\"Task with ID '{task_id}' not found.\")\n",
    "        return\n",
    "\n",
    "    print(\"=\" * 50)\n",
    "    print(f\"Task ID: {found_task.get('task_id', 'N/A')}\")\n",
    "    print(f\"Question: {found_task.get('Question', 'N/A')}\")\n",
    "    print(f\"Level: {found_task.get('Level', 'N/A')}\")\n",
    "    print(f\"Final Answer: {found_task.get('Final answer', 'N/A')}\")\n",
    "\n",
    "    metadata = found_task.get('Annotator Metadata', {})\n",
    "    if metadata:\n",
    "        print(f\"Annotator Metadata: \")\n",
    "        \n",
    "        steps = metadata.get('Steps')\n",
    "        if steps:\n",
    "            print(f\"  ├── Steps: \")\n",
    "            for step in steps.split('\\\\n'):\n",
    "                print(f\"  │      ├── {step}\")\n",
    "        \n",
    "        num_steps = metadata.get('Number of steps')\n",
    "        if num_steps is not None:\n",
    "            print(f\"  ├── Number of steps: {num_steps}\")\n",
    "            \n",
    "        duration = metadata.get('How long did this take?')\n",
    "        if duration:\n",
    "            print(f\"  ├── How long did this take?: {duration}\")\n",
    "            \n",
    "        tools = metadata.get('Tools')\n",
    "        if tools:\n",
    "            print(f\"  ├── Tools:\")\n",
    "            for tool in tools.split('\\\\n'):\n",
    "                print(f\"  │      ├── {tool}\")\n",
    "\n",
    "        num_tools = metadata.get('Number of tools')\n",
    "        if num_tools is not None:\n",
    "            print(f\"  └── Number of tools: {num_tools}\")\n",
    "            \n",
    "    print(\"=\" * 50)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. Load the data\n",
    "json_QA = load_data('metadata.jsonl')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# 2. Choose a task_id to search for. \n",
    "# I'll use the first one from the file as an example.\n",
    "example_task_id = \"8e867cd7-cff9-4e6c-867a-ff5ddc2550be\"\n",
    "\n",
    "# 3. Call the function with the task_id\n",
    "search_and_print_task(example_task_id, json_QA)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json \n",
    "with open('metadata.jsonl', 'r') as f: \n",
    "    json_list = list(f)\n",
    "\n",
    "json_QA = []\n",
    "for json_str in json_list: \n",
    "    json_data = json.loads(json_str)\n",
    "    json_QA.append(json_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# import specific question"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "random_samples = random.sample(json_QA, 2)\n",
    "for sample in random_samples:\n",
    "    print(\"=\" * 50)\n",
    "    print(f\"Task ID: {sample['task_id']}\")\n",
    "    print(f\"Question: {sample['Question']}\")\n",
    "    print(f\"Level: {sample['Level']}\")\n",
    "    print(f\"Final Answer: {sample['Final answer']}\")\n",
    "    print(f\"Annotator Metadata: \")\n",
    "    print(f\"  ├── Steps: \")\n",
    "    for step in sample['Annotator Metadata']['Steps'].split('\\n'):\n",
    "        print(f\"  │      ├── {step}\")\n",
    "    print(f\"  ├── Number of steps: {sample['Annotator Metadata']['Number of steps']}\")\n",
    "    print(f\"  ├── How long did this take?: {sample['Annotator Metadata']['How long did this take?']}\")\n",
    "    print(f\"  ├── Tools:\")\n",
    "    for tool in sample['Annotator Metadata']['Tools'].split('\\n'):\n",
    "        print(f\"  │      ├── {tool}\")\n",
    "    print(f\"  └── Number of tools: {sample['Annotator Metadata']['Number of tools']}\")\n",
    "print(\"=\" * 50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from dotenv import load_dotenv\n",
    "from langchain_huggingface import HuggingFaceEmbeddings\n",
    "from langchain_community.vectorstores import SupabaseVectorStore\n",
    "from supabase.client import Client, create_client\n",
    "\n",
    "\n",
    "load_dotenv()\n",
    "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\") #  dim=768\n",
    "\n",
    "supabase_url = os.environ.get(\"SUPABASE_URL\")\n",
    "supabase_key = os.environ.get(\"SUPABASE_SERVICE_ROLE_KEY\")\n",
    "supabase: Client = create_client(supabase_url, supabase_key)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.schema import Document\n",
    "docs = []\n",
    "cnt = 0 \n",
    "for sample in json_QA:\n",
    "    content = f\"Question : {sample['Question']}\\n\\nFinal answer : {sample['Final answer']}\"\n",
    "    doc = {\n",
    "        \"id\" : cnt,\n",
    "        \"content\" : content,\n",
    "        \"metadata\" : {\n",
    "            \"source\" : sample['task_id']\n",
    "        },\n",
    "        \"embedding\" : embeddings.embed_query(content),\n",
    "    }\n",
    "    docs.append(doc)\n",
    "    cnt += 1\n",
    "\n",
    "# upload the documents to the vector database\n",
    "try:\n",
    "    response = (\n",
    "        supabase.table(\"documents2\")\n",
    "        .insert(docs)\n",
    "        .execute()\n",
    "    )\n",
    "except Exception as exception:\n",
    "    print(\"Error inserting data into Supabase:\", exception)\n",
    "\n",
    "# # Save the documents (a list of dict) into a csv file, and manually upload it to Supabase\n",
    "# import pandas as pd\n",
    "# df = pd.DataFrame(docs)\n",
    "# df.to_csv('supabase_docs.csv',index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# add items to vector database\n",
    "vector_store = SupabaseVectorStore(\n",
    "    client=supabase,\n",
    "    embedding= embeddings,\n",
    "    table_name=\"documents2\",\n",
    "    query_name=\"match_documents_2\",\n",
    ")\n",
    "retriever = vector_store.as_retriever()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9",
   "metadata": {},
   "outputs": [],
   "source": [
    "query = \"On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?\"\n",
    "# matched_docs = vector_store.similarity_search(query, k=2)\n",
    "docs = retriever.invoke(query)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "10",
   "metadata": {},
   "outputs": [],
   "source": [
    "docs[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11",
   "metadata": {},
   "outputs": [],
   "source": [
    "# list of the tools used in all the samples\n",
    "from collections import Counter, OrderedDict\n",
    "\n",
    "tools = []\n",
    "for sample in json_QA:\n",
    "    for tool in sample['Annotator Metadata']['Tools'].split('\\n'):\n",
    "        tool = tool[2:].strip().lower()\n",
    "        if tool.startswith(\"(\"):\n",
    "            tool = tool[11:].strip()\n",
    "        tools.append(tool)\n",
    "tools_counter = OrderedDict(Counter(tools))\n",
    "print(\"List of tools used in all samples:\")\n",
    "print(\"Total number of tools used:\", len(tools_counter))\n",
    "for tool, count in tools_counter.items():\n",
    "    print(f\"  ├── {tool}: {count}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}