# Load the GAIA Q&A records from the JSON-Lines metadata file.
# Each line of metadata.jsonl is one JSON object describing a single task.
import json

json_QA = []
with open('metadata.jsonl', 'r') as f:
    for line in f:
        json_QA.append(json.loads(line))
# Spot-check the dataset: draw one random record and pretty-print it as a tree
# so we can eyeball the schema (task_id, Question, Level, answer, annotator notes).
import random

for sample in random.sample(json_QA, 1):
    meta = sample['Annotator Metadata']
    print("=" * 50)
    print(f"Task ID: {sample['task_id']}")
    print(f"Question: {sample['Question']}")
    print(f"Level: {sample['Level']}")
    print(f"Final Answer: {sample['Final answer']}")
    print(f"Annotator Metadata: ")
    print(f" ├── Steps: ")
    for step in meta['Steps'].split('\n'):
        print(f" │ ├── {step}")
    print(f" ├── Number of steps: {meta['Number of steps']}")
    print(f" ├── How long did this take?: {meta['How long did this take?']}")
    print(f" ├── Tools:")
    for tool in meta['Tools'].split('\n'):
        print(f" │ ├── {tool}")
    print(f" └── Number of tools: {meta['Number of tools']}")
print("=" * 50)
# --- Embeddings + Supabase client setup -------------------------------------
import os
from dotenv import load_dotenv
from langchain.schema import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import SupabaseVectorStore
from supabase.client import Client, create_client


load_dotenv()
# all-mpnet-base-v2 yields 768-dim vectors; the Supabase `documents` table and
# match function must have been created with the same dimensionality.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2") # dim=768

supabase_url = os.environ.get("SUPABASE_URL")
supabase_key = os.environ.get("SUPABASE_SERVICE_ROLE_KEY")
supabase: Client = create_client(supabase_url, supabase_key)

# --- Build one row per Q&A pair and upload to the `documents` table ---------
# NOTE(review): renamed from `docs` to `doc_records` — the old name was later
# shadowed by the retrieval results (`docs = retriever.invoke(...)`), a
# hidden-state hazard where the same name held two unrelated kinds of objects.
doc_records = []
for idx, sample in enumerate(json_QA):
    content = f"Question : {sample['Question']}\n\nFinal answer : {sample['Final answer']}"
    doc_records.append({
        "id": idx,
        "content": content,
        "metadata": {
            "source": sample['task_id']  # task_id lets us trace a hit back to the dataset
        },
        "embedding": embeddings.embed_query(content),
    })

# Upload the documents to the vector database (best-effort: report the error
# and continue so the retrieval demo below can still run against rows that
# were inserted previously).
try:
    response = (
        supabase.table("documents")
        .insert(doc_records)
        .execute()
    )
except Exception as exception:
    print("Error inserting data into Supabase:", exception)

# # Save the documents (a list of dict) into a csv file, and manually upload it to Supabase
# import pandas as pd
# df = pd.DataFrame(doc_records)
# df.to_csv('supabase_docs.csv',index=False)

# --- Retrieval over the uploaded vectors ------------------------------------
vector_store = SupabaseVectorStore(
    client=supabase,
    embedding=embeddings,
    table_name="documents",
    query_name="match_documents_langchain",
)
retriever = vector_store.as_retriever()

query = "On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?"
# matched_docs = vector_store.similarity_search(query, k=2)
docs = retriever.invoke(query)
# Show the top-ranked Document for the query: page_content embeds both the
# question and its final answer, and metadata['source'] carries the task_id.
docs[0]
# Tally every tool cited in the annotator metadata across all samples.
# Tool lines look like "1. Web browser" (see the sample printout above); the
# [2:] slice drops the "N. "-style list marker before lowercasing.
from collections import Counter

tools = []
for sample in json_QA:
    for raw in sample['Annotator Metadata']['Tools'].split('\n'):
        # Samples that needed no tools record the literal "None"; the blind
        # [2:] slice previously mangled that into a bogus "ne" entry
        # (it showed up as "ne: 9" in the tally) — skip it instead.
        if raw.strip().lower() == 'none':
            continue
        tool = raw[2:].strip().lower()
        if tool.startswith("("):
            # Drop a leading parenthesised qualifier, presumably "(optional) "
            # (11 chars) — TODO confirm this width against the raw metadata.
            tool = tool[11:].strip()
        tools.append(tool)

# Counter preserves insertion order, so the redundant OrderedDict wrapper is gone.
tools_counter = Counter(tools)
print("List of tools used in all samples:")
# len() gives the number of *distinct* tool names; the old label
# ("Total number of tools used") wrongly suggested total mentions.
print("Total number of distinct tools:", len(tools_counter))
for tool, count in tools_counter.items():
    print(f" ├── {tool}: {count}")
"nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.1" } }, "nbformat": 4, "nbformat_minor": 5 }