Spaces:

dleandro
/

agent-gaia

Sleeping

File size: 32,831 Bytes

0feea5d

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "1e417d15",
   "metadata": {},
   "source": [
    "# **Explore GAIA Questions Data**\n",
    "\n",
    "Explore the `metadata.jsonl` file in order to gain a deeper comprehension of the dataset."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8a696d11",
   "metadata": {},
   "source": [
    "#### **Imports**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 183,
   "id": "d3e11d83",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import json\n",
    "import random\n",
    "import psycopg2\n",
    "import pandas as pd\n",
    "from collections import Counter, OrderedDict\n",
    "\n",
    "from dotenv import load_dotenv\n",
    "from huggingface_hub import login\n",
    "\n",
    "from langchain.schema import Document\n",
    "from langchain_community.retrievers import BM25Retriever\n",
    "from langchain.tools import Tool, StructuredTool\n",
    "from langchain_core.tools import tool\n",
    "from langchain_huggingface import HuggingFaceEmbeddings\n",
    "from langchain_community.vectorstores import SupabaseVectorStore\n",
    "\n",
    "from supabase import Client, create_client\n",
    "from supabase.client import ClientOptions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 194,
   "id": "17734566",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of QAs: 165\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'task_id': 'c61d22de-5f6c-4958-a7f6-5e9707bd3466',\n",
       " 'Question': 'A paper about AI regulation that was originally submitted to arXiv.org in June 2022 shows a figure with three axes, where each axis has a label word at both ends. Which of these words is used to describe a type of society in a Physics and Society article submitted to arXiv.org on August 11, 2016?',\n",
       " 'Level': 2,\n",
       " 'Final answer': 'egalitarian',\n",
       " 'file_name': '',\n",
       " 'Annotator Metadata': {'Steps': '1. Go to arxiv.org and navigate to the Advanced Search page.\\n2. Enter \"AI regulation\" in the search box and select \"All fields\" from the dropdown.\\n3. Enter 2022-06-01 and 2022-07-01 into the date inputs, select \"Submission date (original)\", and submit the search.\\n4. Go through the search results to find the article that has a figure with three axes and labels on each end of the axes, titled \"Fairness in Agreement With European Values: An Interdisciplinary Perspective on AI Regulation\".\\n5. Note the six words used as labels: deontological, egalitarian, localized, standardized, utilitarian, and consequential.\\n6. Go back to arxiv.org\\n7. Find \"Physics and Society\" and go to the page for the \"Physics and Society\" category.\\n8. Note that the tag for this category is \"physics.soc-ph\".\\n9. Go to the Advanced Search page.\\n10. Enter \"physics.soc-ph\" in the search box and select \"All fields\" from the dropdown.\\n11. Enter 2016-08-11 and 2016-08-12 into the date inputs, select \"Submission date (original)\", and submit the search.\\n12. Search for instances of the six words in the results to find the paper titled \"Phase transition from egalitarian to hierarchical societies driven by competition between cognitive and social constraints\", indicating that \"egalitarian\" is the correct answer.',\n",
       "  'Number of steps': '12',\n",
       "  'How long did this take?': '8 minutes',\n",
       "  'Tools': '1. Web browser\\n2. Image recognition tools (to identify and parse a figure with three axes)',\n",
       "  'Number of tools': '2'}}"
      ]
     },
     "execution_count": 194,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the GAIA metadata file: one JSON object per line (JSON Lines).\n",
    "# The encoding is pinned to UTF-8 because the questions contain non-ASCII\n",
    "# characters (e.g. curly quotes) that fail to decode on platforms whose\n",
    "# default encoding is not UTF-8.\n",
    "with open(\"metadata.jsonl\", encoding = \"utf-8\") as dataset_file:\n",
    "    # Ignore blank lines so json.loads never receives an empty string\n",
    "    json_list = [line for line in dataset_file if line.strip()]\n",
    "\n",
    "QAs = [json.loads(qa) for qa in json_list]\n",
    "print(f\"Number of QAs: {len(QAs)}\")\n",
    "QAs[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "id": "40328df2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TaskId: 7a4a336d-dcfa-45a0-b014-824c7619e8de\n",
      "Level: 2\n",
      "Question: At the two-minute mark in the YouTube video uploaded by the channel “GameGrumps” on May 14, 2017 as part of their playthrough of the game Mario Kart 8 Deluxe, the shows’ hosts are competing on one of the game’s racetracks. What was the world record time for that track in the game’s 150cc mode as of June 7, 2023? Express your answer in minutes and seconds, rounding the seconds to the nearest hundredth, e.g. 1:01.001.\n",
      "Ground Truth: 1:41.614\n",
      "Additional file: \n",
      "Annotator Metadata:\n",
      " - Steps:\n",
      "    1. Search the web for “gamegrumps mario kart 8 deluxe may 14 2017”.\n",
      "    2. Click on the YouTube video result.\n",
      "    3. Navigate to two minutes into the video.\n",
      "    4. Scroll further back until I see the name of the racecourse, Yoshi Circuit.\n",
      "    5. Search the web for “mario kart 8 deluxe yoshi circuit world record 150cc”\n",
      "    6. Scroll down until I find a reliable world record listing site.\n",
      "    7. Navigate through the site until I find the record that meets the specified criteria.\n",
      "    8. Read the date the record was set to confirm that it applies to the question’s specified date.\n",
      " - Number of steps: 8\n",
      " - How long did this take: 5-10 minutes\n",
      " - Tools [4]:\n",
      "    1. Search engine\n",
      "    2. Web browser\n",
      "    3. YouTube\n",
      "    4. OCR\n",
      "- Number of tools: 4\n"
     ]
    }
   ],
   "source": [
    "# Pretty-print one randomly chosen QA together with its annotator metadata\n",
    "random_samples = random.sample(QAs, 1)\n",
    "for samp in random_samples:\n",
    "    print(\n",
    "        f\"TaskId: {samp['task_id']}\\nLevel: {samp['Level']}\\n\"\n",
    "        f\"Question: {samp['Question']}\\nGround Truth: {samp['Final answer']}\\n\"\n",
    "        f\"Additional file: {samp['file_name']}\"\n",
    "    )\n",
    "    print(\"Annotator Metadata:\")\n",
    "    print(\" - Steps:\")\n",
    "    metadata = samp['Annotator Metadata']\n",
    "    # Steps and tools are stored as single strings with one entry per line\n",
    "    steps = metadata['Steps'].split(\"\\n\")\n",
    "    for step in steps:\n",
    "        print(f\"    {step}\")\n",
    "    print(f\" - Number of steps: {metadata['Number of steps']}\")\n",
    "    print(f\" - How long did this take: {metadata['How long did this take?']}\")\n",
    "    tools = metadata['Tools'].split(\"\\n\")\n",
    "    print(f\" - Tools [{len(tools)}]:\")\n",
    "    for t in tools:\n",
    "        print(f\"    {t}\")\n",
    "    # Use the same ' - ' prefix as the sibling entries (the leading space was missing)\n",
    "    print(f\" - Number of tools: {metadata['Number of tools']}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "52ca1954",
   "metadata": {},
   "source": [
    "As we can see, the `Dataset` contains:\n",
    "\n",
    "- **task_id** : The unique identifier for the task\n",
    "\n",
    "- **Level** : Difficulty level of the GAIA task\n",
    "\n",
    "- **Question** : The specific GAIA task\n",
    "\n",
    "- **Final answer** : The ground truth for the GAIA task\n",
    "\n",
    "- **file_name** : The additional file related to the task\n",
    "\n",
    "- **Annotator Metadata** : \n",
    "\n",
    "    - **Steps** : The **sequence** of steps followed to accomplish the correct answer\n",
    "\n",
    "    - **Number of steps** : Total number of steps to accomplish the correct answer\n",
    "\n",
    "    - **Tools** : The list of `tools` used to answer the question/task\n",
    "\n",
    "    - **Number of tools** : Total number of tools used"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ccc5f181",
   "metadata": {},
   "source": [
    "**GAIA Agent** must be an `Agentic RAG`. This way the agent can combine its own reasoning with a retrieval system that accesses the QAs `dataset`."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bfe371b8",
   "metadata": {},
   "source": [
    "#### **Explore Dataset Tools Types**\n",
    "\n",
    "Since the *`dataset`* provides, for each question, the list of `Tools` used to reach the final answer, it is useful to explore these tools in order to define an efficient and relevant set of tools for our agent to incorporate:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 169,
   "id": "f470a028",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total number of Tools used in entire set: 55\n",
      "Tools used in QAs:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Tool</th>\n",
       "      <th>Count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>SEARCH ENGINE</td>\n",
       "      <td>35</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>CALCULATOR</td>\n",
       "      <td>33</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>WEB BROWSER</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>NE</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>IMAGE RECOGNITION TOOLS</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>PDF VIEWER</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>A CALCULATOR</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>OCR</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>VIDEO RECOGNITION TOOLS</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>MICROSOFT EXCEL</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>PDF ACCESS</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>MICROSOFT EXCEL / GOOGLE SHEETS</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>IMAGE RECOGNITION</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>A SPEECH-TO-TEXT TOOL</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>A SEARCH ENGINE</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>IMAGE RECOGNITION/OCR</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>GOOGLE MAPS</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>SPREADSHEET EDITOR</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>TOOLS REQUIRED</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>B BROWSER</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               Tool  Count\n",
       "0                     SEARCH ENGINE     35\n",
       "1                        CALCULATOR     33\n",
       "2                       WEB BROWSER     12\n",
       "3                                NE      9\n",
       "4           IMAGE RECOGNITION TOOLS      8\n",
       "5                        PDF VIEWER      6\n",
       "6                      A CALCULATOR      5\n",
       "7                               OCR      3\n",
       "8           VIDEO RECOGNITION TOOLS      3\n",
       "9                   MICROSOFT EXCEL      2\n",
       "10                       PDF ACCESS      2\n",
       "11  MICROSOFT EXCEL / GOOGLE SHEETS      2\n",
       "12                IMAGE RECOGNITION      2\n",
       "13            A SPEECH-TO-TEXT TOOL      2\n",
       "14                  A SEARCH ENGINE      1\n",
       "15            IMAGE RECOGNITION/OCR      1\n",
       "16                      GOOGLE MAPS      1\n",
       "17               SPREADSHEET EDITOR      1\n",
       "18                   TOOLS REQUIRED      1\n",
       "19                        B BROWSER      1"
      ]
     },
     "execution_count": 169,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count how often each tool is listed across all annotated QAs.\n",
    "tools_qa = []\n",
    "for qa in QAs:\n",
    "    for t in qa[\"Annotator Metadata\"][\"Tools\"].split(\"\\n\"):\n",
    "        # Strip the list numbering ('1. ', '10. ', ...) only when it is present;\n",
    "        # a fixed t[2:] slice mangles unnumbered entries such as 'None' -> 'NE'.\n",
    "        tool_qa = re.sub(r\"^\\s*\\d+\\.\\s*\", \"\", t).strip().upper()\n",
    "        # Remove parenthesised explanations that follow the tool name\n",
    "        tool_qa = re.sub(r\"\\s*\\([^)]*\\)\\s*\", \"\", tool_qa)\n",
    "        if tool_qa:\n",
    "            # Append inside the inner loop so every tool is counted,\n",
    "            # not only the last tool of each QA.\n",
    "            tools_qa.append(tool_qa)\n",
    "tools_counter = OrderedDict(Counter(tools_qa))\n",
    "\n",
    "print(f\"Total number of Tools used in entire set: {len(tools_counter)}\")\n",
    "print(\"Tools used in QAs:\")\n",
    "# One row per distinct tool, most frequently used first\n",
    "df = (\n",
    "    pd.DataFrame(list(tools_counter.items()), columns = [\"Tool\", \"Count\"])\n",
    "    .sort_values(\"Count\", ascending = False)\n",
    "    .reset_index(drop = True)\n",
    ")\n",
    "df.head(20)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "15be46b0",
   "metadata": {},
   "source": [
    "#### **Tools to be Implemented**\n",
    "\n",
    "- `Search Engine` (arXiv, Wikipedia, DuckDuckGo)\n",
    "\n",
    "- `Calculator` (add, subtract, divide, multiply, modulus, etc.)\n",
    "\n",
    "- `Access` and `Download Files` from Web\n",
    "\n",
    "- `Excel`/`Google Sheets`: Process Downloaded files"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e7cec064",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5ea41219",
   "metadata": {},
   "source": [
    "## **Project Structure: GAIA Agent**\n",
    "\n",
    "To implement our agent within a `Hugging Face Space` as a structured `Python` project, with clean and modular code, it is recommended to split the functionality across separate files. For instance, the structure could be:\n",
    "\n",
    "- `tools.py` - To provide the auxiliary tools for the GAIA Agent\n",
    "\n",
    "- `retriever.py` - To implement the retrieval functions that support access to the knowledge base (*dataset*)\n",
    "\n",
    "- `agent.py` - To implement the agent\n",
    "\n",
    "- `app.py` - To integrate all the components into a fully functional agent"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "62e07469",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "31ede212",
   "metadata": {},
   "source": [
    "## **Dataset Loading and Document Creation**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "81c53670",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One LangChain Document per QA: question and answer as text,\n",
    "# task id and difficulty level as metadata.\n",
    "docs = [\n",
    "    Document(\n",
    "        page_content = f\"Question: {qa['Question']}\\nFinal answer: {qa['Final answer']}\",\n",
    "        metadata = dict(task_id = qa[\"task_id\"], level = qa[\"Level\"]),\n",
    "    )\n",
    "    for qa in QAs\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "44d0020e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Document(metadata={'task_id': 'c61d22de-5f6c-4958-a7f6-5e9707bd3466', 'level': 2}, page_content='Question: A paper about AI regulation that was originally submitted to arXiv.org in June 2022 shows a figure with three axes, where each axis has a label word at both ends. Which of these words is used to describe a type of society in a Physics and Society article submitted to arXiv.org on August 11, 2016?\\nFinal answer: egalitarian')"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the first Document to check its page content and metadata\n",
    "docs[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dd2479f3",
   "metadata": {},
   "source": [
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a9bf2b7f",
   "metadata": {},
   "source": [
    "## **Retrieval Tool Creation**\n",
    "\n",
    "There are $2$ options for this:\n",
    "\n",
    "1. ***Semantic Search*** - `BM25Retriever`\n",
    "2. ***Vector Search*** - dense embeddings queried from a vector store\n",
    "\n",
    "Let's explore both with the following methods and tools:\n",
    "\n",
    "- **Semantic Search**: `BM25Retriever` (strictly speaking a keyword-based ranking rather than an embedding-based one)\n",
    "- **Vector Search**: `bge-base-en-v1.5` for Embeddings and `Supabase` as *Vector Store*"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "27ae6dcc",
   "metadata": {},
   "source": [
    "### **Retriever for Semantic Search**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "920ba41c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# BM25 retriever over the QA documents, limited to the 3 best matches per query\n",
    "bm25_retriever = BM25Retriever.from_documents(documents = docs)\n",
    "bm25_retriever.k = 3\n",
    "\n",
    "# The docstring below is left unchanged on purpose: StructuredTool.from_function\n",
    "# is given this function directly, so its text is part of the tool definition.\n",
    "def retrieve_semantic(query: str) -> str:\n",
    "    \"\"\"\n",
    "    Retrieves information about QA's based on semantic search.\n",
    "\n",
    "    Args:\n",
    "        query (str): The user query.\n",
    "\n",
    "    Returns:\n",
    "        str: The result of the semantic search\n",
    "    \"\"\"\n",
    "    matches = bm25_retriever.invoke(query)\n",
    "    # Guard clause: nothing retrieved for this query\n",
    "    if not matches:\n",
    "        return \"No matching information found.\"\n",
    "    # Blank line between the page contents of the matched documents\n",
    "    return \"\\n\\n\".join([match.page_content for match in matches])\n",
    "\n",
    "tool_retrieve_semantic = StructuredTool.from_function(retrieve_semantic)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "d69970f1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Question: A paper about AI regulation that was originally submitted to arXiv.org in June 2022 shows a figure with three axes, where each axis has a label word at both ends. Which of these words is used to describe a type of society in a Physics and Society article submitted to arXiv.org on August 11, 2016?\n",
      "Final answer: egalitarian\n",
      "\n",
      "Question: An office held a Secret Santa gift exchange where each of its twelve employees was assigned one other employee in the group to present with a gift. Each employee filled out a profile including three likes or hobbies. On the day of the gift exchange, only eleven gifts were given, each one specific to one of the recipient's interests. Based on the information in the document, who did not give a gift?\n",
      "Final answer: Fred\n",
      "\n",
      "Question: On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?\n",
      "Final answer: 80GSFC21M0002\n",
      "Question: A paper about AI regulation that was originally submitted to arXiv.org in June 2022 shows a figure with three axes, where each axis has a label word at both ends. Which of these words is used to describe a type of society in a Physics and Society article submitted to arXiv.org on August 11, 2016?\n",
      "Final answer: egalitarian\n",
      "\n",
      "Question: An office held a Secret Santa gift exchange where each of its twelve employees was assigned one other employee in the group to present with a gift. Each employee filled out a profile including three likes or hobbies. On the day of the gift exchange, only eleven gifts were given, each one specific to one of the recipient's interests. Based on the information in the document, who did not give a gift?\n",
      "Final answer: Fred\n",
      "\n",
      "Question: On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?\n",
      "Final answer: 80GSFC21M0002\n"
     ]
    }
   ],
   "source": [
    "# Comparing outputs: call the retriever once through the StructuredTool wrapper\n",
    "# and once as a plain function, using the first question as the query.\n",
    "print(tool_retrieve_semantic.invoke(QAs[0]['Question']))\n",
    "print(retrieve_semantic(QAs[0]['Question']))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fbb40ebd",
   "metadata": {},
   "source": [
    "### **Retriever for Vector Search**\n",
    "\n",
    "For this we must create:\n",
    "- **Table** in `supabase` with extension for `pgvector`\n",
    "- RLS for security"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "d152a0b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Log in to the Hugging Face Hub so the embedding model can be downloaded.\n",
    "# The token is read from the environment after loading the .env file.\n",
    "load_dotenv()\n",
    "hf_token = os.environ.get(\"HF_API_TOKEN\")\n",
    "if not hf_token:\n",
    "    print(\"Warning: No Hugging Face token found.\")\n",
    "else:\n",
    "    login(token = hf_token)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "e379ef2c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "768"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# MODEL_NAME = \"sentence-transformers/all-mpnet-base-v2\"\n",
    "MODEL_NAME = \"BAAI/bge-base-en-v1.5\"\n",
    "embedding_model = HuggingFaceEmbeddings(model_name = MODEL_NAME)\n",
    "# NOTE(review): `_client` is a private attribute of the wrapper, presumably the underlying\n",
    "# sentence-transformers model - confirm it still exists when upgrading langchain_huggingface.\n",
    "model = embedding_model._client\n",
    "# Embedding width; reused later to size the VECTOR columns in the SQL definitions\n",
    "dim = model.get_sentence_embedding_dimension()\n",
    "dim"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "612935dd",
   "metadata": {},
   "source": [
    "#### **Supabase (PostgreSQL) Table Creation**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 170,
   "id": "ef54ff9f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Open a direct PostgreSQL connection to the Supabase database.\n",
    "# All connection settings come from environment variables.\n",
    "conn = psycopg2.connect(\n",
    "    host = os.environ.get(\"SUPABASE_DB_HOST\"),\n",
    "    port = os.environ.get(\"SUPABASE_DB_PORT\"),\n",
    "    dbname = os.environ.get(\"SUPABASE_DB_NAME\"),\n",
    "    user = os.environ.get(\"SUPABASE_DB_USER\"),\n",
    "    password = os.environ.get(\"SUPABASE_DB_PASSWORD\"),\n",
    ")\n",
    "# Autocommit: every statement executed through the cursor is committed immediately\n",
    "conn.autocommit = True\n",
    "cursor = conn.cursor()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 171,
   "id": "13d1774e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Table documents_tbl' successfully created and ready to insert embeddings.\n"
     ]
    }
   ],
   "source": [
    "TBL_NAME = \"documents_tbl\"\n",
    "# DDL for the documents table. DROP + CREATE resets the table on every run; the\n",
    "# embedding column is sized with `dim`, the dimension reported by the embedding model.\n",
    "create_table = f\"\"\"\n",
    "DROP TABLE IF EXISTS {TBL_NAME};\n",
    "CREATE TABLE IF NOT EXISTS {TBL_NAME} (\n",
    "    id BIGINT GENERATED ALWAYS AS IDENTITY PRIMARY KEY,\n",
    "    content TEXT,\n",
    "    metadata JSONB,\n",
    "    embedding VECTOR({dim})\n",
    ");\n",
    "\"\"\"\n",
    "try:\n",
    "    # The vector extension is installed into the `extensions` schema\n",
    "    cursor.execute(\"CREATE SCHEMA IF NOT EXISTS extensions;\")\n",
    "    cursor.execute(\"CREATE EXTENSION IF NOT EXISTS vector WITH SCHEMA extensions;\")\n",
    "    cursor.execute(create_table)\n",
    "    cursor.execute(f\"ALTER TABLE {TBL_NAME} ENABLE ROW LEVEL SECURITY;\")\n",
    "    print(f\"Table '{TBL_NAME}' successfully created and ready to insert embeddings.\")\n",
    "except Exception as e:\n",
    "    conn.rollback()\n",
    "    # f-string so the real error is reported instead of the literal text '{e}'\n",
    "    print(f\"Couldn't create the Postgresql table. Error: {e}\")\n",
    "    # Bare raise keeps the original traceback\n",
    "    raise\n",
    "\n",
    "# Drop the read policy first so re-running the cell does not fail on an existing policy\n",
    "cursor.execute(f\"\"\"\n",
    "    DROP POLICY IF EXISTS \"Allow read to all\" ON {TBL_NAME};\n",
    "\"\"\")\n",
    "\n",
    "# With RLS enabled, this policy allows SELECT on every row (USING (true))\n",
    "cursor.execute(f\"\"\"\n",
    "CREATE POLICY \"Allow read to all\"\n",
    "ON {TBL_NAME}\n",
    "FOR SELECT\n",
    "USING (true);\n",
    "\"\"\")\n",
    "# cursor.close()\n",
    "# conn.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "667f0966",
   "metadata": {},
   "source": [
    "#### **Function to Search Documents in Supabase**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 172,
   "id": "d0fc9a37",
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL definition of the `match_documents` function used for similarity search.\n",
    "# `{dim}` and `{TBL_NAME}` are interpolated from the notebook; the doubled braces\n",
    "# in '{{}}' render as a literal '{}' (empty JSONB filter) inside the f-string.\n",
    "# The function returns the rows whose metadata contains `filter`, ordered by the\n",
    "# pgvector `<=>` distance to `query_embedding` (cosine distance per the pgvector\n",
    "# docs) and reports `1 - distance` as `similarity`.\n",
    "df_func_def = f\"\"\"\n",
    "CREATE FUNCTION match_documents (\n",
    "    query_embedding VECTOR({dim}),\n",
    "    filter JSONB DEFAULT '{{}}',\n",
    "    match_count INT DEFAULT 5\n",
    ") RETURNS TABLE (\n",
    "    id BIGINT,\n",
    "    content TEXT,\n",
    "    metadata JSONB,\n",
    "    similarity FLOAT\n",
    ") LANGUAGE plpgsql\n",
    "SET search_path = 'extensions', 'public'\n",
    "AS $$\n",
    "BEGIN\n",
    "    RETURN QUERY\n",
    "    SELECT\n",
    "        {TBL_NAME}.id,\n",
    "        {TBL_NAME}.content,\n",
    "        {TBL_NAME}.metadata,\n",
    "        1 - ({TBL_NAME}.embedding <=> query_embedding) AS similarity\n",
    "    FROM {TBL_NAME}\n",
    "    WHERE {TBL_NAME}.metadata @> filter\n",
    "    ORDER BY {TBL_NAME}.embedding <=> query_embedding\n",
    "    LIMIT match_count;\n",
    "END;\n",
    "$$;\n",
    "\"\"\"\n",
    "\n",
    "# Drop any previous version first: the CREATE FUNCTION above is not CREATE OR REPLACE\n",
    "cursor.execute(\"DROP FUNCTION IF EXISTS match_documents(VECTOR, JSONB, INT);\")\n",
    "cursor.execute(df_func_def)\n",
    "# Privileges: `anon` may read the table; `service_role` and `anon` may call the function\n",
    "cursor.execute(f\"GRANT SELECT ON {TBL_NAME} TO anon;\")\n",
    "cursor.execute(\"GRANT EXECUTE ON FUNCTION match_documents(VECTOR, JSONB, INT) TO service_role;\")\n",
    "cursor.execute(\"GRANT EXECUTE ON FUNCTION match_documents(VECTOR, JSONB, INT) TO anon;\")\n",
    "\n",
    "# Release the cursor and the connection used for the SQL statements\n",
    "cursor.close()\n",
    "conn.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8f8345e4",
   "metadata": {},
   "source": [
    "#### **Data Insertion into Supabase Table**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 173,
   "id": "16cd7045",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Embedding first 5 dims: [0.006851373240351677, 0.019783932715654373, -0.005305973347276449, 0.04809008538722992, 0.03095371648669243]\n"
     ]
    }
   ],
   "source": [
    "# Build one record per QA (content, metadata, embedding) for insertion into Supabase.\n",
    "docs_qa = []\n",
    "for i, qa in enumerate(QAs):\n",
    "    question = qa.get(\"Question\", \"\").strip()\n",
    "    final_answer = qa.get(\"Final answer\", \"\").strip()\n",
    "    # Normalise a missing or None file name to an empty string; otherwise has_file\n",
    "    # would be True and the text 'None' would end up in the embedded content.\n",
    "    additional_file = qa.get(\"file_name\") or \"\"\n",
    "    has_file = bool(additional_file)\n",
    "\n",
    "    content = f\"Question: {question}\\n\\nAdditional file: {additional_file}\\n\\nFinal answer: {final_answer}\"\n",
    "    embedding = embedding_model.embed_query(content)\n",
    "    doc_qa = {\n",
    "        \"content\": content,\n",
    "        \"metadata\": {\n",
    "            \"task_id\": qa.get(\"task_id\"),\n",
    "            \"has_file\": has_file\n",
    "        },\n",
    "        \"embedding\": embedding\n",
    "    }\n",
    "\n",
    "    # Show a short slice of the first embedding as a sanity check\n",
    "    if i == 0:\n",
    "        print(f\"Embedding first 5 dims: {embedding[:5]}\")\n",
    "\n",
    "    docs_qa.append(doc_qa)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1278686d",
   "metadata": {},
   "source": [
    "Instantiate **Supabase** `Client`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 174,
   "id": "edc9e4f5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Supabase project URL and the two API keys, all read from the environment\n",
    "supabase_url = os.getenv(\"SUPABASE_URL\")\n",
    "supabase_key = os.getenv(\"SUPABASE_KEY\")\n",
    "supabase_anon_key = os.getenv(\"SUPABASE_ANON_KEY\")\n",
    "\n",
    "# Client authenticated with SUPABASE_KEY\n",
    "supabase: Client = create_client(\n",
    "    supabase_url,\n",
    "    supabase_key,\n",
    "    options = ClientOptions(schema = \"public\"),\n",
    ")\n",
    "\n",
    "# Client authenticated with the anon key\n",
    "supabase_public: Client = create_client(\n",
    "    supabase_url,\n",
    "    supabase_anon_key,\n",
    "    options = ClientOptions(schema = \"public\"),\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6699531f",
   "metadata": {},
   "source": [
    "Upload *Documents* to the `Vector Database` (*Supabase*):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 175,
   "id": "52df5e9a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Insert all QA records (content, metadata, embedding) in a single request\n",
    "try:\n",
    "    res = (\n",
    "        supabase\n",
    "        .table(TBL_NAME)\n",
    "        .insert(docs_qa)\n",
    "        .execute()\n",
    "    )\n",
    "    # Compare against the list that was actually sent (docs_qa), not the BM25 `docs` list\n",
    "    if len(res.data) != len(docs_qa):\n",
    "        print(f\"Warning: Only {len(res.data)} out of {len(docs_qa)} docs were inserted.\")\n",
    "except Exception as e:\n",
    "    print(f\"Error inserting documents into Supabase:\\n{e}\")\n",
    "    raise"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "592f110d",
   "metadata": {},
   "source": [
    "#### **Supabase Vector Store**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "3684e86d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Vector store backed by the anon-key client (`supabase_public`); searches go\n",
    "# through the `match_documents` SQL function.\n",
    "# NOTE(review): the following cell rebinds `vector_store` and `vector_retriever`\n",
    "# using the `supabase` client, so on a top-to-bottom run these objects are replaced.\n",
    "vector_store = SupabaseVectorStore(\n",
    "    client = supabase_public,\n",
    "    embedding = embedding_model,\n",
    "    table_name = TBL_NAME,\n",
    "    query_name = \"match_documents\"\n",
    ")\n",
    "vector_retriever = vector_store.as_retriever()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 176,
   "id": "459d8049",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Vector store using the `supabase` client (created with SUPABASE_KEY); searches go\n",
    "# through the `match_documents` SQL function on the `TBL_NAME` table.\n",
    "vector_store = SupabaseVectorStore(\n",
    "    client = supabase,\n",
    "    embedding = embedding_model,\n",
    "    table_name = TBL_NAME,\n",
    "    query_name = \"match_documents\"\n",
    ")\n",
    "# Retriever interface over the store, used for the query below\n",
    "vector_retriever = vector_store.as_retriever()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 177,
   "id": "5cfb08cf",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Question:\n",
      "What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?\n",
      "\n",
      "Answer:\n",
      "Louvrier\n"
     ]
    }
   ],
   "source": [
    "# Draw one random QA; its question is the test query and its answer the reference\n",
    "(r_samp,) = random.sample(QAs, 1)\n",
    "query, r_ans = r_samp['Question'], r_samp['Final answer']\n",
    "\n",
    "print(f\"Question:\\n{query}\\n\\nAnswer:\\n{r_ans}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 178,
   "id": "539c374b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Document(metadata={'task_id': 'cabe07ed-9eca-40ea-8ead-410ef5e83f91', 'has_file': False}, page_content=\"Question: What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?\\n\\nAdditional file: \\n\\nFinal answer: Louvrier\")"
      ]
     },
     "execution_count": 178,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Query the vector retriever with the sampled question and show the first returned document\n",
    "cntx = vector_retriever.invoke(query)\n",
    "cntx[0]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}