# Load the GAIA Q&A records from the JSON-Lines metadata file.
# Each line of metadata.jsonl is one JSON object describing a single task.
import json

json_QA = []
with open('metadata.jsonl', 'r') as f:
    for line in f:
        json_QA.append(json.loads(line))
# Spot-check the dataset: draw one random record and pretty-print it as a tree
# so we can eyeball the schema (task_id, Question, Level, answer, annotator notes).
import random

for sample in random.sample(json_QA, 1):
    meta = sample['Annotator Metadata']
    print("=" * 50)
    print(f"Task ID: {sample['task_id']}")
    print(f"Question: {sample['Question']}")
    print(f"Level: {sample['Level']}")
    print(f"Final Answer: {sample['Final answer']}")
    print(f"Annotator Metadata: ")
    print(f" ├── Steps: ")
    for step in meta['Steps'].split('\n'):
        print(f" │ ├── {step}")
    print(f" ├── Number of steps: {meta['Number of steps']}")
    print(f" ├── How long did this take?: {meta['How long did this take?']}")
    print(f" ├── Tools:")
    for tool in meta['Tools'].split('\n'):
        print(f" │ ├── {tool}")
    print(f" └── Number of tools: {meta['Number of tools']}")
print("=" * 50)
# --- Embeddings + Supabase client setup -------------------------------------
import os
from dotenv import load_dotenv
from langchain.schema import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import SupabaseVectorStore
from supabase.client import Client, create_client


load_dotenv()
# all-mpnet-base-v2 yields 768-dim vectors; the Supabase `documents` table and
# match function must have been created with the same dimensionality.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2") # dim=768

supabase_url = os.environ.get("SUPABASE_URL")
supabase_key = os.environ.get("SUPABASE_SERVICE_ROLE_KEY")
supabase: Client = create_client(supabase_url, supabase_key)

# --- Build one row per Q&A pair and upload to the `documents` table ---------
# NOTE(review): renamed from `docs` to `doc_records` — the old name was later
# shadowed by the retrieval results (`docs = retriever.invoke(...)`), a
# hidden-state hazard where the same name held two unrelated kinds of objects.
doc_records = []
for idx, sample in enumerate(json_QA):
    content = f"Question : {sample['Question']}\n\nFinal answer : {sample['Final answer']}"
    doc_records.append({
        "id": idx,
        "content": content,
        "metadata": {
            "source": sample['task_id']  # task_id lets us trace a hit back to the dataset
        },
        "embedding": embeddings.embed_query(content),
    })

# Upload the documents to the vector database (best-effort: report the error
# and continue so the retrieval demo below can still run against rows that
# were inserted previously).
try:
    response = (
        supabase.table("documents")
        .insert(doc_records)
        .execute()
    )
except Exception as exception:
    print("Error inserting data into Supabase:", exception)

# # Save the documents (a list of dict) into a csv file, and manually upload it to Supabase
# import pandas as pd
# df = pd.DataFrame(doc_records)
# df.to_csv('supabase_docs.csv',index=False)

# --- Retrieval over the uploaded vectors ------------------------------------
vector_store = SupabaseVectorStore(
    client=supabase,
    embedding=embeddings,
    table_name="documents",
    query_name="match_documents_langchain",
)
retriever = vector_store.as_retriever()

query = "On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?"
# matched_docs = vector_store.similarity_search(query, k=2)
docs = retriever.invoke(query)
# Show the top-ranked Document for the query: page_content embeds both the
# question and its final answer, and metadata['source'] carries the task_id.
docs[0]
# Tally every tool cited in the annotator metadata across all samples.
# Tool lines look like "1. Web browser" (see the sample printout above); the
# [2:] slice drops the "N. "-style list marker before lowercasing.
from collections import Counter

tools = []
for sample in json_QA:
    for raw in sample['Annotator Metadata']['Tools'].split('\n'):
        # Samples that needed no tools record the literal "None"; the blind
        # [2:] slice previously mangled that into a bogus "ne" entry
        # (it showed up as "ne: 9" in the tally) — skip it instead.
        if raw.strip().lower() == 'none':
            continue
        tool = raw[2:].strip().lower()
        if tool.startswith("("):
            # Drop a leading parenthesised qualifier, presumably "(optional) "
            # (11 chars) — TODO confirm this width against the raw metadata.
            tool = tool[11:].strip()
        tools.append(tool)

# Counter preserves insertion order, so the redundant OrderedDict wrapper is gone.
tools_counter = Counter(tools)
print("List of tools used in all samples:")
# len() gives the number of *distinct* tool names; the old label
# ("Total number of tools used") wrongly suggested total mentions.
print("Total number of distinct tools:", len(tools_counter))
for tool, count in tools_counter.items():
    print(f" ├── {tool}: {count}")
"nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.1" } }, "nbformat": 4, "nbformat_minor": 5 }