{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "0", "metadata": {}, "outputs": [], "source": [ "import json\n", "\n", "def load_data(file_path):\n", " \"\"\"Loads data from a JSONL file.\"\"\"\n", " data = []\n", " with open(file_path, 'r', encoding='utf-8') as f:\n", " for line in f:\n", " data.append(json.loads(line))\n", " return data\n", "\n", "def search_and_print_task(task_id, data):\n", " \"\"\"\n", " Searches for a task by its ID and prints it in a formatted way.\n", " \"\"\"\n", " found_task = None\n", " for sample in data:\n", " if sample['task_id'] == task_id:\n", " found_task = sample\n", " break\n", "\n", " if not found_task:\n", " print(f\"Task with ID '{task_id}' not found.\")\n", " return\n", "\n", " print(\"=\" * 50)\n", " print(f\"Task ID: {found_task.get('task_id', 'N/A')}\")\n", " print(f\"Question: {found_task.get('Question', 'N/A')}\")\n", " print(f\"Level: {found_task.get('Level', 'N/A')}\")\n", " print(f\"Final Answer: {found_task.get('Final answer', 'N/A')}\")\n", "\n", " metadata = found_task.get('Annotator Metadata', {})\n", " if metadata:\n", " print(f\"Annotator Metadata: \")\n", " \n", " steps = metadata.get('Steps')\n", " if steps:\n", " print(f\" ├── Steps: \")\n", " for step in steps.split('\\\\n'):\n", " print(f\" │ ├── {step}\")\n", " \n", " num_steps = metadata.get('Number of steps')\n", " if num_steps is not None:\n", " print(f\" ├── Number of steps: {num_steps}\")\n", " \n", " duration = metadata.get('How long did this take?')\n", " if duration:\n", " print(f\" ├── How long did this take?: {duration}\")\n", " \n", " tools = metadata.get('Tools')\n", " if tools:\n", " print(f\" ├── Tools:\")\n", " for tool in tools.split('\\\\n'):\n", " print(f\" │ ├── {tool}\")\n", "\n", " num_tools = metadata.get('Number of tools')\n", " if num_tools is not None:\n", " print(f\" └── Number of tools: {num_tools}\")\n", " \n", " print(\"=\" * 50)\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "1", "metadata": {}, "outputs": [], "source": [ "# 1. Load the data\n", "json_QA = load_data('metadata.jsonl')\n" ] }, { "cell_type": "code", "execution_count": null, "id": "2", "metadata": {}, "outputs": [], "source": [ "\n", "# 2. Choose a task_id to search for. \n", "# I'll use the first one from the file as an example.\n", "example_task_id = \"8e867cd7-cff9-4e6c-867a-ff5ddc2550be\"\n", "\n", "# 3. Call the function with the task_id\n", "search_and_print_task(example_task_id, json_QA)" ] }, { "cell_type": "code", "execution_count": null, "id": "3", "metadata": {}, "outputs": [], "source": [ "import json \n", "with open('metadata.jsonl', 'r') as f: \n", " json_list = list(f)\n", "\n", "json_QA = []\n", "for json_str in json_list: \n", " json_data = json.loads(json_str)\n", " json_QA.append(json_data)" ] }, { "cell_type": "code", "execution_count": null, "id": "4", "metadata": {}, "outputs": [], "source": [ "# import specific question" ] }, { "cell_type": "code", "execution_count": null, "id": "5", "metadata": {}, "outputs": [], "source": [ "import random\n", "random_samples = random.sample(json_QA, 2)\n", "for sample in random_samples:\n", " print(\"=\" * 50)\n", " print(f\"Task ID: {sample['task_id']}\")\n", " print(f\"Question: {sample['Question']}\")\n", " print(f\"Level: {sample['Level']}\")\n", " print(f\"Final Answer: {sample['Final answer']}\")\n", " print(f\"Annotator Metadata: \")\n", " print(f\" ├── Steps: \")\n", " for step in sample['Annotator Metadata']['Steps'].split('\\n'):\n", " print(f\" │ ├── {step}\")\n", " print(f\" ├── Number of steps: {sample['Annotator Metadata']['Number of steps']}\")\n", " print(f\" ├── How long did this take?: {sample['Annotator Metadata']['How long did this take?']}\")\n", " print(f\" ├── Tools:\")\n", " for tool in sample['Annotator Metadata']['Tools'].split('\\n'):\n", " print(f\" │ ├── {tool}\")\n", " print(f\" └── Number of tools: {sample['Annotator Metadata']['Number of tools']}\")\n", "print(\"=\" * 50)" ] }, { "cell_type": "code", "execution_count": null, "id": "6", "metadata": {}, "outputs": [], "source": [ "import os\n", "from dotenv import load_dotenv\n", "from langchain_huggingface import HuggingFaceEmbeddings\n", "from langchain_community.vectorstores import SupabaseVectorStore\n", "from supabase.client import Client, create_client\n", "\n", "\n", "load_dotenv()\n", "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\") # dim=768\n", "\n", "supabase_url = os.environ.get(\"SUPABASE_URL\")\n", "supabase_key = os.environ.get(\"SUPABASE_SERVICE_ROLE_KEY\")\n", "supabase: Client = create_client(supabase_url, supabase_key)" ] }, { "cell_type": "code", "execution_count": null, "id": "7", "metadata": {}, "outputs": [], "source": [ "from langchain.schema import Document\n", "docs = []\n", "cnt = 0 \n", "for sample in json_QA:\n", " content = f\"Question : {sample['Question']}\\n\\nFinal answer : {sample['Final answer']}\"\n", " doc = {\n", " \"id\" : cnt,\n", " \"content\" : content,\n", " \"metadata\" : {\n", " \"source\" : sample['task_id']\n", " },\n", " \"embedding\" : embeddings.embed_query(content),\n", " }\n", " docs.append(doc)\n", " cnt += 1\n", "\n", "# upload the documents to the vector database\n", "try:\n", " response = (\n", " supabase.table(\"documents2\")\n", " .insert(docs)\n", " .execute()\n", " )\n", "except Exception as exception:\n", " print(\"Error inserting data into Supabase:\", exception)\n", "\n", "# # Save the documents (a list of dict) into a csv file, and manually upload it to Supabase\n", "# import pandas as pd\n", "# df = pd.DataFrame(docs)\n", "# df.to_csv('supabase_docs.csv',index=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "8", "metadata": {}, "outputs": [], "source": [ "# add items to vector database\n", "vector_store = SupabaseVectorStore(\n", " client=supabase,\n", " embedding= embeddings,\n", " table_name=\"documents2\",\n", " query_name=\"match_documents_2\",\n", ")\n", "retriever = vector_store.as_retriever()" ] }, { "cell_type": "code", "execution_count": null, "id": "9", "metadata": {}, "outputs": [], "source": [ "query = \"On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?\"\n", "# matched_docs = vector_store.similarity_search(query, k=2)\n", "docs = retriever.invoke(query)" ] }, { "cell_type": "code", "execution_count": null, "id": "10", "metadata": {}, "outputs": [], "source": [ "docs[0]" ] }, { "cell_type": "code", "execution_count": null, "id": "11", "metadata": {}, "outputs": [], "source": [ "# list of the tools used in all the samples\n", "from collections import Counter, OrderedDict\n", "\n", "tools = []\n", "for sample in json_QA:\n", " for tool in sample['Annotator Metadata']['Tools'].split('\\n'):\n", " tool = tool[2:].strip().lower()\n", " if tool.startswith(\"(\"):\n", " tool = tool[11:].strip()\n", " tools.append(tool)\n", "tools_counter = OrderedDict(Counter(tools))\n", "print(\"List of tools used in all samples:\")\n", "print(\"Total number of tools used:\", len(tools_counter))\n", "for tool, count in tools_counter.items():\n", " print(f\" ├── {tool}: {count}\")" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" } }, "nbformat": 4, "nbformat_minor": 5 }