{ "cells": [
 { "cell_type": "code", "execution_count": 1, "id": "b9471aa1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "from dotenv import load_dotenv\n", "from openai import OpenAI\n", "from IPython.display import Markdown, display\n", "\n", "load_dotenv(override=True)" ] },
 { "cell_type": "code", "execution_count": 2, "id": "ff4eb891", "metadata": {}, "outputs": [], "source": [ "openai_api_key = os.getenv('OPENAI_API_KEY')\n", "google_api_key = os.getenv('GOOGLE_API_KEY')\n", "deepseek_api_key = os.getenv('DEEPSEEK_API_KEY')\n", "groq_api_key = os.getenv('GROQ_API_KEY')\n", "\n", "challenge_question_prompt = \"\"\"Please come up with a challenging, nuanced question that I can ask a number of LLMs to evaluate their intelligence.\n", "Answer only with the question, no explanation.\"\"\"" ] },
 { "cell_type": "code", "execution_count": 3, "id": "94877c65", "metadata": {}, "outputs": [], "source": [ "def generate_challenge_question(prompt):\n", "    messages = [\n", "        {\"role\": \"user\", \"content\": prompt}\n", "    ]\n", "\n", "    # Ask gpt-4o-mini to invent the question every competitor will answer\n", "    question = OpenAI(api_key=openai_api_key).chat.completions.create(\n", "        model=\"gpt-4o-mini\",\n", "        messages=messages\n", "    ).choices[0].message.content\n", "\n", "    display(Markdown(question))\n", "    return question" ] },
 { "cell_type": "code", "execution_count": 4, "id": "8631a755", "metadata": {}, "outputs": [], "source": [ "models = [\"gpt-4o-mini\", \"deepseek-chat\", \"gemini-2.0-flash\", \"llama-3.3-70b-versatile\"]\n", "api_urls = [\n", "    \"https://api.openai.com/v1/\",\n", "    \"https://api.deepseek.com/v1\",\n", "    \"https://generativelanguage.googleapis.com/v1beta/openai/\",\n", "    \"https://api.groq.com/openai/v1\",\n", "]\n", "api_keys = [openai_api_key, deepseek_api_key, google_api_key, groq_api_key]" ] },
 { "cell_type": "code", "execution_count": 5, "id": "ddcdbfb1", "metadata": {}, "outputs": [], "source": [ "# One slot per model so the answers keep the same order as `models`,\n", "# even when the requests run concurrently\n", "answers = [None] * len(models)\n", "\n", "def answer_challenge_question(index, model, url, api_key, question):\n", "    messages = [{\"role\": \"user\", \"content\": question}]\n", "    answer = OpenAI(api_key=api_key, base_url=url).chat.completions.create(\n", "        model=model,\n", "        messages=messages\n", "    ).choices[0].message.content\n", "    answers[index] = answer\n" ] },
 { "cell_type": "code", "execution_count": 6, "id": "97807e26", "metadata": {}, "outputs": [], "source": [ "import threading\n", "\n", "def ask_question_to_llms(question):\n", "    # Start every request first, then join, so the models answer in parallel;\n", "    # joining inside the start loop would serialize the calls\n", "    threads = []\n", "    for index in range(len(models)):\n", "        thread = threading.Thread(\n", "            target=answer_challenge_question,\n", "            args=(index, models[index], api_urls[index], api_keys[index], question),\n", "        )\n", "        thread.start()\n", "        threads.append(thread)\n", "    for thread in threads:\n", "        thread.join()" ] },
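 { "cell_type": "markdown", "id": "f1e2d3c4", "metadata": {}, "source": [ "As a side note, here is an optional sketch of the same fan-out using `concurrent.futures.ThreadPoolExecutor`. `executor.map` returns results in input order, so no index bookkeeping is needed. The hypothetical `ask_question_with_pool` below is not used by the rest of the notebook." ] },
 { "cell_type": "code", "execution_count": null, "id": "c4d5e6f7", "metadata": {}, "outputs": [], "source": [ "# Optional alternative to the threading cell above (a sketch, not used below).\n", "# executor.map yields results in the same order as `models`, so the answers\n", "# line up with the model names without manual index bookkeeping.\n", "from concurrent.futures import ThreadPoolExecutor\n", "\n", "def ask_question_with_pool(question):\n", "    def ask(model, url, api_key):\n", "        messages = [{\"role\": \"user\", \"content\": question}]\n", "        return OpenAI(api_key=api_key, base_url=url).chat.completions.create(\n", "            model=model,\n", "            messages=messages\n", "        ).choices[0].message.content\n", "\n", "    with ThreadPoolExecutor(max_workers=len(models)) as executor:\n", "        return list(executor.map(ask, models, api_urls, api_keys))" ] },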
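 { "cell_type": "markdown", "id": "a9b8c7d6", "metadata": {}, "source": [ "The judge cell below asks for bare JSON, so its reply can be decoded directly. `parse_ranking` is a small hypothetical helper, not part of the original flow; it assumes the judge obeyed the \"JSON only\" instruction and returns the ranked list of model names." ] },
 { "cell_type": "code", "execution_count": null, "id": "e5f6a7b8", "metadata": {}, "outputs": [], "source": [ "import json\n", "\n", "def parse_ranking(judge_json):\n", "    # Hypothetical helper: decode the judge's {\"results\": [...]} reply.\n", "    # Assumes the judge followed the \"JSON only\" instruction; a malformed\n", "    # reply will raise json.JSONDecodeError.\n", "    ranked = json.loads(judge_json)[\"results\"]\n", "    for position, model in enumerate(ranked, start=1):\n", "        print(f\"{position}. {model}\")\n", "    return ranked\n", "\n", "# Usage once the final cell has run, e.g.:\n", "# parse_ranking(ranking_json)" ] },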
 { "cell_type": "code", "execution_count": 7, "id": "aebed0c9", "metadata": {}, "outputs": [], "source": [ "def judge_llms(question, answers):\n", "    results = ''\n", "    for index, answer in enumerate(answers):\n", "        results += f\"Response from competitor model: {models[index]}\\n\\n\"\n", "        results += answer + \"\\n\\n\"\n", "\n", "    judge_prompt = f\"\"\"You are judging a competition between {len(models)} competitors.\n", "Each model has been given this question:\n", "\n", "{question}\n", "\n", "Your job is to evaluate each response for clarity and strength of argument, and rank them in order of best to worst.\n", "Respond with JSON, and only JSON, with the following format:\n", "{{\"results\": [\"best competitor model\", \"second best competitor model\", \"third best competitor model\", ...]}}\n", "\n", "Here are the responses from each competitor:\n", "\n", "{results}\n", "\n", "Now respond with the JSON with the ranked order of the competitors, nothing else. Do not include markdown formatting or code blocks.\"\"\"\n", "\n", "    display(Markdown(judge_prompt))\n", "\n", "    messages = [{\"role\": \"user\", \"content\": judge_prompt}]\n", "    judge = OpenAI(api_key=openai_api_key).chat.completions.create(\n", "        model=\"o3-mini\",\n", "        messages=messages\n", "    ).choices[0].message.content\n", "    display(Markdown(judge))\n", "    return judge" ] },
 { "cell_type": "code", "execution_count": null, "id": "d73b6507", "metadata": {}, "outputs": [], "source": [ "challenge_question = generate_challenge_question(challenge_question_prompt)\n", "ask_question_to_llms(challenge_question)\n", "ranking_json = judge_llms(question=challenge_question, answers=answers)" ] }
 ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.11" } }, "nbformat": 4, "nbformat_minor": 5 }