{ "cells": [
 { "cell_type": "code", "execution_count": 1, "id": "b9471aa1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "from dotenv import load_dotenv\n", "from openai import OpenAI\n", "from IPython.display import Markdown, display\n", "\n", "load_dotenv(override=True)" ] },
 { "cell_type": "code", "execution_count": 2, "id": "ff4eb891", "metadata": {}, "outputs": [], "source": [ "openai_api_key = os.getenv('OPENAI_API_KEY')\n", "google_api_key = os.getenv('GOOGLE_API_KEY')\n", "deepseek_api_key = os.getenv('DEEPSEEK_API_KEY')\n", "groq_api_key = os.getenv('GROQ_API_KEY')\n", "\n", "challenge_question_prompt = \"\"\"Please come up with a challenging, nuanced question that I can ask a number of LLMs to evaluate their intelligence.\n", "Answer only with the question, no explanation.\"\"\"" ] },
 { "cell_type": "code", "execution_count": 3, "id": "94877c65", "metadata": {}, "outputs": [], "source": [ "def generate_challenge_question(prompt):\n", "    messages = [\n", "        {\"role\": \"user\", \"content\": prompt}\n", "    ]\n", "\n", "    # Ask gpt-4o-mini to invent the question every competitor will answer\n", "    question = OpenAI(api_key=openai_api_key).chat.completions.create(\n", "        model=\"gpt-4o-mini\",\n", "        messages=messages\n", "    ).choices[0].message.content\n", "\n", "    display(Markdown(question))\n", "    return question" ] },
 { "cell_type": "code", "execution_count": 4, "id": "8631a755", "metadata": {}, "outputs": [], "source": [ "models = [\"gpt-4o-mini\", \"deepseek-chat\", \"gemini-2.0-flash\", \"llama-3.3-70b-versatile\"]\n", "api_urls = [\n", "    \"https://api.openai.com/v1/\",\n", "    \"https://api.deepseek.com/v1\",\n", "    \"https://generativelanguage.googleapis.com/v1beta/openai/\",\n", "    \"https://api.groq.com/openai/v1\",\n", "]\n", "api_keys = [openai_api_key, deepseek_api_key, google_api_key, groq_api_key]" ] },
 { "cell_type": "code", "execution_count": 5, "id": "ddcdbfb1", "metadata": {}, "outputs": [], "source": [ "# One slot per model so the answers keep the same order as `models`,\n", "# even when the requests run concurrently\n", "answers = [None] * len(models)\n", "\n", "def answer_challenge_question(index, model, url, api_key, question):\n", "    messages = [{\"role\": \"user\", \"content\": question}]\n", "    answer = OpenAI(api_key=api_key, base_url=url).chat.completions.create(\n", "        model=model,\n", "        messages=messages\n", "    ).choices[0].message.content\n", "    answers[index] = answer\n" ] },
 { "cell_type": "code", "execution_count": 6, "id": "97807e26", "metadata": {}, "outputs": [], "source": [ "import threading\n", "\n", "def ask_question_to_llms(question):\n", "    # Start every request first, then join, so the models answer in parallel;\n", "    # joining inside the start loop would serialize the calls\n", "    threads = []\n", "    for index in range(len(models)):\n", "        thread = threading.Thread(\n", "            target=answer_challenge_question,\n", "            args=(index, models[index], api_urls[index], api_keys[index], question),\n", "        )\n", "        thread.start()\n", "        threads.append(thread)\n", "    for thread in threads:\n", "        thread.join()" ] },
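 { "cell_type": "markdown", "id": "f1e2d3c4", "metadata": {}, "source": [ "As a side note, here is an optional sketch of the same fan-out using `concurrent.futures.ThreadPoolExecutor`. `executor.map` returns results in input order, so no index bookkeeping is needed. The hypothetical `ask_question_with_pool` below is not used by the rest of the notebook." ] },
 { "cell_type": "code", "execution_count": null, "id": "c4d5e6f7", "metadata": {}, "outputs": [], "source": [ "# Optional alternative to the threading cell above (a sketch, not used below).\n", "# executor.map yields results in the same order as `models`, so the answers\n", "# line up with the model names without manual index bookkeeping.\n", "from concurrent.futures import ThreadPoolExecutor\n", "\n", "def ask_question_with_pool(question):\n", "    def ask(model, url, api_key):\n", "        messages = [{\"role\": \"user\", \"content\": question}]\n", "        return OpenAI(api_key=api_key, base_url=url).chat.completions.create(\n", "            model=model,\n", "            messages=messages\n", "        ).choices[0].message.content\n", "\n", "    with ThreadPoolExecutor(max_workers=len(models)) as executor:\n", "        return list(executor.map(ask, models, api_urls, api_keys))" ] },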
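 { "cell_type": "markdown", "id": "a9b8c7d6", "metadata": {}, "source": [ "The judge cell below asks for bare JSON, so its reply can be decoded directly. `parse_ranking` is a small hypothetical helper, not part of the original flow; it assumes the judge obeyed the \"JSON only\" instruction and returns the ranked list of model names." ] },
 { "cell_type": "code", "execution_count": null, "id": "e5f6a7b8", "metadata": {}, "outputs": [], "source": [ "import json\n", "\n", "def parse_ranking(judge_json):\n", "    # Hypothetical helper: decode the judge's {\"results\": [...]} reply.\n", "    # Assumes the judge followed the \"JSON only\" instruction; a malformed\n", "    # reply will raise json.JSONDecodeError.\n", "    ranked = json.loads(judge_json)[\"results\"]\n", "    for position, model in enumerate(ranked, start=1):\n", "        print(f\"{position}. {model}\")\n", "    return ranked\n", "\n", "# Usage once the final cell has run, e.g.:\n", "# parse_ranking(ranking_json)" ] },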
 { "cell_type": "code", "execution_count": 7, "id": "aebed0c9", "metadata": {}, "outputs": [], "source": [ "def judge_llms(question, answers):\n", "    results = ''\n", "    for index, answer in enumerate(answers):\n", "        results += f\"Response from competitor model: {models[index]}\\n\\n\"\n", "        results += answer + \"\\n\\n\"\n", "\n", "    judge_prompt = f\"\"\"You are judging a competition between {len(models)} competitors.\n", "Each model has been given this question:\n", "\n", "{question}\n", "\n", "Your job is to evaluate each response for clarity and strength of argument, and rank them in order of best to worst.\n", "Respond with JSON, and only JSON, with the following format:\n", "{{\"results\": [\"best competitor model\", \"second best competitor model\", \"third best competitor model\", ...]}}\n", "\n", "Here are the responses from each competitor:\n", "\n", "{results}\n", "\n", "Now respond with the JSON with the ranked order of the competitors, nothing else. Do not include markdown formatting or code blocks.\"\"\"\n", "\n", "    display(Markdown(judge_prompt))\n", "\n", "    messages = [{\"role\": \"user\", \"content\": judge_prompt}]\n", "    judge = OpenAI(api_key=openai_api_key).chat.completions.create(\n", "        model=\"o3-mini\",\n", "        messages=messages\n", "    ).choices[0].message.content\n", "    display(Markdown(judge))\n", "    return judge" ] },
 { "cell_type": "code", "execution_count": null, "id": "d73b6507", "metadata": {}, "outputs": [], "source": [ "challenge_question = generate_challenge_question(challenge_question_prompt)\n", "ask_question_to_llms(challenge_question)\n", "ranking_json = judge_llms(question=challenge_question, answers=answers)" ] }
 ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.11" } }, "nbformat": 4, "nbformat_minor": 5 }