## Welcome to the Second Lab - Week 1, Day 3

Today we will work with lots of models! This is a way to get comfortable with APIs.

In [None]:
# Start with imports - ask ChatGPT to explain any package that you don't know

import os
import json
from dotenv import load_dotenv
from openai import OpenAI, AsyncOpenAI
from IPython.display import Markdown, display
import asyncio
from functools import partial

In [None]:
# Always remember to do this!
load_dotenv(override=True)

In [None]:
# Print the key prefixes to help with any debugging

# Read the provider keys from the environment.
openai_api_key = os.environ.get('OPENAI_API_KEY')
google_api_key = os.environ.get('GOOGLE_API_KEY')
groq_api_key = os.environ.get('GROQ_API_KEY')

# Report each key by a short prefix only; a missing key gets its own message.
print(
    f"OpenAI API Key exists and begins {openai_api_key[:8]}"
    if openai_api_key
    else "OpenAI API Key not set"
)

print(
    f"Google API Key exists and begins {google_api_key[:2]}"
    if google_api_key
    else "Google API Key not set (and this is optional)"
)

print(
    f"Groq API Key exists and begins {groq_api_key[:4]}"
    if groq_api_key
    else "Groq API Key not set (and this is optional)"
)

In [None]:
# Meta-prompt: ask a model to invent the question that will be put to every competitor.
request = (
    "Please come up with a challenging, nuanced question that I can ask a number of LLMs to evaluate their intelligence. "
    "Answer only with the question, no explanation."
)
messages = [dict(role="user", content=request)]

In [None]:
openai = AsyncOpenAI()
response = await openai.chat.completions.create(
    model="gpt-4o-mini",
    messages=messages,
)
question = response.choices[0].message.content
print(question)


In [None]:
# From here on the conversation sent to each model is just the generated question.
messages = [dict(role="user", content=question)]

In [None]:
from dataclasses import dataclass
from typing import Optional


@dataclass
class LLMResource:
    """Connection details for one model reached through an OpenAI-style client.

    Attributes:
        api_key: Credential passed to the client for this provider.
        model: Model identifier sent in the chat-completion request.
        url: Base URL of the provider's endpoint. ``None`` (the default)
            means no base URL is passed and the client uses its own default.
    """

    api_key: str
    model: str
    url: Optional[str] = None

# Providers to query. Entries without a key are dropped: the Google and Groq
# keys are optional, and passing api_key=None makes the OpenAI client fall back
# to OPENAI_API_KEY, which would send that credential to another provider's endpoint.
llm_resources = [
    resource
    for resource in (
        LLMResource(api_key=openai_api_key, model="gpt-4o-mini"),
        LLMResource(
            api_key=google_api_key,
            model="gemini-2.5-flash",
            url="https://generativelanguage.googleapis.com/v1beta/openai/",
        ),
        LLMResource(
            api_key=groq_api_key,
            model="qwen/qwen3-32b",
            url="https://api.groq.com/openai/v1",
        ),
        # Local Ollama server; the key is a fixed placeholder string.
        LLMResource(api_key="ollama", model="deepseek-r1:1.5b", url="http://localhost:11434/v1"),
    )
    if resource.api_key
]


In [None]:


async def llm_call(key, model_name, url, messages) -> tuple:
    """Send ``messages`` to one model and return its reply tagged with the model name.

    Args:
        key: API key for the provider.
        model_name: Model identifier to request.
        url: Base URL of the provider's endpoint, or ``None`` to use the
            client's default endpoint.
        messages: Chat messages passed unchanged to the chat-completions call.

    Returns:
        A ``(model_name, content)`` tuple, where ``content`` is the message
        content of the first choice in the response.
    """
    # base_url=None is the client's default, so one constructor call covers
    # both the "custom endpoint" and "default endpoint" cases.
    # The context manager closes the client's HTTP connections once the
    # request is done instead of leaving them open until garbage collection.
    async with AsyncOpenAI(base_url=url, api_key=key) as llm:
        response = await llm.chat.completions.create(
            model=model_name, messages=messages)

    return model_name, response.choices[0].message.content

# Pre-bind the current `messages` list so each call only needs (key, model_name, url).
llm_callable = partial(llm_call, messages=messages)

In [None]:
#gather all responses concurrently
tasks = [llm_callable(res.api_key,res.model,res.url) for res in llm_resources]
results =  await asyncio.gather(*tasks)
together =  [f'Response from competitor {model}:{answer}' for model,answer in results]#gather results once all model finish running


In [None]:
# Join the labelled responses with blank lines; interpolating the list itself
# would put its Python repr (brackets, quotes, escaped newlines) into the prompt.
responses_text = "\n\n".join(together)

# The prompt quotes `question` (what the competitors actually answered), not the
# meta-prompt in `request`. Competitors are identified by name because each
# response is labelled with its model name.
judge = f"""You are judging a competition between {len(together)} competitors.
Each model has been given this question:

{question}

Your job is to evaluate each response for clarity and strength of argument, and rank them in order of best to worst.
Respond with JSON, and only JSON, with the following format:
{{"results": ["best competitor name", "second best competitor name", "third best competitor name", ...]}}

Here are the responses from each competitor:

{responses_text}

Now respond with the JSON with the ranked order of the competitors name, nothing else. Do not include markdown formatting or code blocks."""

In [None]:
# Show the assembled judging prompt so it can be inspected.
print(judge)

In [None]:
# Wrap the judging prompt as a single user message.
judge_messages = [dict(role="user", content=judge)]

In [None]:
# Judgement time!

# The judge runs on the synchronous client, so this call is not awaited.
openai = OpenAI()
response = openai.chat.completions.create(messages=judge_messages, model="o3-mini")
results = response.choices[0].message.content
print(results)


In [None]:
# OK let's turn this into results!

# Decode the judge's JSON reply and print the ranking, best first.
results_dict = json.loads(results)
ranks = results_dict["results"]

for place, competitor in enumerate(ranks, start=1):
    print(f"Rank {place}: {competitor}")

<table style="margin: 0; text-align: left; width:100%">
    <tr>
        <td style="width: 150px; height: 150px; vertical-align: middle;">
            <img src="../assets/exercise.png" width="150" height="150" style="display: block;" />
        </td>
        <td>
            <h2 style="color:#ff7800;">Exercise</h2>
            <span style="color:#ff7800;">Which pattern(s) did this use? Try updating this to add another Agentic design pattern.
            </span>
        </td>
    </tr>
</table>

<table style="margin: 0; text-align: left; width:100%">
    <tr>
        <td style="width: 150px; height: 150px; vertical-align: middle;">
            <img src="../assets/business.png" width="150" height="150" style="display: block;" />
        </td>
        <td>
            <h2 style="color:#00bfff;">Commercial implications</h2>
            <span style="color:#00bfff;">These kinds of patterns - to send a task to multiple models, and evaluate results,
            are common where you need to improve the quality of your LLM response. This approach can be universally applied
            to business projects where accuracy is critical.
            </span>
        </td>
    </tr>
</table>