Claude AI
Update to Llama 3.1 405B and improve UI
ce91b95
import gradio as gr
import json
import time
from typing import Dict, Tuple, List
from bertmodel import predict_label
# from ecologits import EcoLogits # Removed - using OpenRouter instead
# from openai import OpenAI # Removed - using OpenRouter instead
from dotenv import load_dotenv
import os
import requests
import json
# Set environment variable to suppress tokenizers warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"
load_dotenv()
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY", "")
# Model configurations with energy consumption and cost estimates
MODEL_CONFIGS = {
"large": {
"name": "Llama 3.1 405B",
"energy_per_token": 0.238, # Wh per token (11.9 Wh / 50 tokens)
"cost_per_input_token": 0.000003, # $3/M tokens (OpenRouter pricing)
"cost_per_output_token": 0.000003, # $3/M tokens (OpenRouter pricing)
"icon": "πŸ¦™"
},
"small": {
"name": "Mistral Small 24B",
"energy_per_token": 0.00596, # Wh per token (0.298 Wh / 50 tokens)
"cost_per_input_token": 0.00000005, # $0.05/M tokens
"cost_per_output_token": 0.00000012, # $0.12/M tokens
"icon": "⚑"
}
}
class ModelRouter:
def __init__(self):
self.routing_history = []
print("[INIT] ModelRouter initialized")
def classify_prompt(self, prompt: str) -> str:
print(f"\n[CLASSIFY] Classifying prompt: '{prompt[:50]}...'")
label = predict_label(prompt)
print(f"[CLASSIFY] ModernBERT returned label: '{label}'")
return label
def select_model(self, prompt: str) -> str:
"""Select the most efficient model based on prompt classification."""
prompt_type = self.classify_prompt(prompt)
# Normalize
key = prompt_type.strip().lower()
print(f"[SELECT] Normalized label: '{key}'")
# Map normalized labels to actual MODEL_CONFIGS keys
if "small" in key:
print(f"[SELECT] Selected: SMALL model (Mistral Small 24B)")
return "small"
else:
print(f"[SELECT] Selected: LARGE model (Claude Opus 4)")
return "large"
def estimate_tokens(self,
prompt: str,
response: str | None = None,
max_response_tokens: int | None = None) -> int:
"""
Estimate total token count: exact prompt tokens +
a target number of response tokens.
"""
# Simple estimation: 4 characters = 1 token
prompt_tokens = len(prompt) // 4
print(f"[TOKENS] Prompt tokens: {prompt_tokens} (from {len(prompt)} chars)")
if response is not None:
response_tokens = len(response) // 4
elif max_response_tokens is not None:
# you’re reserving this many tokens for the model’s reply
response_tokens = max_response_tokens
else:
# Estimate response will be similar length to prompt
response_tokens = prompt_tokens
total_tokens = prompt_tokens + response_tokens
print(f"[TOKENS] Response tokens: {response_tokens}, Total: {total_tokens}")
return total_tokens
def estimate_large_model_energy(self, tokens: int) -> float:
"""
Estimate large model energy consumption based on tokens.
Using empirical estimates for energy consumption.
"""
large_config = MODEL_CONFIGS["large"]
return tokens * large_config["energy_per_token"]
def calculate_savings(self, selected_model: str, prompt: str, response: str = None) -> Dict:
"""Calculate energy and cost savings compared to using the large model"""
print(f"[SAVINGS] Calculating for model: {selected_model}")
# Calculate input and output tokens separately
input_tokens = max(1, len(prompt) // 4) # Minimum 1 token
if response:
# Use actual response length if available
output_tokens = max(1, len(response) // 4)
else:
# Estimate if no response yet (for preview)
output_tokens = max(10, input_tokens) # Assume at least 10 tokens response
total_tokens = input_tokens + output_tokens
print(f"[SAVINGS] Input tokens: {input_tokens}, Output tokens: {output_tokens}")
selected_config = MODEL_CONFIGS[selected_model]
large_config = MODEL_CONFIGS["large"]
# Calculate actual usage
actual_energy = total_tokens * selected_config["energy_per_token"]
actual_cost = (input_tokens * selected_config["cost_per_input_token"] +
output_tokens * selected_config["cost_per_output_token"])
# Calculate large model usage
large_energy = self.estimate_large_model_energy(total_tokens)
large_cost = (input_tokens * large_config["cost_per_input_token"] +
output_tokens * large_config["cost_per_output_token"])
# Calculate savings (only positive if small model is selected)
if selected_model == "small":
energy_saved = large_energy - actual_energy
cost_saved = large_cost - actual_cost
energy_saved_percent = (energy_saved / large_energy) * 100 if large_energy > 0 else 0
cost_saved_percent = (cost_saved / large_cost) * 100 if large_cost > 0 else 0
else:
# No savings if using the large model
energy_saved = 0
cost_saved = 0
energy_saved_percent = 0
cost_saved_percent = 0
print(f"[SAVINGS] Selected: {selected_model}")
print(f"[SAVINGS] Actual energy: {actual_energy:.4f} Wh, Large energy: {large_energy:.4f} Wh")
print(f"[SAVINGS] Actual cost: ${actual_cost:.8f}, Large cost: ${large_cost:.8f}")
print(f"[SAVINGS] Energy saved: {energy_saved:.4f} Wh ({energy_saved_percent:.1f}%)")
print(f"[SAVINGS] Cost saved: ${cost_saved:.8f} ({cost_saved_percent:.1f}%)")
return {
"selected_model": selected_config["name"],
"tokens": total_tokens,
"actual_energy": actual_energy,
"actual_cost": actual_cost,
"large_energy": large_energy,
"large_cost": large_cost,
"energy_saved": energy_saved,
"cost_saved": cost_saved,
"energy_saved_percent": energy_saved_percent,
"cost_saved_percent": cost_saved_percent,
"is_large_model": selected_model == "large" # Add flag for template
}
print("[STARTUP] Initializing ModelRouter...")
router = ModelRouter()
print("[STARTUP] ModelRouter ready")
print(f"[STARTUP] Available models: {list(MODEL_CONFIGS.keys())}")
print(f"[STARTUP] OpenRouter API Key: {'SET' if OPENROUTER_API_KEY else 'NOT SET'}")
def process_message(message: str, history: List[List[str]]) -> Tuple[str, str, str]:
"""Process the user message and return response with savings info"""
print(f"\n{'='*60}")
print(f"[PROCESS] New message received: '{message[:100]}...'")
# Route to appropriate model
selected_model = router.select_model(message)
model_config = MODEL_CONFIGS[selected_model]
print(f"[PROCESS] Using model config: {model_config['name']}")
# Initial savings estimate (will be recalculated after getting response)
print(f"[PROCESS] Calculating initial savings estimate...")
initial_savings = router.calculate_savings(selected_model, message)
print(f"[PROCESS] Initial estimate: {initial_savings['energy_saved_percent']:.1f}% energy, {initial_savings['cost_saved_percent']:.1f}% cost")
open_router_model_dict = {
"large": "meta-llama/llama-3.1-405b-instruct",
"small": "mistralai/mistral-small-24b-instruct-2501"
}
# Check if API key is available
if not OPENROUTER_API_KEY:
print(f"[API] No OpenRouter API key found - running in DEMO MODE")
answer = f"[Demo Mode] This would be a response from {model_config['name']} to: {message[:50]}..."
else:
print(f"[API] OpenRouter API key found: {OPENROUTER_API_KEY[:10]}...")
try:
model_id = open_router_model_dict[selected_model]
print(f"[API] Calling OpenRouter with model: {model_id}")
request_data = {
"model": model_id,
"messages": [
{
"role": "user",
"content": message
}
]
}
print(f"[API] Request data: {json.dumps(request_data, indent=2)[:200]}...")
response = requests.post(
url="https://openrouter.ai/api/v1/chat/completions",
headers={
"Authorization": f"Bearer {OPENROUTER_API_KEY}",
"Content-Type": "application/json"
},
data=json.dumps(request_data)
)
# Debug: print response status and content
print(f"[API] Response Status Code: {response.status_code}")
print(f"[API] Response Headers: {dict(response.headers)}")
if response.status_code != 200:
print(f"[API ERROR] Full response: {response.text}")
answer = f"[API Error {response.status_code}] {response.text[:200]}..."
else:
data = response.json()
print(f"[API] Response keys: {list(data.keys())}")
if "choices" in data and len(data["choices"]) > 0:
answer = data["choices"][0]["message"]["content"]
print(f"[API] Successfully got response: {answer[:100]}...")
else:
print(f"[API ERROR] Unexpected response format: {json.dumps(data, indent=2)}")
answer = f"[Error] Unexpected response format from OpenRouter API"
except Exception as e:
print(f"[API EXCEPTION] Error type: {type(e).__name__}")
print(f"[API EXCEPTION] Error message: {str(e)}")
import traceback
print(f"[API EXCEPTION] Traceback:\n{traceback.format_exc()}")
answer = f"[Error] Failed to get response from {model_config['name']}. Error: {str(e)}"
# Recalculate savings with actual response
print(f"[PROCESS] Recalculating savings with actual response...")
savings = router.calculate_savings(selected_model, message, answer)
print(f"[PROCESS] Final savings: {savings['energy_saved_percent']:.1f}% energy, {savings['cost_saved_percent']:.1f}% cost")
# Format the response with model info
response = f"{answer}\n\n<div style='background: #f0f9ff; border-left: 3px solid #0ea5e9; padding: 8px 12px; margin-top: 10px; border-radius: 4px;'><small style='color: #0369a1; font-weight: 500;'>{model_config['icon']} Answered by {model_config['name']}</small></div>"
# Format model info
model_info = f"""
<div style="background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%); padding: 20px; border-radius: 12px; margin-bottom: 20px;">
<div style="display: flex; align-items: center; margin-bottom: 10px;">
<span style="font-size: 2em; margin-right: 10px;">{model_config['icon']}</span>
<h3 style="margin: 0; color: #2c3e50;">{model_config['name']}</h3>
</div>
<p style="color: #5a6c7d; margin: 5px 0;">Optimal model selected for your query</p>
</div>
"""
# Format savings information with conditional display based on model
if savings['is_large_model']:
# Show actual consumption for large model with warning colors
savings_info = f"""
<div style="background: #ffffff; border: 1px solid #fed7aa; border-radius: 12px; padding: 20px;">
<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px;">
<div>
<p style="color: #8795a1; margin: 0; font-size: 0.9em;">πŸ”₯ Energy Consumption</p>
<p style="color: #ea580c; font-size: 1.5em; font-weight: bold; margin: 5px 0;">
{savings['actual_energy']:.1f} Wh
</p>
<p style="color: #7c2d12; font-size: 0.85em; margin: 0;">
High energy usage
</p>
</div>
<div>
<p style="color: #8795a1; margin: 0; font-size: 0.9em;">πŸ’Έ Cost Impact</p>
<p style="color: #dc2626; font-size: 1.5em; font-weight: bold; margin: 5px 0;">
${savings['actual_cost']:.6f}
</p>
<p style="color: #991b1b; font-size: 0.85em; margin: 0;">
Premium pricing
</p>
</div>
</div>
</div>
"""
else:
# Show savings for small model with positive colors
savings_info = f"""
<div style="background: #ffffff; border: 1px solid #e1e8ed; border-radius: 12px; padding: 20px;">
<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px;">
<div>
<p style="color: #8795a1; margin: 0; font-size: 0.9em;">⚑ Energy Efficiency</p>
<p style="color: #22c55e; font-size: 1.5em; font-weight: bold; margin: 5px 0;">
{savings['energy_saved_percent']:.1f}% saved
</p>
<p style="color: #5a6c7d; font-size: 0.85em; margin: 0;">
{savings['energy_saved']:.1f} Wh reduction
</p>
<p style="color: #8795a1; font-size: 0.75em; margin: 3px 0 0 0; font-style: italic;">
vs. using large model
</p>
</div>
<div>
<p style="color: #8795a1; margin: 0; font-size: 0.9em;">πŸ’° Cost Optimization</p>
<p style="color: #3b82f6; font-size: 1.5em; font-weight: bold; margin: 5px 0;">
{savings['cost_saved_percent']:.1f}% saved
</p>
<p style="color: #5a6c7d; font-size: 0.85em; margin: 0;">
${savings['cost_saved']:.8f} reduction
</p>
<p style="color: #8795a1; font-size: 0.75em; margin: 3px 0 0 0; font-style: italic;">
vs. using large model
</p>
</div>
</div>
</div>
"""
# Add to routing history
router.routing_history.append({
"timestamp": time.time(),
"prompt": message,
"model": selected_model,
"savings": savings
})
print(f"[PROCESS] Response formatted, returning to UI")
print(f"{'='*60}\n")
return response, model_info, savings_info
def get_statistics() -> str:
"""Get cumulative statistics from routing history"""
if not router.routing_history:
return """
<div style="background: #f8fafc; border-radius: 12px; padding: 30px; text-align: center; color: #64748b;">
<p style="margin: 0;">No queries processed yet</p>
<p style="margin: 10px 0 0 0; font-size: 0.9em;">πŸ’¬ Start a conversation to see your impact metrics</p>
</div>
"""
total_queries = len(router.routing_history)
# Calculate user's total savings
user_total_energy_saved = sum(entry["savings"]["energy_saved"] for entry in router.routing_history)
user_total_cost_saved = sum(entry["savings"]["cost_saved"] for entry in router.routing_history)
# Count how many times each model was used
small_model_count = sum(1 for entry in router.routing_history if entry["model"] == "small")
large_model_count = sum(1 for entry in router.routing_history if entry["model"] == "large")
stats = f"""
<div style="background: #ffffff; border: 1px solid #e2e8f0; border-radius: 12px; padding: 25px;">
<div style="text-align: center; margin-bottom: 20px;">
<h4 style="color: #1e293b; font-size: 1.1em; margin: 0; font-weight: 600;">🌍 Your Total Impact</h4>
</div>
<div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 15px; margin-bottom: 15px;">
<div style="background: #f0fdf4; border-radius: 8px; padding: 15px; text-align: center;">
<p style="color: #166534; font-size: 0.9em; margin: 0;">🌱 Energy Saved</p>
<p style="color: #15803d; font-size: 1.5em; font-weight: bold; margin: 5px 0;">
{user_total_energy_saved:.1f}
</p>
<p style="color: #166534; font-size: 0.8em; margin: 0;">Wh</p>
</div>
<div style="background: #eff6ff; border-radius: 8px; padding: 15px; text-align: center;">
<p style="color: #1e40af; font-size: 0.9em; margin: 0;">πŸ’΅ Money Saved</p>
<p style="color: #2563eb; font-size: 1.5em; font-weight: bold; margin: 5px 0;">
${user_total_cost_saved:.6f}
</p>
<p style="color: #1e40af; font-size: 0.8em; margin: 0;">USD</p>
</div>
</div>
<div style="background: #fefce8; border-radius: 8px; padding: 12px; text-align: center;">
<p style="color: #713f12; font-size: 0.9em; margin: 0;">
<span style="font-weight: 600;">Model Usage:</span> Small model {small_model_count}x, Large model {large_model_count}x
</p>
</div>
</div>
"""
return stats
# Custom CSS for a more professional look
custom_css = """
.gradio-container {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Helvetica', 'Arial', sans-serif;
}
.message {
padding: 12px 16px !important;
border-radius: 8px !important;
}
"""
# Create Gradio interface
with gr.Blocks(
title="Do I really need a huge LLM?",
theme=gr.themes.Base(
primary_hue="blue",
secondary_hue="gray",
neutral_hue="gray",
font=["Inter", "system-ui", "sans-serif"]
),
css=custom_css
) as demo:
with gr.Row():
with gr.Column(scale=3):
gr.Markdown("""
<div style="margin-bottom: 30px;">
<h1 style="margin: 0; font-size: 2em; font-weight: 600; color: #0f172a;">
πŸ€” Do I *really* need a huge LLM?
</h1>
<p style="margin: 10px 0 0 0; color: #64748b; font-size: 1.1em;">
Let's find out! This tool automatically routes your queries to the right-sized model. 🎯
</p>
</div>
""")
with gr.Row():
with gr.Column(scale=3):
chatbot = gr.Chatbot(
height=500,
show_label=False,
container=True,
elem_classes=["chat-container"]
)
with gr.Row():
msg = gr.Textbox(
placeholder="πŸ’­ Type your message here...",
show_label=False,
scale=9,
container=False,
elem_classes=["message-input"]
)
submit = gr.Button(
"Send πŸš€",
variant="primary",
scale=1,
min_width=100
)
with gr.Column(scale=2):
# Model selection display
model_display = gr.HTML(
value="""
<div style="background: #f8fafc; border-radius: 12px; padding: 20px; text-align: center; color: #64748b;">
<p style="margin: 0;">πŸ€– Model selection will appear here</p>
</div>
""",
label="Selected Model"
)
# Savings metrics
savings_display = gr.HTML(
value="""
<div style="background: #f8fafc; border-radius: 12px; padding: 20px; text-align: center; color: #64748b;">
<p style="margin: 0;">πŸ“Š Efficiency metrics will appear here</p>
</div>
""",
label="Efficiency Metrics"
)
# Cumulative stats
stats_display = gr.HTML(
value=get_statistics(),
label="Your Impact Dashboard"
)
# Footer with minimal info
with gr.Row():
gr.Markdown("""
<div style="margin-top: 40px; padding-top: 20px; border-top: 1px solid #e2e8f0; text-align: center; color: #94a3b8; font-size: 0.85em;">
<p style="margin: 5px 0;">πŸ” Comparing small vs large model efficiency β€’ πŸ“ˆ Real-time tracking β€’ 🌎 Environmental impact monitoring</p>
</div>
""")
def respond(message, chat_history):
response, model_info, savings = process_message(message, chat_history)
chat_history.append((message, response))
return "", chat_history, model_info, savings, get_statistics()
msg.submit(respond, [msg, chatbot], [msg, chatbot, model_display, savings_display, stats_display])
submit.click(respond, [msg, chatbot], [msg, chatbot, model_display, savings_display, stats_display])
# Clear button functionality
def clear_chat():
return None, """
<div style="background: #f8fafc; border-radius: 12px; padding: 20px; text-align: center; color: #64748b;">
<p style="margin: 0;">Model selection will appear here</p>
</div>
""", """
<div style="background: #f8fafc; border-radius: 12px; padding: 20px; text-align: center; color: #64748b;">
<p style="margin: 0;">Efficiency metrics will appear here</p>
</div>
""", get_statistics()
# Add clear functionality to the Enter key
msg.submit(lambda: "", outputs=[msg])
if __name__ == "__main__":
print(f"\n{'='*60}")
print(f" DO I REALLY NEED A HUGE LLM? - STARTUP")
print(f"{'='*60}")
print(f"[LAUNCH] Starting Gradio app...")
print(f"[LAUNCH] Environment: TOKENIZERS_PARALLELISM={os.environ.get('TOKENIZERS_PARALLELISM')}")
print(f"[LAUNCH] Models configured:")
for k, v in MODEL_CONFIGS.items():
print(f" - {k}: {v['name']} ({v['icon']})")
print(f"[LAUNCH] OpenRouter API Key: {'βœ“ SET' if OPENROUTER_API_KEY else 'βœ— NOT SET (Demo Mode)'}")
print(f"{'='*60}\n")
demo.launch(share=False)