import gradio as gr import json import time from typing import Dict, Tuple, List from bertmodel import predict_label # from ecologits import EcoLogits # Removed - using OpenRouter instead # from openai import OpenAI # Removed - using OpenRouter instead from dotenv import load_dotenv import os import requests import json # Set environment variable to suppress tokenizers warning os.environ["TOKENIZERS_PARALLELISM"] = "false" load_dotenv() OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY", "") # Model configurations with energy consumption and cost estimates MODEL_CONFIGS = { "large": { "name": "Llama 3.1 405B", "energy_per_token": 0.238, # Wh per token (11.9 Wh / 50 tokens) "cost_per_input_token": 0.000003, # $3/M tokens (OpenRouter pricing) "cost_per_output_token": 0.000003, # $3/M tokens (OpenRouter pricing) "icon": "π¦" }, "small": { "name": "Mistral Small 24B", "energy_per_token": 0.00596, # Wh per token (0.298 Wh / 50 tokens) "cost_per_input_token": 0.00000005, # $0.05/M tokens "cost_per_output_token": 0.00000012, # $0.12/M tokens "icon": "β‘" } } class ModelRouter: def __init__(self): self.routing_history = [] print("[INIT] ModelRouter initialized") def classify_prompt(self, prompt: str) -> str: print(f"\n[CLASSIFY] Classifying prompt: '{prompt[:50]}...'") label = predict_label(prompt) print(f"[CLASSIFY] ModernBERT returned label: '{label}'") return label def select_model(self, prompt: str) -> str: """Select the most efficient model based on prompt classification.""" prompt_type = self.classify_prompt(prompt) # Normalize key = prompt_type.strip().lower() print(f"[SELECT] Normalized label: '{key}'") # Map normalized labels to actual MODEL_CONFIGS keys if "small" in key: print(f"[SELECT] Selected: SMALL model (Mistral Small 24B)") return "small" else: print(f"[SELECT] Selected: LARGE model (Claude Opus 4)") return "large" def estimate_tokens(self, prompt: str, response: str | None = None, max_response_tokens: int | None = None) -> int: """ Estimate total token count: exact prompt tokens + a target number of response tokens. """ # Simple estimation: 4 characters = 1 token prompt_tokens = len(prompt) // 4 print(f"[TOKENS] Prompt tokens: {prompt_tokens} (from {len(prompt)} chars)") if response is not None: response_tokens = len(response) // 4 elif max_response_tokens is not None: # youβre reserving this many tokens for the modelβs reply response_tokens = max_response_tokens else: # Estimate response will be similar length to prompt response_tokens = prompt_tokens total_tokens = prompt_tokens + response_tokens print(f"[TOKENS] Response tokens: {response_tokens}, Total: {total_tokens}") return total_tokens def estimate_large_model_energy(self, tokens: int) -> float: """ Estimate large model energy consumption based on tokens. Using empirical estimates for energy consumption. """ large_config = MODEL_CONFIGS["large"] return tokens * large_config["energy_per_token"] def calculate_savings(self, selected_model: str, prompt: str, response: str = None) -> Dict: """Calculate energy and cost savings compared to using the large model""" print(f"[SAVINGS] Calculating for model: {selected_model}") # Calculate input and output tokens separately input_tokens = max(1, len(prompt) // 4) # Minimum 1 token if response: # Use actual response length if available output_tokens = max(1, len(response) // 4) else: # Estimate if no response yet (for preview) output_tokens = max(10, input_tokens) # Assume at least 10 tokens response total_tokens = input_tokens + output_tokens print(f"[SAVINGS] Input tokens: {input_tokens}, Output tokens: {output_tokens}") selected_config = MODEL_CONFIGS[selected_model] large_config = MODEL_CONFIGS["large"] # Calculate actual usage actual_energy = total_tokens * selected_config["energy_per_token"] actual_cost = (input_tokens * selected_config["cost_per_input_token"] + output_tokens * selected_config["cost_per_output_token"]) # Calculate large model usage large_energy = self.estimate_large_model_energy(total_tokens) large_cost = (input_tokens * large_config["cost_per_input_token"] + output_tokens * large_config["cost_per_output_token"]) # Calculate savings (only positive if small model is selected) if selected_model == "small": energy_saved = large_energy - actual_energy cost_saved = large_cost - actual_cost energy_saved_percent = (energy_saved / large_energy) * 100 if large_energy > 0 else 0 cost_saved_percent = (cost_saved / large_cost) * 100 if large_cost > 0 else 0 else: # No savings if using the large model energy_saved = 0 cost_saved = 0 energy_saved_percent = 0 cost_saved_percent = 0 print(f"[SAVINGS] Selected: {selected_model}") print(f"[SAVINGS] Actual energy: {actual_energy:.4f} Wh, Large energy: {large_energy:.4f} Wh") print(f"[SAVINGS] Actual cost: ${actual_cost:.8f}, Large cost: ${large_cost:.8f}") print(f"[SAVINGS] Energy saved: {energy_saved:.4f} Wh ({energy_saved_percent:.1f}%)") print(f"[SAVINGS] Cost saved: ${cost_saved:.8f} ({cost_saved_percent:.1f}%)") return { "selected_model": selected_config["name"], "tokens": total_tokens, "actual_energy": actual_energy, "actual_cost": actual_cost, "large_energy": large_energy, "large_cost": large_cost, "energy_saved": energy_saved, "cost_saved": cost_saved, "energy_saved_percent": energy_saved_percent, "cost_saved_percent": cost_saved_percent, "is_large_model": selected_model == "large" # Add flag for template } print("[STARTUP] Initializing ModelRouter...") router = ModelRouter() print("[STARTUP] ModelRouter ready") print(f"[STARTUP] Available models: {list(MODEL_CONFIGS.keys())}") print(f"[STARTUP] OpenRouter API Key: {'SET' if OPENROUTER_API_KEY else 'NOT SET'}") def process_message(message: str, history: List[List[str]]) -> Tuple[str, str, str]: """Process the user message and return response with savings info""" print(f"\n{'='*60}") print(f"[PROCESS] New message received: '{message[:100]}...'") # Route to appropriate model selected_model = router.select_model(message) model_config = MODEL_CONFIGS[selected_model] print(f"[PROCESS] Using model config: {model_config['name']}") # Initial savings estimate (will be recalculated after getting response) print(f"[PROCESS] Calculating initial savings estimate...") initial_savings = router.calculate_savings(selected_model, message) print(f"[PROCESS] Initial estimate: {initial_savings['energy_saved_percent']:.1f}% energy, {initial_savings['cost_saved_percent']:.1f}% cost") open_router_model_dict = { "large": "meta-llama/llama-3.1-405b-instruct", "small": "mistralai/mistral-small-24b-instruct-2501" } # Check if API key is available if not OPENROUTER_API_KEY: print(f"[API] No OpenRouter API key found - running in DEMO MODE") answer = f"[Demo Mode] This would be a response from {model_config['name']} to: {message[:50]}..." else: print(f"[API] OpenRouter API key found: {OPENROUTER_API_KEY[:10]}...") try: model_id = open_router_model_dict[selected_model] print(f"[API] Calling OpenRouter with model: {model_id}") request_data = { "model": model_id, "messages": [ { "role": "user", "content": message } ] } print(f"[API] Request data: {json.dumps(request_data, indent=2)[:200]}...") response = requests.post( url="https://openrouter.ai/api/v1/chat/completions", headers={ "Authorization": f"Bearer {OPENROUTER_API_KEY}", "Content-Type": "application/json" }, data=json.dumps(request_data) ) # Debug: print response status and content print(f"[API] Response Status Code: {response.status_code}") print(f"[API] Response Headers: {dict(response.headers)}") if response.status_code != 200: print(f"[API ERROR] Full response: {response.text}") answer = f"[API Error {response.status_code}] {response.text[:200]}..." else: data = response.json() print(f"[API] Response keys: {list(data.keys())}") if "choices" in data and len(data["choices"]) > 0: answer = data["choices"][0]["message"]["content"] print(f"[API] Successfully got response: {answer[:100]}...") else: print(f"[API ERROR] Unexpected response format: {json.dumps(data, indent=2)}") answer = f"[Error] Unexpected response format from OpenRouter API" except Exception as e: print(f"[API EXCEPTION] Error type: {type(e).__name__}") print(f"[API EXCEPTION] Error message: {str(e)}") import traceback print(f"[API EXCEPTION] Traceback:\n{traceback.format_exc()}") answer = f"[Error] Failed to get response from {model_config['name']}. Error: {str(e)}" # Recalculate savings with actual response print(f"[PROCESS] Recalculating savings with actual response...") savings = router.calculate_savings(selected_model, message, answer) print(f"[PROCESS] Final savings: {savings['energy_saved_percent']:.1f}% energy, {savings['cost_saved_percent']:.1f}% cost") # Format the response with model info response = f"{answer}\n\n
Optimal model selected for your query
π₯ Energy Consumption
{savings['actual_energy']:.1f} Wh
High energy usage
πΈ Cost Impact
${savings['actual_cost']:.6f}
Premium pricing
β‘ Energy Efficiency
{savings['energy_saved_percent']:.1f}% saved
{savings['energy_saved']:.1f} Wh reduction
vs. using large model
π° Cost Optimization
{savings['cost_saved_percent']:.1f}% saved
${savings['cost_saved']:.8f} reduction
vs. using large model
No queries processed yet
π¬ Start a conversation to see your impact metrics
π± Energy Saved
{user_total_energy_saved:.1f}
Wh
π΅ Money Saved
${user_total_cost_saved:.6f}
USD
Model Usage: Small model {small_model_count}x, Large model {large_model_count}x
Let's find out! This tool automatically routes your queries to the right-sized model. π―
π€ Model selection will appear here
π Efficiency metrics will appear here
π Comparing small vs large model efficiency β’ π Real-time tracking β’ π Environmental impact monitoring
Model selection will appear here
Efficiency metrics will appear here