import gradio as gr import spaces from transformers import pipeline, AutoTokenizer import torch from typing import List, Dict, Optional # Global variable to store pipelines model_cache = {} tokenizer_cache = {} # Available models AVAILABLE_MODELS = { "Daedalus-1-2B": "NoemaResearch/Daedalus-1-2B", "Daedalus-1-8B": "NoemaResearch/Daedalus-1-8B", } # Models that need special token handling for repetition issues MODELS_NEEDING_SPECIAL_HANDLING = {"Daedalus-1-8B"} @spaces.GPU def initialize_model(model_name): global model_cache, tokenizer_cache if model_name not in AVAILABLE_MODELS: raise ValueError(f"Model {model_name} not found in available models") model_id = AVAILABLE_MODELS[model_name] # Check if model is already cached if model_id not in model_cache: try: # Load tokenizer separately to handle chat template properly tokenizer_cache[model_id] = AutoTokenizer.from_pretrained( model_id, trust_remote_code=True ) model_cache[model_id] = pipeline( "text-generation", model=model_id, tokenizer=tokenizer_cache[model_id], torch_dtype=torch.float16, device_map="auto", trust_remote_code=True ) except Exception: # Fallback to CPU if GPU fails tokenizer_cache[model_id] = AutoTokenizer.from_pretrained( model_id, trust_remote_code=True ) model_cache[model_id] = pipeline( "text-generation", model=model_id, tokenizer=tokenizer_cache[model_id], torch_dtype=torch.float32, device_map="cpu", trust_remote_code=True ) return model_cache[model_id], tokenizer_cache[model_id] def format_conversation_with_template(messages: List[Dict], tokenizer) -> str: """Manually apply the chat template to ensure proper formatting""" # Get the chat template if hasattr(tokenizer, 'chat_template') and tokenizer.chat_template: try: # Use the tokenizer's apply_chat_template method formatted = tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) return formatted except Exception as e: print(f"Chat template application failed: {e}") # Fall back to manual formatting pass # Manual fallback formatting using actual special tokens bos_token = "<[begin▁of▁sentence]>" eos_token = "<[end▁of▁sentence]>" # Start with system message formatted = f"{bos_token}system\nYou are an AI Coding model called Daedalus, developed by Noema Research{eos_token}" # Add each message for msg in messages: role = msg.get('role', 'user') content = msg.get('content', '').strip() formatted += f"{bos_token}{role}\n{content}{eos_token}" # Add generation prompt formatted += f"{bos_token}assistant\n" return formatted @spaces.GPU def generate_response(message, history, model_name, max_length=512, temperature=0.7, top_p=0.9): """Generate response using the selected model""" try: model_pipe, tokenizer = initialize_model(model_name) except Exception as e: return f"Error loading model {model_name}: {str(e)}" # Format the conversation history messages = [] for user_msg, assistant_msg in history: messages.append({"role": "user", "content": user_msg}) if assistant_msg: messages.append({"role": "assistant", "content": assistant_msg}) messages.append({"role": "user", "content": message}) try: # Format the conversation using the chat template formatted_prompt = format_conversation_with_template(messages, tokenizer) # Different generation parameters based on model if model_name in MODELS_NEEDING_SPECIAL_HANDLING: # 8B model needs special token handling to prevent repetition stop_tokens = [ "<[end▁of▁sentence]>", # EOS token "<[begin▁of▁sentence]>", # BOS token (shouldn't appear mid-generation) "user\n", # Stop if model tries to continue conversation "system\n", # Stop if model tries to add system messages "\nuser", # Alternative format "\nsystem" # Alternative format ] response = model_pipe( formatted_prompt, max_new_tokens=max_length, temperature=temperature, top_p=top_p, do_sample=True, pad_token_id=1, # PAD token ID from config eos_token_id=2, # EOS token ID from config bos_token_id=0, # BOS token ID from config return_full_text=False, repetition_penalty=1.1, # Reduce loops stop_sequence=stop_tokens[0] # Primary stop token ) else: # 2B model - standard generation without special handling response = model_pipe( formatted_prompt, max_new_tokens=max_length, temperature=temperature, top_p=top_p, do_sample=True, pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id, return_full_text=False, repetition_penalty=1.05 # Light repetition penalty ) if isinstance(response, list) and len(response) > 0: generated_text = response[0]['generated_text'] else: generated_text = str(response) # Clean up the response assistant_response = str(generated_text).strip() # Apply different cleanup based on model if model_name in MODELS_NEEDING_SPECIAL_HANDLING: # More aggressive cleanup for 8B model stop_tokens = [ "<[end▁of▁sentence]>", "<[begin▁of▁sentence]>", "user\n", "system\n", "\nuser", "\nsystem" ] for stop_token in stop_tokens: if stop_token in assistant_response: assistant_response = assistant_response.split(stop_token)[0].strip() # Additional cleanup for common repetition patterns lines = assistant_response.split('\n') cleaned_lines = [] for line in lines: if line.strip() and not line.strip().startswith(('user', 'assistant', 'system')): cleaned_lines.append(line) assistant_response = '\n'.join(cleaned_lines).strip() else: # Standard cleanup for 2B model if assistant_response.startswith("assistant\n"): assistant_response = assistant_response[10:].strip() return assistant_response if assistant_response else "I apologize, but I couldn't generate a proper response. Please try again." except Exception as e: return f"Error generating response: {str(e)}" def create_interface(): with gr.Blocks(title="Daedalus-1-8B Chat", theme=gr.themes.Base(primary_hue="green")) as demo: gr.Markdown(""" # 🟢 Daedalus Chat Interface Chat with **Daedalus models** by Noema Research. """) # Model selection dropdown model_dropdown = gr.Dropdown( choices=list(AVAILABLE_MODELS.keys()), value="Daedalus-1-2B", # Default to 2B model label="Select Model", info="Choose between Daedalus-1-2B (faster) or Daedalus-1-8B (more capable)" ) chatbot = gr.Chatbot( height=400, placeholder="Start chatting with Daedalus-1-8B...", label="Chat" ) msg = gr.Textbox( placeholder="Type your message here...", label="Message", lines=2 ) with gr.Row(): submit_btn = gr.Button("Send", variant="primary") clear_btn = gr.Button("Clear Chat", variant="secondary") with gr.Accordion("Advanced Settings", open=False): max_length = gr.Slider( minimum=200, maximum=4096, # Reduced from 8192 to prevent memory issues value=1024, # Reduced default from 2048 step=50, label="Max New Tokens", info="Maximum number of new tokens to generate" ) temperature = gr.Slider( minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature", info="Controls randomness in generation" ) top_p = gr.Slider( minimum=0.1, maximum=1.0, value=0.9, step=0.1, label="Top P", info="Controls diversity via nucleus sampling" ) def user_message(message, history): return "", history + [[message, None]] def bot_response(history, selected_model, max_len, temp, top_p): if history: user_message = history[-1][0] bot_message = generate_response( user_message, history[:-1], selected_model, # Use selected model max_len, temp, top_p ) history[-1][1] = bot_message return history msg.submit(user_message, [msg, chatbot], [msg, chatbot]).then( bot_response, [chatbot, model_dropdown, max_length, temperature, top_p], chatbot ) submit_btn.click(user_message, [msg, chatbot], [msg, chatbot]).then( bot_response, [chatbot, model_dropdown, max_length, temperature, top_p], chatbot ) clear_btn.click(lambda: None, None, chatbot, queue=False) gr.Markdown(""" --- ### About Daedalus Models **Daedalus-1-2B:** Faster, lightweight model for quick responses and basic coding tasks. **Daedalus-1-8B:** More capable model with advanced reasoning, fine-tuned for structured outputs, debugging, and long-context reasoning (up to ~64K tokens). Both models are optimized for: - Conversational AI - Code generation & debugging - Structured JSON/function outputs - Multi-step reasoning """) return demo # Launch the app if __name__ == "__main__": demo = create_interface() demo.launch(share=True)