# Daedalus-1
#
# Running on Zero
#
# File size: 11,200 Bytes

import gradio as gr
import spaces
from transformers import pipeline, AutoTokenizer
import torch
from typing import List, Dict, Optional

# Process-wide caches, both keyed by Hugging Face repository id, so that a
# pipeline and its tokenizer are only loaded the first time they are needed.
model_cache: Dict[str, object] = {}
tokenizer_cache: Dict[str, object] = {}

# Display name offered in the UI -> Hugging Face repository id.
AVAILABLE_MODELS: Dict[str, str] = {
    display_name: f"NoemaResearch/{display_name}"
    for display_name in ("Daedalus-1-2B", "Daedalus-1-8B")
}

# Display names of the models whose generation uses explicit stop tokens and
# whose output gets extra cleanup to counter repetition.
MODELS_NEEDING_SPECIAL_HANDLING = {"Daedalus-1-8B"}

@spaces.GPU
def initialize_model(model_name):
    """Return the (pipeline, tokenizer) pair for ``model_name``, loading it on first use.

    The pipeline is first loaded in float16 with ``device_map="auto"``. If that
    raises, the error is printed and the load is retried in float32 on CPU.
    Loaded objects are stored in the module-level ``model_cache`` and
    ``tokenizer_cache`` dictionaries, keyed by repository id.

    Args:
        model_name: Display name of the model; must be a key of
            ``AVAILABLE_MODELS``.

    Returns:
        A ``(pipeline, tokenizer)`` tuple for the requested model.

    Raises:
        ValueError: If ``model_name`` is not a key of ``AVAILABLE_MODELS``.
        Exception: Whatever the CPU fallback raises if it fails as well.
    """
    if model_name not in AVAILABLE_MODELS:
        raise ValueError(f"Model {model_name} not found in available models")

    model_id = AVAILABLE_MODELS[model_name]

    # Fast path: this model was already loaded by an earlier call.
    if model_id in model_cache:
        return model_cache[model_id], tokenizer_cache[model_id]

    def load_pipeline(dtype, device_map):
        """Build the text-generation pipeline with the given dtype and placement."""
        # The tokenizer is loaded separately so its chat template can be used
        # by the caller. Reuse one that an earlier (failed) attempt already
        # loaded instead of fetching it a second time.
        if model_id not in tokenizer_cache:
            tokenizer_cache[model_id] = AutoTokenizer.from_pretrained(
                model_id,
                trust_remote_code=True,
            )
        return pipeline(
            "text-generation",
            model=model_id,
            tokenizer=tokenizer_cache[model_id],
            torch_dtype=dtype,
            device_map=device_map,
            trust_remote_code=True,
        )

    try:
        model_cache[model_id] = load_pipeline(torch.float16, "auto")
    except Exception as load_error:
        # Deliberately broad: any failure of the accelerated load triggers the
        # CPU fallback, but the reason is reported instead of being discarded.
        print(
            f"Loading {model_id} with device_map='auto' failed: {load_error}. "
            "Falling back to CPU."
        )
        model_cache[model_id] = load_pipeline(torch.float32, "cpu")

    return model_cache[model_id], tokenizer_cache[model_id]

def format_conversation_with_template(messages: List[Dict], tokenizer) -> str:
    """Render chat messages into one prompt string that ends with an open assistant turn.

    The tokenizer's own chat template is used when it has a non-empty
    ``chat_template`` attribute. If the tokenizer has no template, or applying
    it raises, the prompt is assembled manually from hard-coded special tokens.

    Args:
        messages: Dicts with ``role`` and ``content`` keys. In the manual
            fallback a missing ``role`` defaults to ``"user"`` and a missing or
            ``None`` ``content`` is treated as an empty string.
        tokenizer: Object that may expose ``chat_template`` and
            ``apply_chat_template``; anything else selects the manual fallback.

    Returns:
        The formatted prompt string.
    """
    if getattr(tokenizer, 'chat_template', None):
        try:
            return tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
            )
        except Exception as e:
            # Best effort: report the problem and use the manual format below.
            print(f"Chat template application failed: {e}")

    # Manual fallback formatting using actual special tokens
    bos_token = "<[begin▁of▁sentence]>"
    eos_token = "<[end▁of▁sentence]>"

    # The fallback prompt always opens with this fixed system message.
    parts = [
        f"{bos_token}system\nYou are an AI Coding model called Daedalus, developed by Noema Research{eos_token}"
    ]

    for msg in messages:
        role = msg.get('role', 'user')
        # ``content`` may be present but None; treat that like empty text
        # rather than failing on ``None.strip()``.
        content = (msg.get('content') or '').strip()
        parts.append(f"{bos_token}{role}\n{content}{eos_token}")

    # Generation prompt: an assistant turn left open for the model to complete.
    parts.append(f"{bos_token}assistant\n")

    return "".join(parts)

# Markers after which output of the special-handling models is discarded.
# The first entry is also passed to the pipeline as its stop sequence.
_STOP_MARKERS = (
    "<[end▁of▁sentence]>",  # EOS token
    "<[begin▁of▁sentence]>",  # BOS token (shouldn't appear mid-generation)
    "user\n",  # Stop if model tries to continue conversation
    "system\n",  # Stop if model tries to add system messages
    "\nuser",  # Alternative format
    "\nsystem",  # Alternative format
)

# Lines starting with one of these words are dropped from special-handling output.
_ROLE_PREFIXES = ('user', 'assistant', 'system')


def _build_chat_messages(message, history):
    """Turn (user, assistant) history pairs plus the new message into role/content dicts."""
    messages = []
    # ``history or []`` tolerates a missing history (None) as an empty chat.
    for user_msg, assistant_msg in history or []:
        messages.append({"role": "user", "content": user_msg})
        # A turn that has no reply yet contributes only its user message.
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
    return messages


def _clean_generated_text(generated_text, model_name):
    """Strip role markers and stop tokens from raw model output."""
    cleaned = str(generated_text).strip()

    if model_name not in MODELS_NEEDING_SPECIAL_HANDLING:
        # Standard cleanup: drop a leading "assistant\n" role header.
        role_header = "assistant\n"
        if cleaned.startswith(role_header):
            cleaned = cleaned[len(role_header):].strip()
        return cleaned

    # Aggressive cleanup: cut the text at the first occurrence of each marker,
    # applied in order.
    for marker in _STOP_MARKERS:
        if marker in cleaned:
            cleaned = cleaned.split(marker)[0].strip()

    # NOTE(review): this filter also removes blank lines and any line that
    # merely starts with a role word (e.g. "username = ..."); kept as-is to
    # preserve the existing anti-repetition behavior.
    kept_lines = [
        line
        for line in cleaned.split('\n')
        if line.strip() and not line.strip().startswith(_ROLE_PREFIXES)
    ]
    return '\n'.join(kept_lines).strip()


@spaces.GPU
def generate_response(message, history, model_name, max_length=512, temperature=0.7, top_p=0.9):
    """Generate response using the selected model.

    Args:
        message: The new user message to answer.
        history: Earlier turns as ``(user_message, assistant_message)`` pairs;
            ``None`` or empty means no earlier turns.
        model_name: Display name of the model, a key of ``AVAILABLE_MODELS``.
        max_length: Maximum number of new tokens to generate.
        temperature: Sampling temperature passed to the pipeline.
        top_p: Nucleus-sampling threshold passed to the pipeline.

    Returns:
        The cleaned assistant reply. Failures are not raised: a string starting
        with "Error loading model" or "Error generating response" is returned
        instead, and an empty reply is replaced by an apology message.
    """
    try:
        model_pipe, tokenizer = initialize_model(model_name)
    except Exception as e:
        return f"Error loading model {model_name}: {str(e)}"

    try:
        messages = _build_chat_messages(message, history)

        # Format the conversation using the chat template
        formatted_prompt = format_conversation_with_template(messages, tokenizer)

        # Settings shared by every model.
        generation_kwargs = {
            # Coerce in case the UI delivers a float; the value is a token count.
            "max_new_tokens": int(max_length),
            "temperature": temperature,
            "top_p": top_p,
            "do_sample": True,
            "return_full_text": False,
        }

        if model_name in MODELS_NEEDING_SPECIAL_HANDLING:
            # 8B model needs special token handling to prevent repetition.
            # The token ids are hard-coded; per the original author they come
            # from the model config (TODO confirm against the model repo).
            generation_kwargs.update(
                pad_token_id=1,
                eos_token_id=2,
                bos_token_id=0,
                repetition_penalty=1.1,  # Reduce loops
                stop_sequence=_STOP_MARKERS[0],  # Primary stop token
            )
        else:
            # 2B model - standard generation without special handling
            generation_kwargs.update(
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.05,  # Light repetition penalty
            )

        response = model_pipe(formatted_prompt, **generation_kwargs)

        if isinstance(response, list) and response:
            generated_text = response[0]['generated_text']
        else:
            generated_text = str(response)

        assistant_response = _clean_generated_text(generated_text, model_name)

        return assistant_response if assistant_response else "I apologize, but I couldn't generate a proper response. Please try again."

    except Exception as e:
        return f"Error generating response: {str(e)}"

def create_interface():
    """Build the Gradio Blocks chat UI and return it without launching.

    The UI contains a model dropdown, a chat window, a message box with Send
    and Clear buttons, and an accordion of generation settings. Submitting a
    message first appends it to the chat, then asks ``generate_response`` for
    the reply using the model and settings currently selected.

    Returns:
        The assembled ``gr.Blocks`` application.
    """
    # The title and placeholder are model-neutral because the model is
    # selectable and the default is the 2B model.
    with gr.Blocks(title="Daedalus Chat", theme=gr.themes.Base(primary_hue="green")) as demo:
        gr.Markdown("""
        # 🟢 Daedalus Chat Interface
        
        Chat with **Daedalus models** by Noema Research.
        """)

        # Model selection dropdown
        model_dropdown = gr.Dropdown(
            choices=list(AVAILABLE_MODELS.keys()),
            value="Daedalus-1-2B",  # Default to 2B model
            label="Select Model",
            info="Choose between Daedalus-1-2B (faster) or Daedalus-1-8B (more capable)"
        )

        chatbot = gr.Chatbot(
            height=400,
            placeholder="Start chatting with Daedalus...",
            label="Chat"
        )

        msg = gr.Textbox(
            placeholder="Type your message here...",
            label="Message",
            lines=2
        )

        with gr.Row():
            submit_btn = gr.Button("Send", variant="primary")
            clear_btn = gr.Button("Clear Chat", variant="secondary")

        with gr.Accordion("Advanced Settings", open=False):
            max_length = gr.Slider(
                minimum=200,
                maximum=4096,  # Reduced from 8192 to prevent memory issues
                value=1024,    # Reduced default from 2048
                step=50,
                label="Max New Tokens",
                info="Maximum number of new tokens to generate"
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=2.0,
                value=0.7,
                step=0.1,
                label="Temperature",
                info="Controls randomness in generation"
            )
            top_p = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.9,
                step=0.1,
                label="Top P",
                info="Controls diversity via nucleus sampling"
            )

        def user_message(message, history):
            """Clear the textbox and queue the message as a turn awaiting a reply."""
            history = history or []
            # Ignore empty or whitespace-only submissions instead of sending
            # them to the model.
            if not message or not message.strip():
                return "", history
            return "", history + [[message, None]]

        def bot_response(history, selected_model, max_len, temp, top_p_value):
            """Fill in the reply for the last turn if it is still unanswered."""
            # Only a pending turn (reply is None) is answered, so a skipped
            # blank submission never regenerates the previous reply.
            if history and history[-1][1] is None:
                latest_user_message = history[-1][0]
                history[-1][1] = generate_response(
                    latest_user_message,
                    history[:-1],
                    selected_model,  # Use selected model
                    max_len,
                    temp,
                    top_p_value
                )
            return history

        # Pressing Enter in the textbox and clicking Send run the same chain.
        generation_inputs = [chatbot, model_dropdown, max_length, temperature, top_p]
        for trigger in (msg.submit, submit_btn.click):
            trigger(user_message, [msg, chatbot], [msg, chatbot]).then(
                bot_response, generation_inputs, chatbot
            )

        clear_btn.click(lambda: None, None, chatbot, queue=False)

        gr.Markdown("""
        ---
        
        ### About Daedalus Models
        
        **Daedalus-1-2B:** Faster, lightweight model for quick responses and basic coding tasks.
        
        **Daedalus-1-8B:** More capable model with advanced reasoning, fine-tuned for structured outputs, 
        debugging, and long-context reasoning (up to ~64K tokens).
        
        Both models are optimized for:
        - Conversational AI
        - Code generation & debugging
        - Structured JSON/function outputs
        - Multi-step reasoning
        """)

    return demo

# Launch the app
# Runs only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    # Build the Blocks UI, then start serving it.
    demo = create_interface()
    # share=True asks Gradio to also create a public share link.
    demo.launch(share=True)