import gradio as gr
import spaces
from transformers import pipeline, AutoTokenizer
import torch
from typing import List, Dict, Optional
# Global caches for loaded pipelines and tokenizers
model_cache = {}
tokenizer_cache = {}
# Available models
AVAILABLE_MODELS = {
"Daedalus-1-2B": "NoemaResearch/Daedalus-1-2B",
"Daedalus-1-8B": "NoemaResearch/Daedalus-1-8B",
}
# Models that need special token handling for repetition issues
MODELS_NEEDING_SPECIAL_HANDLING = {"Daedalus-1-8B"}
@spaces.GPU
def initialize_model(model_name):
global model_cache, tokenizer_cache
if model_name not in AVAILABLE_MODELS:
raise ValueError(f"Model {model_name} not found in available models")
model_id = AVAILABLE_MODELS[model_name]
# Check if model is already cached
if model_id not in model_cache:
try:
# Load tokenizer separately to handle chat template properly
tokenizer_cache[model_id] = AutoTokenizer.from_pretrained(
model_id,
trust_remote_code=True
)
model_cache[model_id] = pipeline(
"text-generation",
model=model_id,
tokenizer=tokenizer_cache[model_id],
torch_dtype=torch.float16,
device_map="auto",
trust_remote_code=True
)
        except Exception as e:
            # Fall back to CPU if GPU loading fails
            print(f"GPU load failed for {model_id}, falling back to CPU: {e}")
tokenizer_cache[model_id] = AutoTokenizer.from_pretrained(
model_id,
trust_remote_code=True
)
model_cache[model_id] = pipeline(
"text-generation",
model=model_id,
tokenizer=tokenizer_cache[model_id],
torch_dtype=torch.float32,
device_map="cpu",
trust_remote_code=True
)
return model_cache[model_id], tokenizer_cache[model_id]
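# Note (illustrative usage): pipelines and tokenizers are cached per model id, so switching
# models in the UI only pays the download/load cost once, e.g.
#   pipe, tok = initialize_model("Daedalus-1-2B")
#   pipe, tok = initialize_model("Daedalus-1-2B")  # second call returns the cached objects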
def format_conversation_with_template(messages: List[Dict], tokenizer) -> str:
"""Manually apply the chat template to ensure proper formatting"""
# Get the chat template
if hasattr(tokenizer, 'chat_template') and tokenizer.chat_template:
try:
# Use the tokenizer's apply_chat_template method
formatted = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
return formatted
except Exception as e:
print(f"Chat template application failed: {e}")
# Fall back to manual formatting
pass
# Manual fallback formatting using actual special tokens
bos_token = "<[begin▁of▁sentence]>"
eos_token = "<[end▁of▁sentence]>"
# Start with system message
formatted = f"{bos_token}system\nYou are an AI Coding model called Daedalus, developed by Noema Research{eos_token}"
# Add each message
for msg in messages:
role = msg.get('role', 'user')
content = msg.get('content', '').strip()
formatted += f"{bos_token}{role}\n{content}{eos_token}"
# Add generation prompt
formatted += f"{bos_token}assistant\n"
return formatted
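# Example (illustrative, fallback path only): a single user turn "Hello" produces the
# concatenation of the segments below (shown on separate lines here for readability):
#   <[begin▁of▁sentence]>system\nYou are an AI Coding model called Daedalus, developed by Noema Research<[end▁of▁sentence]>
#   <[begin▁of▁sentence]>user\nHello<[end▁of▁sentence]>
#   <[begin▁of▁sentence]>assistant\n
# When the tokenizer ships its own chat_template, the exact markup may differ.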
@spaces.GPU
def generate_response(message, history, model_name, max_length=512, temperature=0.7, top_p=0.9):
"""Generate response using the selected model"""
try:
model_pipe, tokenizer = initialize_model(model_name)
except Exception as e:
return f"Error loading model {model_name}: {str(e)}"
# Format the conversation history
messages = []
for user_msg, assistant_msg in history:
messages.append({"role": "user", "content": user_msg})
if assistant_msg:
messages.append({"role": "assistant", "content": assistant_msg})
messages.append({"role": "user", "content": message})
try:
# Format the conversation using the chat template
formatted_prompt = format_conversation_with_template(messages, tokenizer)
# Different generation parameters based on model
if model_name in MODELS_NEEDING_SPECIAL_HANDLING:
# 8B model needs special token handling to prevent repetition
stop_tokens = [
"<[end▁of▁sentence]>", # EOS token
"<[begin▁of▁sentence]>", # BOS token (shouldn't appear mid-generation)
"user\n", # Stop if model tries to continue conversation
"system\n", # Stop if model tries to add system messages
"\nuser", # Alternative format
"\nsystem" # Alternative format
]
response = model_pipe(
formatted_prompt,
max_new_tokens=max_length,
temperature=temperature,
top_p=top_p,
do_sample=True,
pad_token_id=1, # PAD token ID from config
eos_token_id=2, # EOS token ID from config
bos_token_id=0, # BOS token ID from config
return_full_text=False,
repetition_penalty=1.1, # Reduce loops
stop_sequence=stop_tokens[0] # Primary stop token
)
else:
# 2B model - standard generation without special handling
response = model_pipe(
formatted_prompt,
max_new_tokens=max_length,
temperature=temperature,
top_p=top_p,
do_sample=True,
pad_token_id=tokenizer.pad_token_id,
eos_token_id=tokenizer.eos_token_id,
return_full_text=False,
repetition_penalty=1.05 # Light repetition penalty
)
if isinstance(response, list) and len(response) > 0:
generated_text = response[0]['generated_text']
else:
generated_text = str(response)
# Clean up the response
assistant_response = str(generated_text).strip()
# Apply different cleanup based on model
if model_name in MODELS_NEEDING_SPECIAL_HANDLING:
# More aggressive cleanup for 8B model
stop_tokens = [
"<[end▁of▁sentence]>", "<[begin▁of▁sentence]>",
"user\n", "system\n", "\nuser", "\nsystem"
]
for stop_token in stop_tokens:
if stop_token in assistant_response:
assistant_response = assistant_response.split(stop_token)[0].strip()
# Additional cleanup for common repetition patterns
lines = assistant_response.split('\n')
cleaned_lines = []
for line in lines:
if line.strip() and not line.strip().startswith(('user', 'assistant', 'system')):
cleaned_lines.append(line)
assistant_response = '\n'.join(cleaned_lines).strip()
else:
# Standard cleanup for 2B model
            if assistant_response.startswith("assistant\n"):
                assistant_response = assistant_response[len("assistant\n"):].strip()
return assistant_response if assistant_response else "I apologize, but I couldn't generate a proper response. Please try again."
except Exception as e:
return f"Error generating response: {str(e)}"
def create_interface():
with gr.Blocks(title="Daedalus-1-8B Chat", theme=gr.themes.Base(primary_hue="green")) as demo:
gr.Markdown("""
# 🟢 Daedalus Chat Interface
Chat with **Daedalus models** by Noema Research.
""")
# Model selection dropdown
model_dropdown = gr.Dropdown(
choices=list(AVAILABLE_MODELS.keys()),
value="Daedalus-1-2B", # Default to 2B model
label="Select Model",
info="Choose between Daedalus-1-2B (faster) or Daedalus-1-8B (more capable)"
)
chatbot = gr.Chatbot(
height=400,
placeholder="Start chatting with Daedalus-1-8B...",
label="Chat"
)
msg = gr.Textbox(
placeholder="Type your message here...",
label="Message",
lines=2
)
with gr.Row():
submit_btn = gr.Button("Send", variant="primary")
clear_btn = gr.Button("Clear Chat", variant="secondary")
with gr.Accordion("Advanced Settings", open=False):
max_length = gr.Slider(
minimum=200,
maximum=4096, # Reduced from 8192 to prevent memory issues
value=1024, # Reduced default from 2048
step=50,
label="Max New Tokens",
info="Maximum number of new tokens to generate"
)
temperature = gr.Slider(
minimum=0.1,
maximum=2.0,
value=0.7,
step=0.1,
label="Temperature",
info="Controls randomness in generation"
)
top_p = gr.Slider(
minimum=0.1,
maximum=1.0,
value=0.9,
step=0.1,
label="Top P",
info="Controls diversity via nucleus sampling"
)
def user_message(message, history):
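            """Append the user's turn to the chat history and clear the input box."""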
return "", history + [[message, None]]
def bot_response(history, selected_model, max_len, temp, top_p):
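            """Generate the assistant's reply for the latest user turn using the selected model."""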
if history:
user_message = history[-1][0]
bot_message = generate_response(
user_message,
history[:-1],
selected_model, # Use selected model
max_len,
temp,
top_p
)
history[-1][1] = bot_message
return history
msg.submit(user_message, [msg, chatbot], [msg, chatbot]).then(
bot_response, [chatbot, model_dropdown, max_length, temperature, top_p], chatbot
)
submit_btn.click(user_message, [msg, chatbot], [msg, chatbot]).then(
bot_response, [chatbot, model_dropdown, max_length, temperature, top_p], chatbot
)
clear_btn.click(lambda: None, None, chatbot, queue=False)
gr.Markdown("""
---
### About Daedalus Models
**Daedalus-1-2B:** Faster, lightweight model for quick responses and basic coding tasks.
**Daedalus-1-8B:** More capable model with advanced reasoning, fine-tuned for structured outputs,
debugging, and long-context reasoning (up to ~64K tokens).
Both models are optimized for:
- Conversational AI
- Code generation & debugging
- Structured JSON/function outputs
- Multi-step reasoning
""")
return demo
# Launch the app
if __name__ == "__main__":
demo = create_interface()
demo.launch(share=True)