import gradio as gr
import spaces
from transformers import pipeline, AutoTokenizer
import torch
from typing import List, Dict, Optional

# Global caches for loaded pipelines and tokenizers
model_cache = {}
tokenizer_cache = {}

# Available models
AVAILABLE_MODELS = {
    "Daedalus-1-2B": "NoemaResearch/Daedalus-1-2B",
    "Daedalus-1-8B": "NoemaResearch/Daedalus-1-8B",
}

# Models that need special token handling for repetition issues
MODELS_NEEDING_SPECIAL_HANDLING = {"Daedalus-1-8B"}
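# Note: exposing another checkpoint should only require a new entry above, e.g. the
# hypothetical
#   "Daedalus-1-XB": "NoemaResearch/Daedalus-1-XB",
# plus adding its display name to MODELS_NEEDING_SPECIAL_HANDLING if it also loops.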

def initialize_model(model_name):
    global model_cache, tokenizer_cache

    if model_name not in AVAILABLE_MODELS:
        raise ValueError(f"Model {model_name} not found in available models")

    model_id = AVAILABLE_MODELS[model_name]

    # Check if the model is already cached
    if model_id not in model_cache:
        try:
            # Load the tokenizer separately to handle the chat template properly
            tokenizer_cache[model_id] = AutoTokenizer.from_pretrained(
                model_id,
                trust_remote_code=True
            )
            model_cache[model_id] = pipeline(
                "text-generation",
                model=model_id,
                tokenizer=tokenizer_cache[model_id],
                torch_dtype=torch.float16,
                device_map="auto",
                trust_remote_code=True
            )
        except Exception:
            # Fall back to CPU if GPU loading fails
            tokenizer_cache[model_id] = AutoTokenizer.from_pretrained(
                model_id,
                trust_remote_code=True
            )
            model_cache[model_id] = pipeline(
                "text-generation",
                model=model_id,
                tokenizer=tokenizer_cache[model_id],
                torch_dtype=torch.float32,
                device_map="cpu",
                trust_remote_code=True
            )

    return model_cache[model_id], tokenizer_cache[model_id]
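
# Usage note (sketch): thanks to the module-level caches above, only the first call
# per model pays the download/load cost; later calls return the cached objects:
#
#   pipe, tok = initialize_model("Daedalus-1-2B")  # loads and caches
#   pipe, tok = initialize_model("Daedalus-1-2B")  # served from model_cache / tokenizer_cache
#
# The example is kept in comments so importing this file never triggers a download.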

def format_conversation_with_template(messages: List[Dict], tokenizer) -> str:
    """Apply the tokenizer's chat template, falling back to manual formatting."""
    # Prefer the tokenizer's built-in chat template when one is defined
    if hasattr(tokenizer, 'chat_template') and tokenizer.chat_template:
        try:
            formatted = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )
            return formatted
        except Exception as e:
            print(f"Chat template application failed: {e}")
            # Fall through to manual formatting below

    # Manual fallback formatting using the model's special tokens
    bos_token = "<[begin▁of▁sentence]>"
    eos_token = "<[end▁of▁sentence]>"

    # Start with the system message
    formatted = f"{bos_token}system\nYou are an AI Coding model called Daedalus, developed by Noema Research{eos_token}"

    # Add each message in the conversation
    for msg in messages:
        role = msg.get('role', 'user')
        content = msg.get('content', '').strip()
        formatted += f"{bos_token}{role}\n{content}{eos_token}"

    # Add the generation prompt
    formatted += f"{bos_token}assistant\n"
    return formatted
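
# Example (sketch): with no usable chat template, the manual fallback above turns
#
#   [{"role": "user", "content": "Hello"}]
#
# into one unbroken string of the form (split across lines here for readability):
#
#   <[begin▁of▁sentence]>system\nYou are an AI Coding model called Daedalus, developed by Noema Research<[end▁of▁sentence]>
#   <[begin▁of▁sentence]>user\nHello<[end▁of▁sentence]>
#   <[begin▁of▁sentence]>assistant\n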

@spaces.GPU  # ZeroGPU: request a GPU for the duration of each generation call
def generate_response(message, history, model_name, max_length=512, temperature=0.7, top_p=0.9):
    """Generate a response using the selected model."""
    try:
        model_pipe, tokenizer = initialize_model(model_name)
    except Exception as e:
        return f"Error loading model {model_name}: {str(e)}"

    # Rebuild the conversation history as chat messages
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    try:
        # Format the conversation using the chat template
        formatted_prompt = format_conversation_with_template(messages, tokenizer)

        # Different generation parameters depending on the model
        if model_name in MODELS_NEEDING_SPECIAL_HANDLING:
            # The 8B model needs special token handling to prevent repetition
            stop_tokens = [
                "<[end▁of▁sentence]>",    # EOS token
                "<[begin▁of▁sentence]>",  # BOS token (shouldn't appear mid-generation)
                "user\n",                 # Stop if the model tries to continue the conversation
                "system\n",               # Stop if the model tries to add system messages
                "\nuser",                 # Alternative format
                "\nsystem"                # Alternative format
            ]
            response = model_pipe(
                formatted_prompt,
                max_new_tokens=max_length,
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                pad_token_id=1,  # PAD token ID from the model config
                eos_token_id=2,  # EOS token ID from the model config
                bos_token_id=0,  # BOS token ID from the model config
                return_full_text=False,
                repetition_penalty=1.1,       # Discourage loops
                stop_sequence=stop_tokens[0]  # Primary stop token
            )
        else:
            # 2B model - standard generation without special handling
            response = model_pipe(
                formatted_prompt,
                max_new_tokens=max_length,
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                return_full_text=False,
                repetition_penalty=1.05  # Light repetition penalty
            )

        if isinstance(response, list) and len(response) > 0:
            generated_text = response[0]['generated_text']
        else:
            generated_text = str(response)

        # Clean up the response
        assistant_response = str(generated_text).strip()

        # Apply different cleanup depending on the model
        if model_name in MODELS_NEEDING_SPECIAL_HANDLING:
            # More aggressive cleanup for the 8B model
            stop_tokens = [
                "<[end▁of▁sentence]>", "<[begin▁of▁sentence]>",
                "user\n", "system\n", "\nuser", "\nsystem"
            ]
            for stop_token in stop_tokens:
                if stop_token in assistant_response:
                    assistant_response = assistant_response.split(stop_token)[0].strip()

            # Additional cleanup for common repetition patterns
            lines = assistant_response.split('\n')
            cleaned_lines = []
            for line in lines:
                if line.strip() and not line.strip().startswith(('user', 'assistant', 'system')):
                    cleaned_lines.append(line)
            assistant_response = '\n'.join(cleaned_lines).strip()
        else:
            # Standard cleanup for the 2B model
            if assistant_response.startswith("assistant\n"):
                assistant_response = assistant_response[len("assistant\n"):].strip()

        return assistant_response if assistant_response else "I apologize, but I couldn't generate a proper response. Please try again."

    except Exception as e:
        return f"Error generating response: {str(e)}"
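
# Example (sketch): generate_response can also be exercised outside the UI, e.g.
#
#   reply = generate_response(
#       "Write a Python function that reverses a string.",
#       history=[],                    # no prior turns
#       model_name="Daedalus-1-2B",
#       max_length=256,
#   )
#
# Kept in comments because a real call downloads and loads model weights.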

def create_interface():
    with gr.Blocks(title="Daedalus Chat", theme=gr.themes.Base(primary_hue="green")) as demo:
        gr.Markdown("""
        # 🟢 Daedalus Chat Interface

        Chat with **Daedalus models** by Noema Research.
        """)

        # Model selection dropdown
        model_dropdown = gr.Dropdown(
            choices=list(AVAILABLE_MODELS.keys()),
            value="Daedalus-1-2B",  # Default to the 2B model
            label="Select Model",
            info="Choose between Daedalus-1-2B (faster) or Daedalus-1-8B (more capable)"
        )

        chatbot = gr.Chatbot(
            height=400,
            placeholder="Start chatting with Daedalus...",
            label="Chat"
        )

        msg = gr.Textbox(
            placeholder="Type your message here...",
            label="Message",
            lines=2
        )

        with gr.Row():
            submit_btn = gr.Button("Send", variant="primary")
            clear_btn = gr.Button("Clear Chat", variant="secondary")

        with gr.Accordion("Advanced Settings", open=False):
            max_length = gr.Slider(
                minimum=200,
                maximum=4096,  # Reduced from 8192 to prevent memory issues
                value=1024,    # Reduced default from 2048
                step=50,
                label="Max New Tokens",
                info="Maximum number of new tokens to generate"
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=2.0,
                value=0.7,
                step=0.1,
                label="Temperature",
                info="Controls randomness in generation"
            )
            top_p = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.9,
                step=0.1,
                label="Top P",
                info="Controls diversity via nucleus sampling"
            )

        def user_message(message, history):
            return "", history + [[message, None]]

        def bot_response(history, selected_model, max_len, temp, top_p):
            if history:
                last_user_msg = history[-1][0]
                bot_message = generate_response(
                    last_user_msg,
                    history[:-1],
                    selected_model,  # Use the model chosen in the dropdown
                    max_len,
                    temp,
                    top_p
                )
                history[-1][1] = bot_message
            return history

        msg.submit(user_message, [msg, chatbot], [msg, chatbot]).then(
            bot_response, [chatbot, model_dropdown, max_length, temperature, top_p], chatbot
        )
        submit_btn.click(user_message, [msg, chatbot], [msg, chatbot]).then(
            bot_response, [chatbot, model_dropdown, max_length, temperature, top_p], chatbot
        )
        clear_btn.click(lambda: None, None, chatbot, queue=False)

        gr.Markdown("""
        ---
        ### About Daedalus Models

        **Daedalus-1-2B:** Faster, lightweight model for quick responses and basic coding tasks.

        **Daedalus-1-8B:** More capable model with advanced reasoning, fine-tuned for structured outputs,
        debugging, and long-context reasoning (up to ~64K tokens).

        Both models are optimized for:
        - Conversational AI
        - Code generation & debugging
        - Structured JSON/function outputs
        - Multi-step reasoning
        """)

    return demo
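
# Note (assumption): ZeroGPU Spaces commonly enable request queuing before launch,
# e.g. demo.queue() ahead of demo.launch(), so concurrent chat requests are
# serialized while a GPU is attached. The app below launches directly.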

# Launch the app
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(share=True)
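
# Assumed runtime dependencies for this Space (versions not pinned here):
# gradio, spaces, transformers, torch, and accelerate (required for the
# device_map="auto" pipeline loading above).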