import gradio as gr
import spaces
from transformers import pipeline, AutoTokenizer
import torch
from typing import List, Dict

# Global caches for loaded pipelines and tokenizers, keyed by model ID
model_cache = {}
tokenizer_cache = {}

# Available models
AVAILABLE_MODELS = {
    "Daedalus-1-2B": "NoemaResearch/Daedalus-1-2B",
    "Daedalus-1-8B": "NoemaResearch/Daedalus-1-8B",
}

# Models that need special token handling for repetition issues
MODELS_NEEDING_SPECIAL_HANDLING = {"Daedalus-1-8B"}
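# The 8B checkpoint tends to run past its turn and repeat role markers, so
# generate_response pins explicit token IDs, uses a stronger repetition
# penalty, and trims its output against a list of stop strings.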

@spaces.GPU
def initialize_model(model_name):
    global model_cache, tokenizer_cache
    
    if model_name not in AVAILABLE_MODELS:
        raise ValueError(f"Model {model_name} not found in available models")
    
    model_id = AVAILABLE_MODELS[model_name]
    
    # Check if model is already cached
    if model_id not in model_cache:
        try:
            # Load tokenizer separately to handle chat template properly
            tokenizer_cache[model_id] = AutoTokenizer.from_pretrained(
                model_id, 
                trust_remote_code=True
            )
            
            model_cache[model_id] = pipeline(
                "text-generation", 
                model=model_id,
                tokenizer=tokenizer_cache[model_id],
                torch_dtype=torch.float16,
                device_map="auto",
                trust_remote_code=True
            )
        except Exception:
            # Fallback to CPU if GPU fails
            tokenizer_cache[model_id] = AutoTokenizer.from_pretrained(
                model_id, 
                trust_remote_code=True
            )
            
            model_cache[model_id] = pipeline(
                "text-generation", 
                model=model_id,
                tokenizer=tokenizer_cache[model_id],
                torch_dtype=torch.float32,
                device_map="cpu",
                trust_remote_code=True
            )
    
    return model_cache[model_id], tokenizer_cache[model_id]
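
# Illustrative (hypothetical) call pattern: the first call downloads and
# caches the pipeline; subsequent calls for the same model are cache hits.
#   pipe, tok = initialize_model("Daedalus-1-2B")   # loads and caches
#   pipe2, _  = initialize_model("Daedalus-1-2B")   # returns the cached pipeline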

def format_conversation_with_template(messages: List[Dict], tokenizer) -> str:
    """Apply the tokenizer's chat template when available; otherwise fall back
    to manual formatting with the model's special tokens."""
    
    # Get the chat template
    if hasattr(tokenizer, 'chat_template') and tokenizer.chat_template:
        try:
            # Use the tokenizer's apply_chat_template method
            formatted = tokenizer.apply_chat_template(
                messages, 
                tokenize=False, 
                add_generation_prompt=True
            )
            return formatted
        except Exception as e:
            print(f"Chat template application failed: {e}")
            # Fall back to manual formatting
            pass
    
    # Manual fallback formatting using actual special tokens
    bos_token = "<[begin▁of▁sentence]>"
    eos_token = "<[end▁of▁sentence]>"
    
    # Start with system message
    formatted = f"{bos_token}system\nYou are an AI Coding model called Daedalus, developed by Noema Research{eos_token}"
    
    # Add each message
    for msg in messages:
        role = msg.get('role', 'user')
        content = msg.get('content', '').strip()
        formatted += f"{bos_token}{role}\n{content}{eos_token}"
    
    # Add generation prompt
    formatted += f"{bos_token}assistant\n"
    
    return formatted
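
# For illustration, the manual fallback above renders a single user turn as
# (segments shown on separate lines here; the actual string is concatenated):
#   <[begin▁of▁sentence]>system\nYou are an AI Coding model called Daedalus, developed by Noema Research<[end▁of▁sentence]>
#   <[begin▁of▁sentence]>user\nHello<[end▁of▁sentence]>
#   <[begin▁of▁sentence]>assistant\n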

@spaces.GPU
def generate_response(message, history, model_name, max_length=512, temperature=0.7, top_p=0.9):
    """Generate response using the selected model"""
    
    try:
        model_pipe, tokenizer = initialize_model(model_name)
    except Exception as e:
        return f"Error loading model {model_name}: {str(e)}"
    
    # Format the conversation history
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    
    messages.append({"role": "user", "content": message})
    
    try:
        # Format the conversation using the chat template
        formatted_prompt = format_conversation_with_template(messages, tokenizer)
        
        # Different generation parameters based on model
        if model_name in MODELS_NEEDING_SPECIAL_HANDLING:
            # 8B model needs special token handling to prevent repetition
            stop_tokens = [
                "<[end▁of▁sentence]>",  # EOS token
                "<[begin▁of▁sentence]>",  # BOS token (shouldn't appear mid-generation)
                "user\n",  # Stop if model tries to continue conversation
                "system\n",  # Stop if model tries to add system messages
                "\nuser",  # Alternative format
                "\nsystem"  # Alternative format
            ]
            
            response = model_pipe(
                formatted_prompt,
                max_new_tokens=max_length,
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                pad_token_id=1,  # PAD token ID from config
                eos_token_id=2,  # EOS token ID from config
                bos_token_id=0,  # BOS token ID from config
                return_full_text=False,
                repetition_penalty=1.1,  # Reduce loops
                stop_sequence=stop_tokens[0]  # primary stop string; the pipeline only honors the first token of a multi-token stop sequence
            )
        else:
            # 2B model - standard generation without special handling
            response = model_pipe(
                formatted_prompt,
                max_new_tokens=max_length,
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                return_full_text=False,
                repetition_penalty=1.05  # Light repetition penalty
            )
        
        if isinstance(response, list) and len(response) > 0:
            generated_text = response[0]['generated_text']
        else:
            generated_text = str(response)
        
        # Clean up the response
        assistant_response = str(generated_text).strip()
        
        # Apply different cleanup based on model
        if model_name in MODELS_NEEDING_SPECIAL_HANDLING:
            # More aggressive cleanup for 8B model
            stop_tokens = [
                "<[end▁of▁sentence]>", "<[begin▁of▁sentence]>",
                "user\n", "system\n", "\nuser", "\nsystem"
            ]
            
            for stop_token in stop_tokens:
                if stop_token in assistant_response:
                    assistant_response = assistant_response.split(stop_token)[0].strip()
            
            # Additional cleanup for common repetition patterns
            lines = assistant_response.split('\n')
            cleaned_lines = []
            for line in lines:
                if line.strip() and not line.strip().startswith(('user', 'assistant', 'system')):
                    cleaned_lines.append(line)
            assistant_response = '\n'.join(cleaned_lines).strip()
        else:
            # Standard cleanup for 2B model
            if assistant_response.startswith("assistant\n"):
                assistant_response = assistant_response[len("assistant\n"):].strip()
        
        return assistant_response if assistant_response else "I apologize, but I couldn't generate a proper response. Please try again."
        
    except Exception as e:
        return f"Error generating response: {str(e)}"

def create_interface():
    with gr.Blocks(title="Daedalus Chat", theme=gr.themes.Base(primary_hue="green")) as demo:
        gr.Markdown("""
        # 🟢 Daedalus Chat Interface
        
        Chat with **Daedalus models** by Noema Research.
        """)
        
        # Model selection dropdown
        model_dropdown = gr.Dropdown(
            choices=list(AVAILABLE_MODELS.keys()),
            value="Daedalus-1-2B",  # Default to 2B model
            label="Select Model",
            info="Choose between Daedalus-1-2B (faster) or Daedalus-1-8B (more capable)"
        )
        
        chatbot = gr.Chatbot(
            height=400,
            placeholder="Start chatting with Daedalus...",
            label="Chat"
        )
        
        msg = gr.Textbox(
            placeholder="Type your message here...",
            label="Message",
            lines=2
        )
        
        with gr.Row():
            submit_btn = gr.Button("Send", variant="primary")
            clear_btn = gr.Button("Clear Chat", variant="secondary")
        
        with gr.Accordion("Advanced Settings", open=False):
            max_length = gr.Slider(
                minimum=200,
                maximum=4096,  # capped to keep memory use manageable
                value=1024,
                step=50,
                label="Max New Tokens",
                info="Maximum number of new tokens to generate"
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=2.0,
                value=0.7,
                step=0.1,
                label="Temperature",
                info="Controls randomness in generation"
            )
            top_p = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.9,
                step=0.1,
                label="Top P",
                info="Controls diversity via nucleus sampling"
            )
        
        def user_message(message, history):
            return "", history + [[message, None]]
        
        def bot_response(history, selected_model, max_len, temp, top_p):
            if history:
                user_message = history[-1][0]
                bot_message = generate_response(
                    user_message, 
                    history[:-1], 
                    selected_model,  # Use selected model
                    max_len, 
                    temp, 
                    top_p
                )
                history[-1][1] = bot_message
            return history
        
        msg.submit(user_message, [msg, chatbot], [msg, chatbot]).then(
            bot_response, [chatbot, model_dropdown, max_length, temperature, top_p], chatbot
        )
        
        submit_btn.click(user_message, [msg, chatbot], [msg, chatbot]).then(
            bot_response, [chatbot, model_dropdown, max_length, temperature, top_p], chatbot
        )
        
        clear_btn.click(lambda: None, None, chatbot, queue=False)
        
        gr.Markdown("""
        ---
        
        ### About Daedalus Models
        
        **Daedalus-1-2B:** Faster, lightweight model for quick responses and basic coding tasks.
        
        **Daedalus-1-8B:** More capable model with advanced reasoning, fine-tuned for structured outputs, 
        debugging, and long-context reasoning (up to ~64K tokens).
        
        Both models are optimized for:
        - Conversational AI
        - Code generation & debugging
        - Structured JSON/function outputs
        - Multi-step reasoning
        """)
    
    return demo

# Launch the app
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(share=True)