import spaces
import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer

# Load model and tokenizer
model_path = "apple/DiffuCoder-7B-cpGRPO"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
).to(device).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer.eos_token = "<|im_end|>"


@spaces.GPU
def generate_code(query, temperature=0.4, top_p=0.95, max_new_tokens=256):
    # Format the prompt using the model's chat template
    messages = [
        {"role": "system", "content": "You are a helpful coding assistant."},
        {"role": "user", "content": query.strip()},
    ]
    # apply_chat_template builds the prompt up to the assistant turn marker;
    # it contains no assistant response yet
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Tokenize only the prompt
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    # Generate TOKEN_PER_STEP tokens per diffusion call so the response
    # can be streamed back to the UI incrementally
    TOKEN_PER_STEP = 1
    steps = min(max_new_tokens // TOKEN_PER_STEP, 512)  # cap at 512 steps

    # Accumulates only the assistant's response, never the prompt
    assistant_response = ""
    eos_detected = False

    for _ in range(steps):
        output = model.diffusion_generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=TOKEN_PER_STEP,
            output_history=True,
            return_dict_in_generate=True,
            steps=1,
            temperature=temperature,
            top_p=top_p,
            alg="entropy",
            alg_temp=0.,
        )

        # Take only the tokens produced in this step
        new_token_ids = output.sequences[0, -TOKEN_PER_STEP:]

        # If EOS appears, strip it and stop after this step
        if tokenizer.eos_token_id in new_token_ids:
            eos_detected = True
            new_token_ids = new_token_ids[new_token_ids != tokenizer.eos_token_id]
            if new_token_ids.numel() == 0:
                break  # only EOS was generated; nothing to add

        # Decode only the new tokens
        new_text = tokenizer.decode(
            new_token_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )

        # Feed the extended sequence back in for the next step
        input_ids = output.sequences
        attention_mask = torch.cat([
            attention_mask,
            torch.ones(1, TOKEN_PER_STEP, dtype=attention_mask.dtype, device=device),
        ], dim=1)

        # Append to the assistant response, strip diffusion padding, and yield
        assistant_response += new_text
        clean_response = assistant_response.replace("<|dlm_pad|>", "").strip()
        yield clean_response

        if eos_detected:
            break


# Create the Gradio interface
demo = gr.Interface(
    fn=generate_code,
    inputs=[
        gr.Textbox(label="Code Request", lines=3, placeholder="Describe the code you want..."),
        gr.Slider(0.1, 1.0, value=0.4, label="Temperature"),
        gr.Slider(0.5, 1.0, value=0.95, label="Top-p"),
        gr.Slider(32, 512, value=256, step=32, label="Max Tokens"),
    ],
    outputs=gr.Textbox(label="Generated Code", lines=10),
    title="🧠 DiffuCoder Code Generator",
    description="Generate code with Apple's DiffuCoder-7B model",
)

# Run the demo; queue() enables streaming from the generator function
if __name__ == "__main__":
    demo.queue().launch()
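
# ---------------------------------------------------------------------------
# A minimal sketch of exercising generate_code without the Gradio UI, shown
# here as an illustrative assumption rather than part of the original demo.
# Since generate_code is a Python generator that yields the accumulated
# response after each token, draining it and printing the last value gives
# the complete completion:
#
#   response = ""
#   for response in generate_code("Write a function that reverses a string."):
#       pass
#   print(response)
# ---------------------------------------------------------------------------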