import spaces
import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer

# Load model and tokenizer
model_path = "apple/DiffuCoder-7B-cpGRPO"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
).to(device).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer.eos_token = "<|im_end|>"


@spaces.GPU
def generate_code(query, temperature=0.4, top_p=0.95, max_new_tokens=256):
    # Format the prompt using the model's chat template
    messages = [
        {"role": "system", "content": "You are a helpful coding assistant."},
        {"role": "user", "content": query.strip()},
    ]
    # apply_chat_template builds the prompt up to the assistant turn marker;
    # it contains no assistant response yet
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Tokenize only the prompt
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    # Generate TOKEN_PER_STEP tokens per diffusion call so the response
    # can be streamed back to the UI incrementally
    TOKEN_PER_STEP = 1
    steps = min(max_new_tokens // TOKEN_PER_STEP, 512)  # cap at 512 steps

    # Accumulates only the assistant's response, never the prompt
    assistant_response = ""
    eos_detected = False

    for _ in range(steps):
        output = model.diffusion_generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=TOKEN_PER_STEP,
            output_history=True,
            return_dict_in_generate=True,
            steps=1,
            temperature=temperature,
            top_p=top_p,
            alg="entropy",
            alg_temp=0.,
        )

        # Take only the tokens produced in this step
        new_token_ids = output.sequences[0, -TOKEN_PER_STEP:]

        # If EOS appears, strip it and stop after this step
        if tokenizer.eos_token_id in new_token_ids:
            eos_detected = True
            new_token_ids = new_token_ids[new_token_ids != tokenizer.eos_token_id]
            if new_token_ids.numel() == 0:
                break  # only EOS was generated; nothing to add

        # Decode only the new tokens
        new_text = tokenizer.decode(
            new_token_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )

        # Feed the extended sequence back in for the next step
        input_ids = output.sequences
        attention_mask = torch.cat([
            attention_mask,
            torch.ones(1, TOKEN_PER_STEP, dtype=attention_mask.dtype, device=device),
        ], dim=1)

        # Append to the assistant response, strip diffusion padding, and yield
        assistant_response += new_text
        clean_response = assistant_response.replace("<|dlm_pad|>", "").strip()
        yield clean_response

        if eos_detected:
            break


# Create the Gradio interface
demo = gr.Interface(
    fn=generate_code,
    inputs=[
        gr.Textbox(label="Code Request", lines=3, placeholder="Describe the code you want..."),
        gr.Slider(0.1, 1.0, value=0.4, label="Temperature"),
        gr.Slider(0.5, 1.0, value=0.95, label="Top-p"),
        gr.Slider(32, 512, value=256, step=32, label="Max Tokens"),
    ],
    outputs=gr.Textbox(label="Generated Code", lines=10),
    title="🧠 DiffuCoder Code Generator",
    description="Generate code with Apple's DiffuCoder-7B model",
)

# Run the demo; queue() enables streaming from the generator function
if __name__ == "__main__":
    demo.queue().launch()
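
# ---------------------------------------------------------------------------
# A minimal sketch of exercising generate_code without the Gradio UI, shown
# here as an illustrative assumption rather than part of the original demo.
# Since generate_code is a Python generator that yields the accumulated
# response after each token, draining it and printing the last value gives
# the complete completion:
#
#   response = ""
#   for response in generate_code("Write a function that reverses a string."):
#       pass
#   print(response)
# ---------------------------------------------------------------------------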