# DiffuCoder code-generation demo, a Hugging Face Space running on ZeroGPU
# ("Running on Zero" hardware).

import spaces
import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
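
# Note: on ZeroGPU Spaces, `import spaces` should stay above the CUDA-touching
# imports so the library can patch torch's CUDA initialization early.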

# Load model and tokenizer
model_path = "apple/DiffuCoder-7B-cpGRPO"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True  # DiffuCoder ships custom modeling/generation code
).to(device).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer.eos_token = "<|im_end|>"
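# "<|im_end|>" is the ChatML end-of-turn marker this model's chat template
# emits, so treating it as EOS lets the streaming loop detect a finished turn.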

@spaces.GPU  # Request a ZeroGPU slot per call; needed since this Space runs on Zero
def generate_code(query, temperature=0.4, top_p=0.95, max_new_tokens=256):
    # Format the prompt using the chat template
    messages = [
        {"role": "system", "content": "You are a helpful coding assistant."},
        {"role": "user", "content": query.strip()}
    ]
    # Apply the chat template - this renders the prompt but does not include
    # an assistant response
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
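    # Assuming a ChatML-style template (suggested by the <|im_end|> EOS above),
    # the rendered prompt looks roughly like:
    #   <|im_start|>system\nYou are a helpful coding assistant.<|im_end|>\n
    #   <|im_start|>user\n{query}<|im_end|>\n
    #   <|im_start|>assistant\n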
    # Tokenize only the prompt (without any assistant response)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)
    # Initial prompt length - this is where the assistant response will start
    # (currently unused; the loop below decodes each step's new tokens directly)
    initial_prompt_len = input_ids.shape[1]
    # Track EOS status
    eos_detected = False
    # Generate with token-by-token streaming
    TOKEN_PER_STEP = 1
    steps = min(max_new_tokens // TOKEN_PER_STEP, 512)  # Limit to at most 512 steps
    # This will accumulate only the assistant's response
    assistant_response = ""

    for i in range(steps):
        if eos_detected:
            break
        output = model.diffusion_generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=TOKEN_PER_STEP,
            output_history=True,
            return_dict_in_generate=True,
            steps=1,
            temperature=temperature,
            top_p=top_p,
            alg="entropy",
            alg_temp=0.,
        )
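        # diffusion_generate comes from the model's trust_remote_code package.
        # A reading of its arguments (an assumption, not verified against that
        # code): steps=1 runs a single denoising step, enough to commit the one
        # token requested per call; alg="entropy" unmasks the most confident
        # (lowest-entropy) positions first; alg_temp=0. keeps that ordering greedy.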
        # Get only the new tokens generated in this step
        new_token_ids = output.sequences[0, -TOKEN_PER_STEP:]
        # Check for the EOS token among the new tokens
        if tokenizer.eos_token_id in new_token_ids:
            # EOS found, so stop after this token
            eos_detected = True
            # Remove the EOS token from the output
            new_token_ids = new_token_ids[new_token_ids != tokenizer.eos_token_id]
            if new_token_ids.numel() == 0:
                # Only EOS was generated, nothing to add
                break
        # Decode only the new tokens
        new_text = tokenizer.decode(
            new_token_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )
        # Feed the extended sequence back in as input for the next step
        input_ids = output.sequences
        attention_mask = torch.cat([
            attention_mask,
            torch.ones(1, 1, dtype=attention_mask.dtype, device=device)
        ], dim=1)
        # Append to the assistant response and yield the running text
        assistant_response += new_text
        # Strip the model's pad token and surrounding whitespace before yielding
        clean_response = assistant_response.replace('<|dlm_pad|>', '').strip()
        yield clean_response
        if eos_detected:
            break
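
# generate_code is a generator: yielding the cumulative response each step is
# what lets Gradio stream partial output into the textbox while decoding runs.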

# Create the Gradio interface
demo = gr.Interface(
    fn=generate_code,
    inputs=[
        gr.Textbox(label="Code Request", lines=3,
                   placeholder="Describe the code you want..."),
        gr.Slider(0.1, 1.0, value=0.4, label="Temperature"),
        gr.Slider(0.5, 1.0, value=0.95, label="Top-p"),
        gr.Slider(32, 512, value=256, step=32, label="Max Tokens")
    ],
    outputs=gr.Textbox(label="Generated Code", lines=10),
    title="🧠 DiffuCoder Code Generator",
    description="Generate code with Apple's DiffuCoder-7B model"
)

# Run the demo
if __name__ == "__main__":
    # queue() enables request queuing, which Gradio needs to stream generator output
    demo.queue().launch()
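
# To try this outside Spaces (a sketch of the setup, not part of the original
# Space), install the dependencies and run the script directly:
#   pip install spaces gradio torch transformers
#   python app.py
# The spaces package is designed to be inert off-Spaces, so @spaces.GPU should
# be a harmless no-op when running locally on your own GPU.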