DiffuCoder / app.py
mrfakename's picture
Update app.py
4cc5c27 verified
import spaces
import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
# Checkpoint to load, and the device it runs on (GPU when torch reports one).
model_path = "apple/DiffuCoder-7B-cpGRPO"
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the weights in bfloat16, move them to the chosen device and switch the
# module to eval mode. trust_remote_code=True lets the checkpoint's own
# modelling code be executed.
model = AutoModel.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
model = model.to(device)
model = model.eval()

# The matching tokenizer; "<|im_end|>" is registered as the EOS token, which
# is the token id generate_code watches for to stop streaming.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer.eos_token = "<|im_end|>"
@spaces.GPU
def generate_code(query, temperature=0.4, top_p=0.95, max_new_tokens=256):
    """Stream the assistant's answer to ``query``, growing it token by token.

    The query is wrapped in a system + user chat template, then the model's
    ``diffusion_generate`` is called repeatedly, each call extending the
    sequence by ``TOKEN_PER_STEP`` tokens. After every call the text generated
    so far is yielded, so a streaming UI can refresh its output.

    Args:
        query: The user's request; surrounding whitespace is stripped.
        temperature: Sampling temperature forwarded to ``diffusion_generate``.
        top_p: Nucleus-sampling threshold forwarded to ``diffusion_generate``.
        max_new_tokens: Upper bound on generated tokens. It is coerced to
            ``int`` and the number of generation calls is capped at 512.

    Yields:
        The cumulative response text so far, with ``<|dlm_pad|>`` markers
        removed and surrounding whitespace stripped. Nothing is yielded for a
        step that produced only the EOS token.
    """
    messages = [
        {"role": "system", "content": "You are a helpful coding assistant."},
        {"role": "user", "content": query.strip()},
    ]
    # add_generation_prompt=True ends the prompt where the assistant's turn
    # begins, so everything generated after it is the assistant's response.
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    # Index of the first generated token; used to slice the response out of
    # the full sequence on every step.
    initial_prompt_len = input_ids.shape[1]

    TOKEN_PER_STEP = 1
    # int() guards against a UI slider delivering the value as a float,
    # which range() would reject.
    steps = min(int(max_new_tokens) // TOKEN_PER_STEP, 512)
    eos_token_id = tokenizer.eos_token_id

    for _ in range(steps):
        output = model.diffusion_generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=TOKEN_PER_STEP,
            output_history=True,
            return_dict_in_generate=True,
            steps=1,
            temperature=temperature,
            top_p=top_p,
            alg="entropy",
            alg_temp=0.,
        )
        # Assumes sequences is the prompt followed by every token generated so
        # far, i.e. each call appends exactly TOKEN_PER_STEP tokens.
        sequences = output.sequences
        new_token_ids = sequences[0, -TOKEN_PER_STEP:]

        eos_detected = eos_token_id in new_token_ids
        if eos_detected and bool((new_token_ids == eos_token_id).all()):
            # This step produced nothing but EOS: there is no new text to show.
            break

        generated_ids = sequences[0, initial_prompt_len:]
        if eos_detected:
            # Drop the EOS marker(s) so they never reach the decoded text.
            generated_ids = generated_ids[generated_ids != eos_token_id]

        # Decode the whole response each step rather than concatenating
        # per-token decodes: decoding a token in isolation can corrupt a
        # character that the tokenizer split across several tokens.
        assistant_response = tokenizer.decode(
            generated_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        yield assistant_response.replace('<|dlm_pad|>', '').strip()

        if eos_detected:
            break

        # Feed the extended sequence back in, growing the mask by the same
        # number of columns as tokens were added.
        input_ids = sequences
        attention_mask = torch.cat([
            attention_mask,
            torch.ones(1, TOKEN_PER_STEP, dtype=attention_mask.dtype, device=device),
        ], dim=1)
def _build_interface():
    """Assemble the Gradio UI that wraps generate_code."""
    request_box = gr.Textbox(
        label="Code Request",
        lines=3,
        placeholder="Describe the code you want...",
    )
    temperature_slider = gr.Slider(0.1, 1.0, value=0.4, label="Temperature")
    top_p_slider = gr.Slider(0.5, 1.0, value=0.95, label="Top-p")
    max_tokens_slider = gr.Slider(32, 512, value=256, step=32, label="Max Tokens")
    result_box = gr.Textbox(label="Generated Code", lines=10)

    # Input order must match generate_code's positional parameters:
    # query, temperature, top_p, max_new_tokens.
    return gr.Interface(
        fn=generate_code,
        inputs=[request_box, temperature_slider, top_p_slider, max_tokens_slider],
        outputs=result_box,
        title="🧠 DiffuCoder Code Generator",
        description="Generate code with Apple's DiffuCoder-7B model",
    )


# The interface object is built at import time and exposed as `demo`.
demo = _build_interface()

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.queue().launch()