import spaces
import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
# Load model and tokenizer
model_path = "apple/DiffuCoder-7B-cpGRPO"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
).to(device).eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer.eos_token = "<|im_end|>"
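# DiffuCoder's chat template is ChatML-style, with <|im_end|> closing each
# assistant turn, so we treat that marker as the end-of-sequence token.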
@spaces.GPU
def generate_code(query, temperature=0.4, top_p=0.95, max_new_tokens=256):
    # Format prompt using chat template
    messages = [
        {"role": "system", "content": "You are a helpful coding assistant."},
        {"role": "user", "content": query.strip()}
    ]
    # Apply chat template - this creates the prompt but doesn't include the assistant response
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    # Tokenize only the prompt (without any assistant response)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)
    # Calculate initial prompt length - this is where the assistant response will start
    initial_prompt_len = input_ids.shape[1]
    # Track EOS status
    eos_detected = False
    # Generate with token streaming
    TOKEN_PER_STEP = 1
    steps = min(max_new_tokens // TOKEN_PER_STEP, 512)  # Limit to max 512 steps
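    # One token per call trades throughput for streaming granularity: a
    # diffusion LM can commit several tokens per denoising step, so raising
    # TOKEN_PER_STEP would generate faster but produce chunkier UI updates.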
    # This will accumulate only the assistant's response
    assistant_response = ""
    for i in range(steps):
        if eos_detected:
            break
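        # With steps=1 and max_new_tokens=TOKEN_PER_STEP, each call runs a
        # single denoising pass that commits one token. alg="entropy" (from
        # the model's remote-code generation API) appears to pick which
        # masked position to commit by prediction entropy, lowest first;
        # alg_temp=0. keeps that ordering deterministic.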
        output = model.diffusion_generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=TOKEN_PER_STEP,
            output_history=True,
            return_dict_in_generate=True,
            steps=1,
            temperature=temperature,
            top_p=top_p,
            alg="entropy",
            alg_temp=0.,
        )
        # Get only the new tokens generated in this step
        new_token_ids = output.sequences[0, -TOKEN_PER_STEP:]
        # Check for EOS token in the new tokens
        if tokenizer.eos_token_id in new_token_ids:
            # If EOS is found, stop after this token
            eos_detected = True
            # Remove EOS token from output
            new_token_ids = new_token_ids[new_token_ids != tokenizer.eos_token_id]
            if new_token_ids.numel() == 0:
                # Only EOS was generated, nothing to add
                break
        # Decode only the new tokens
        new_text = tokenizer.decode(
            new_token_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )
        # Update input for next step
        input_ids = output.sequences
        attention_mask = torch.cat([
            attention_mask,
            torch.ones(1, 1, dtype=attention_mask.dtype, device=device)
        ], dim=1)
        # Append to assistant response and yield
        assistant_response += new_text
        # Remove any trailing special tokens
        clean_response = assistant_response.replace('<|dlm_pad|>', '').strip()
        yield clean_response
        if eos_detected:
            break
# Create Gradio interface
demo = gr.Interface(
    fn=generate_code,
    inputs=[
        gr.Textbox(label="Code Request", lines=3,
                   placeholder="Describe the code you want..."),
        gr.Slider(0.1, 1.0, value=0.4, label="Temperature"),
        gr.Slider(0.5, 1.0, value=0.95, label="Top-p"),
        gr.Slider(32, 512, value=256, step=32, label="Max Tokens")
    ],
    outputs=gr.Textbox(label="Generated Code", lines=10),
    title="🧠 DiffuCoder Code Generator",
    description="Generate code with Apple's DiffuCoder-7B model"
)
# Run the demo
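# queue() enables the event queue Gradio uses to stream generator outputs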
if __name__ == "__main__":
    demo.queue().launch()
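
# Quick local sanity check (sketch, bypassing the UI): generate_code is a
# generator that yields the accumulated response so far, so the last value
# it yields is the complete answer. The prompt below is just an example.
#
#   final = ""
#   for partial in generate_code("Write a Python function that reverses a string."):
#       final = partial
#   print(final)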