Spaces:
Running
on
Zero
Running
on
Zero
File size: 9,334 Bytes
7252f98 9aaa660 31e34c4 9aaa660 31e34c4 42ed840 332db3a 31e34c4 332db3a bd9baef 332db3a 7252f98 332db3a 7252f98 92e70ff a8d72d4 7252f98 a8d72d4 0e840df a8d72d4 db84545 b5f844d dc427d9 b3de773 dc427d9 2ba8b3f 13b1370 7252f98 b1cf46e 4152853 b1cf46e fb56411 31e34c4 fb56411 31e34c4 fb56411 31e34c4 fb56411 a8d72d4 db84545 31e34c4 fb56411 a8d72d4 3f5293d fb56411 3f5293d b1cf46e 7252f98 2736195 fb56411 2736195 3f5293d fb56411 3f5293d fb56411 31e34c4 f86092a fb56411 31e34c4 fc90b53 7252f98 fb56411 31e34c4 12738e5 7252f98 a8d72d4 8e98890 a494446 31e34c4 fb56411 31e34c4 8cb5f7a d29da35 fb56411 8e98890 31e34c4 fb56411 8e98890 31e34c4 86c363a 7252f98 fb56411 d86917b fb56411 d86917b fb56411 d86917b fb56411 d86917b 3f5293d 31e34c4 3f5293d 31e34c4 ec83427 31e34c4 ec83427 31e34c4 20ff8b2 3f5293d 55b43fa 20ff8b2 3f5293d 7065c9f db84545 a8d72d4 db84545 3125ce6 8cb5f7a 31e34c4 db84545 3125ce6 3f5293d 3f7f1a0 f7efac8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 |
import gradio as gr
import torch
import numpy as np
import json
import time
from transformers import AutoTokenizer
import os
import importlib
import os
from huggingface_hub import hf_hub_download
import spaces
from dotenv import load_dotenv
from infer import (
load_trained_model,
find_answer_start,
get_noising_schedule,
noisify_answer,
generate_diffusion_text,
filter_logits,
confidence_guided_noising,
noisify_answer_without_remasking
)
from models import CustomTransformerModel
from model_config import CustomTransformerConfig
# When HF_TOKEN is not already present in the process environment (e.g. a
# local run), give python-dotenv a chance to supply it from a .env file.
if "HF_TOKEN" not in os.environ:
    load_dotenv()

# The token is mandatory: stop at import time if it is still missing.
hf_token = os.environ.get("HF_TOKEN")
if hf_token is None:
    raise ValueError("HF_TOKEN is not set")

# Module-level NumPy random generator (seeded from OS entropy).
rng = np.random.default_rng()
@spaces.GPU
def generate_diffusion_text(input_ids, top_p, top_k, eos_bias=0.0):
    """Run one forward pass of the model and sample one token per position.

    Args:
        input_ids: Flat list of token ids for a single sequence; it is wrapped
            into a batch of one before being sent to the model.
        top_p: Forwarded to ``filter_logits`` as its ``top_k`` argument.
        top_k: Forwarded to ``filter_logits`` as its ``top_p`` argument.
        eos_bias: Value added to the EOS-token logit at every position.
            ``0.0`` leaves the logits untouched.

    Returns:
        A ``(sampled, conf)`` pair: ``sampled`` is a list with one sampled
        token id per position, ``conf`` is a NumPy array holding the
        (clamped) probability of each sampled token.

    NOTE: the ``top_p``/``top_k`` names are crossed when calling
    ``filter_logits``. Judging by the slider ranges wired to this function's
    caller in this file, ``top_p`` carries the integer cut-off and ``top_k``
    the 0..1 fraction, so the crossing hands each value to the matching
    filter argument. Do not "fix" one side without the other.
    """
    with torch.no_grad():
        batch = torch.tensor([input_ids], dtype=torch.long).to(model.device)
        with torch.cuda.amp.autocast(dtype=torch.float16):
            logits = model(input_ids=batch)["logits"]

        # Shift the EOS logit at every position of the single batch entry.
        if eos_bias != 0.0:
            logits[0, :, eos_token_id] += eos_bias

        logits = filter_logits(logits, top_k=top_p, top_p=top_k)
        logits = logits.clamp(min=-1e8, max=1e4)

        # Per-position distribution for the single batch entry. The lower
        # clamp keeps every weight strictly positive for multinomial.
        probs = torch.nn.functional.softmax(logits, dim=-1)[0].clamp(min=1e-8, max=1.0)
        assert torch.isfinite(probs).all(), "Non-finite values in probs!"
        assert (probs >= 0).all(), "Negative probs!"

        # One draw per row; `picked` keeps the trailing sample dimension so it
        # can be used directly as a gather index.
        picked = torch.multinomial(probs, num_samples=1)
        sampled = picked.squeeze(-1).tolist()

        # Probability assigned to each token that was actually drawn.
        conf = probs.gather(-1, picked).squeeze(-1).cpu().numpy()
    return sampled, conf
def format_chat_prompt(question):
    """Wrap *question* in the chat template used for prompting the model.

    The template consists of a begin-of-text marker, a system turn with a
    fixed instruction, a user turn containing *question*, and an opened
    assistant turn. Every line, including the last, ends with a newline.
    """
    template_lines = [
        "<|begin_of_text|>",
        "<|start_header_id|>system<|end_header_id|>",
        "You are a helpful assistant.",
        "<|start_header_id|>user<|end_header_id|>",
        f"{question}",
        "<|start_header_id|>assistant<|end_header_id|>",
    ]
    return "\n".join(template_lines) + "\n"
def render_html(label, text):
    """Return an HTML snippet: a bold *label*, a line break, then *text*.

    *text* is placed inside a ``<div>`` that preserves whitespace and line
    breaks. Neither argument is escaped, so *text* may itself carry markup.
    """
    heading = f"<b>{label}</b><br>"
    body = f"<div style='white-space: pre-wrap; line-height:1.8'>{text}</div>"
    return heading + body
def highlight_tokens(token_ids, answer_start, changed_indices, color):
    """Render *token_ids* as text, colouring the positions that changed.

    Args:
        token_ids: Token ids to display.
        answer_start: Offset added to each local position before it is looked
            up in *changed_indices*.
        changed_indices: Collection of positions (local index plus
            *answer_start*) whose tokens get wrapped in a coloured span.
        color: CSS colour value used for the span.

    Returns:
        The concatenated token strings, with EOS tokens left out.

    NOTE(review): token text is inserted into the markup without HTML
    escaping; confirm this is acceptable for the tokenizer's vocabulary.
    """
    def _markup(offset, token):
        piece = tokenizer.convert_tokens_to_string([token])
        if (answer_start + offset) not in changed_indices:
            return piece
        return f'<span style="color:{color}">{piece}</span>'

    # EOS tokens are dropped from the display but still occupy a position,
    # so the enumerate offset is taken before filtering.
    return "".join(
        _markup(offset, token)
        for offset, token in enumerate(tokenizer.convert_ids_to_tokens(token_ids))
        if tokenizer.convert_tokens_to_ids(token) != eos_token_id
    )
def diffusion_chat(question, max_it, pause_length, eos_bias, sharpness,
                   clustering, noise_start, use_confidence_noising,
                   use_permanent_unmasking, noise_clipping, top_p,
                   top_k):
    """Generate an answer by iterative denoising, yielding HTML after each step.

    This is a generator: every ``yield`` is an HTML string produced by
    ``render_html`` showing the current state of the answer region.

    Args:
        question: User question; a blank question is replaced by a default.
        max_it: Maximum number of generate/noise iterations.
        pause_length: Seconds to sleep after the initial-noise frame and after
            each "after generation" frame.
        eos_bias: Negated before being passed on as the EOS logit bias.
        sharpness: Passed to ``get_noising_schedule``.
        clustering: Passed to ``noisify_answer``.
        noise_start: Passed to the noising helpers inside the loop.
        use_confidence_noising: Select ``confidence_guided_noising``; takes
            precedence over *use_permanent_unmasking*.
        use_permanent_unmasking: Select ``noisify_answer_without_remasking``.
        noise_clipping: Passed to ``confidence_guided_noising``.
        top_p: Forwarded unchanged to ``generate_diffusion_text``.
        top_k: Forwarded unchanged to ``generate_diffusion_text``.

    Yields:
        HTML strings: the initial noised state, one or two frames per
        iteration, an optional "Stopped early" notice, and the final output.
    """
    from collections import deque

    eos_bias = -eos_bias
    if not question.strip():
        question = "What do you know about the city of Amsterdam?"

    prompt = format_chat_prompt(question)
    input_ids = tokenizer.encode(prompt, add_special_tokens=False)
    answer_start = find_answer_start(input_ids, assistant_marker_ids)
    if answer_start is None:
        yield render_html("Error", "Could not find Assistant marker in input.")
        return

    # Pad with mask tokens (or truncate) to a fixed working length.
    seq_len = 256
    input_ids = (input_ids + [mask_token_id] * (seq_len - len(input_ids)))[:seq_len]
    ori_input_tokens = input_ids

    # Initial noising
    current_tokens, just_noised_indices = noisify_answer(
        input_ids, answer_start, tokenizer, threshold=1.0, clustering=clustering, noise_start=1.0
    )
    yield render_html(
        "Iteration 0 (initial noise)",
        highlight_tokens(current_tokens[answer_start:], answer_start, just_noised_indices, color="red"),
    )
    time.sleep(pause_length)

    # Sliding window over the three most recent outputs, for early stopping.
    recent_outputs = deque(maxlen=3)
    prev_decoded = []
    unmasked_mask = [False] * len(current_tokens)
    # Tracked explicitly so the final summary is well-defined even when the
    # loop body never runs (max_it == 0).
    iterations_run = 0

    for i in range(max_it):
        iterations_run = i + 1
        generated_tokens, confidences = generate_diffusion_text(
            current_tokens, top_p, top_k, eos_bias=eos_bias
        )
        # The prompt part is always restored from the original input.
        current_tokens = ori_input_tokens[:answer_start] + generated_tokens[answer_start:]

        # GREEN highlighting: compare to previous tokens
        new_decoded = tokenizer.convert_ids_to_tokens(current_tokens[answer_start:])
        diff_indices = {
            answer_start + j for j, tok in enumerate(new_decoded)
            if j >= len(prev_decoded) or tok != prev_decoded[j]
        }
        prev_decoded = new_decoded

        yield render_html(
            f"Iteration {i+1}/{max_it} (after generation)",
            highlight_tokens(current_tokens[answer_start:], answer_start, diff_indices, color="green"),
        )
        time.sleep(pause_length)

        # Early stopping: three identical consecutive outputs.
        recent_outputs.append(current_tokens)
        if (len(recent_outputs) == 3
                and recent_outputs[0] == recent_outputs[1] == recent_outputs[2]):
            yield render_html("Stopped early", f"After {i+1} iterations.")
            break

        # NOISING (skipped after the last iteration so the final answer
        # is the model's un-noised output)
        if i < max_it - 1:
            threshold = get_noising_schedule(i, max_it, sharpness=sharpness)
            if use_confidence_noising:
                noised_answer, just_noised_indices = confidence_guided_noising(
                    current_tokens, answer_start, tokenizer, confidences, noise_clipping,
                    threshold=threshold, noise_start=noise_start
                )
            elif use_permanent_unmasking:
                noised_answer, just_noised_indices = noisify_answer_without_remasking(
                    current_tokens, answer_start, tokenizer, threshold=threshold,
                    noise_start=noise_start, unmasked_mask=unmasked_mask
                )
            else:
                noised_answer, just_noised_indices = noisify_answer(
                    current_tokens, answer_start, tokenizer,
                    threshold=threshold, clustering=clustering, noise_start=noise_start
                )

            # Record every answer position that was left unmasked this round.
            for idx in range(answer_start, len(current_tokens)):
                if noised_answer[idx] != mask_token_id:
                    unmasked_mask[idx] = True

            yield render_html(
                f"Iteration {i+1}/{max_it} (before noising)",
                highlight_tokens(current_tokens[answer_start:], answer_start, just_noised_indices, color="red"),
            )
            current_tokens = ori_input_tokens[:answer_start] + noised_answer[answer_start:]

    # Final output: cut the answer at the first EOS token, if there is one.
    answer_ids = current_tokens[answer_start:]
    try:
        final_ids = answer_ids[:answer_ids.index(eos_token_id)]
    except ValueError:
        final_ids = answer_ids
    final_output = tokenizer.decode(final_ids, skip_special_tokens=True)
    yield render_html(
        f"Final Output ({len(final_ids)} tokens after {iterations_run} iterations)",
        final_output,
    )
def is_running_on_spaces():
    """Return True when the ``SPACE_ID`` environment variable is set."""
    return "SPACE_ID" in os.environ
print("Loading model...")

# Pick the checkpoint location: a file in the working directory for local
# runs, otherwise a download from the Hugging Face Hub.
if not is_running_on_spaces():
    ckpt_path = "diffusion-model-8B.pth"  # change to your actual local path
else:
    ckpt_path = hf_hub_download(
        repo_id="ruurd/tini_model",
        filename="diffusion-model-8B.pth",
        token=os.getenv("HF_TOKEN"),
    )

model, tokenizer = load_trained_model(checkpoint_path=ckpt_path)
print("✅ Model loaded.")

# Token-level constants shared by the functions in this file.
vocab_size = len(tokenizer)
eos_token_id = tokenizer.eos_token_id
# The mask token is the first id produced by encoding the literal text 'MASK'.
mask_token_id = tokenizer.encode('MASK', add_special_tokens=False)[0]
# Id sequence of the assistant header, handed to find_answer_start.
assistant_marker_ids = tokenizer.encode(
    "<|start_header_id|>assistant<|end_header_id|>", add_special_tokens=False
)
# Gradio front-end. The inputs are passed positionally to diffusion_chat, so
# their order must match that function's parameter order.
demo = gr.Interface(
    fn=diffusion_chat,
    inputs=[
        gr.Textbox(label="User Question", lines=2, placeholder="What do you know about the city of Amsterdam?"),
        gr.Slider(1, 512, value=64, step=1, label="Number of iterations: ↑ = more iterations"),
        gr.Slider(0.01, 5, value=0.01, step=0.01, label="Pause between iteration ↑ = longer pause"),
        gr.Slider(-5.0, 5.0, value=0.0, step=0.1, label="Generation length: ↑ = more output tokens by decreasing eos token probability"),
        gr.Slider(1.0, 20.0, value=1.0, step=0.5, label="Noise decay sharpness: ↓ = more noise in later iterations"),
        gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Clustering: ↑ = more clustered noising"),
        gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="Noise start fraction: ↑ = more noise"),
        gr.Checkbox(value=False, label="Use confidence-guided noising"),
        gr.Checkbox(value=False, label="Use permanent unmasking"),
        gr.Slider(0.01, 1.0, value=0.01, step=0.01, label="Noise clipping: ↓ = more confidence guidance"),
        # This integer slider arrives in diffusion_chat's `top_p` parameter,
        # but generate_diffusion_text forwards that value to filter_logits as
        # `top_k`, so it is labelled by what it actually controls.
        gr.Slider(1, 1000, value=3, step=1, label="Top-k: ↑ = more random answers"),
        # Likewise, this 0..1 slider arrives as `top_k` and ends up as the
        # `top_p` argument of filter_logits.
        gr.Slider(0.0, 1.0, value=1.0, step=0.01, label="Top-p: ↑ = more random answers"),
    ],
    outputs=[gr.HTML(label="Diffusion Output")],
    title="Diffusion Language Model Chat",
    theme="default",
    description="This interface runs a diffusion-based language model to generate answers progressively."
)

demo.launch(share=True, allowed_paths=["."], ssr_mode=False)
|