File size: 7,713 Bytes
7252f98 9aaa660 31e34c4 9aaa660 31e34c4 42ed840 332db3a 31e34c4 332db3a bd9baef 332db3a 7252f98 332db3a 7252f98 92e70ff db84545 7252f98 63d4168 0e840df 63d4168 b5f844d dc427d9 b3de773 63d4168 dc427d9 2ba8b3f 13b1370 7252f98 b1cf46e 4152853 b1cf46e fb56411 31e34c4 fb56411 31e34c4 fb56411 31e34c4 fb56411 fa10798 63d4168 fa10798 fb56411 3f5293d fa10798 3f5293d b1cf46e 7252f98 2736195 fb56411 2736195 3f5293d fb56411 3f5293d fb56411 31e34c4 f86092a fb56411 31e34c4 6fba00f 7252f98 fb56411 31e34c4 12738e5 7252f98 6fba00f db84545 8e98890 6fba00f a494446 31e34c4 fb56411 6fba00f fb56411 31e34c4 8cb5f7a d29da35 fb56411 8e98890 31e34c4 fb56411 8e98890 63d4168 31e34c4 63d4168 86c363a 63d4168 86c363a 6fba00f 86c363a 7252f98 fb56411 d86917b fb56411 d86917b fb56411 d86917b 63d4168 d86917b 3f5293d 31e34c4 3f5293d 31e34c4 ec83427 31e34c4 ec83427 31e34c4 20ff8b2 3f5293d 55b43fa 20ff8b2 63d4168 20ff8b2 3f5293d fa10798 3f5293d fa10798 3f5293d 3f7f1a0 f7efac8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 |
import gradio as gr
import torch
import numpy as np
import json
import time
from transformers import AutoTokenizer
import os
import importlib
import os
from huggingface_hub import hf_hub_download
import spaces
from dotenv import load_dotenv
from infer import (
load_trained_model,
find_answer_start,
get_noising_schedule,
noisify_answer,
filter_logits,
confidence_guided_noising,
noisify_answer_without_remasking
)
from models import CustomTransformerModel
from model_config import CustomTransformerConfig
# When HF_TOKEN is not already exported (e.g. a local run), try to pick it
# up from a .env file before reading it.
if "HF_TOKEN" not in os.environ:
    load_dotenv()

hf_token = os.environ.get("HF_TOKEN")
if hf_token is None:
    # Fail at import time rather than later, when the token is first needed.
    raise ValueError("HF_TOKEN is not set")

# Module-level NumPy random generator (unseeded).
rng = np.random.default_rng()
@spaces.GPU
def generate_diffusion_text(input_ids, top_p, top_k):
    """Run one forward pass of the model and sample a token for every position.

    Args:
        input_ids: Flat list of token ids for a single sequence; it is wrapped
            into a batch of size one before being fed to the model.
        top_p: Nucleus-sampling threshold forwarded to ``filter_logits``.
        top_k: Top-k cutoff forwarded to ``filter_logits``.

    Returns:
        A ``(sampled, conf)`` tuple. ``sampled`` is a Python list with one
        sampled token id per position; ``conf`` is a NumPy array holding the
        probability the model assigned to each sampled token.
    """
    with torch.no_grad():
        input_tensor = torch.tensor([input_ids], dtype=torch.long).to(model.device)

        # ``torch.cuda.amp.autocast(...)`` is deprecated; ``torch.autocast``
        # with ``device_type="cuda"`` is the equivalent supported spelling.
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            logits = model(input_ids=input_tensor)["logits"]

        logits = filter_logits(logits, top_k=top_k, top_p=top_p)
        # Bound the logits before the softmax so extreme values left by the
        # filtering step cannot produce NaNs.
        logits = logits.clamp(min=-1e8, max=1e4)
        probs = torch.nn.functional.softmax(logits, dim=-1)[0]
        # Bound the probabilities as well before handing them to multinomial.
        probs = torch.clamp(probs, min=1e-8, max=1.0)

        sampled = torch.multinomial(probs, num_samples=1).squeeze(-1).tolist()
        # Probability of the token that was actually drawn at each position.
        conf = probs[range(len(sampled)), sampled].cpu().numpy()
    return sampled, conf
def format_chat_prompt(question):
    """Build the chat-template prompt for a single user question.

    The prompt consists of a fixed system turn, the user turn containing
    ``question``, and an opening assistant header after which the answer is
    expected to be generated.
    """
    system_turn = (
        "<|begin_of_text|>\n"
        "<|start_header_id|>system<|end_header_id|>\n"
        "You are a helpful assistant.\n"
    )
    user_turn = f"<|start_header_id|>user<|end_header_id|>\n{question}\n"
    assistant_header = "<|start_header_id|>assistant<|end_header_id|>\n"
    return system_turn + user_turn + assistant_header
def render_html(label, text):
    """Return an HTML fragment showing ``text`` beneath a bold ``label``.

    ``text`` is inserted verbatim (callers pass ready-made HTML such as
    ``<span>`` highlights), inside a ``<div>`` that preserves whitespace.
    """
    heading = f"<b>{label}</b><br>"
    body = f"<div style='white-space: pre-wrap; line-height:1.8'>{text}</div>"
    return heading + body
def highlight_tokens(token_ids, answer_start, changed_indices, color):
    """Render token ids as HTML text, colouring the positions that changed.

    Args:
        token_ids: Token ids to display; position ``j`` in this list is
            treated as sequence index ``answer_start + j``.
        answer_start: Offset added to each position before it is looked up
            in ``changed_indices``.
        changed_indices: Container of sequence indices to colour.
        color: CSS colour used for the highlighted tokens.

    Returns:
        A single HTML string. EOS tokens are omitted, and the text of every
        other token is HTML-escaped so that generated characters such as
        ``<`` or ``&`` are displayed literally instead of being parsed as
        markup.
    """
    import html  # local import: only needed for escaping here

    pieces = []
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    for offset, tok in enumerate(tokens):
        if tokenizer.convert_tokens_to_ids(tok) == eos_token_id:
            continue
        # Escape before wrapping so only our own <span> is real markup.
        tok_str = html.escape(tokenizer.convert_tokens_to_string([tok]))
        if (answer_start + offset) in changed_indices:
            pieces.append(f'<span style="color:{color}">{tok_str}</span>')
        else:
            pieces.append(tok_str)
    return "".join(pieces)
def diffusion_chat(question, noising, enable_pause, max_it):
    """Iteratively denoise an answer to ``question``, streaming HTML snapshots.

    This is a generator: every ``yield`` is an HTML fragment produced by
    ``render_html`` showing the current state of the answer.

    Args:
        question: User question; a blank question is replaced by a default.
        noising: When true, the answer is partially re-noised between
            iterations via ``noisify_answer``.
        enable_pause: When true, roughly one second is waited between the
            displayed steps.
        max_it: Maximum number of generation iterations.

    Yields:
        HTML strings: the initial noised state, the state after each
        generation step (changed tokens in green), optionally the state
        before each re-noising (tokens about to be noised in red), an
        early-stop notice when the last three generations are identical,
        and finally the decoded answer.
    """
    import html  # local import: used to escape the final decoded text

    sharpness = 3.0
    noise_start = 0.5
    top_p = 1.0
    top_k = 10
    clustering = False
    max_len = 256  # fixed sequence length (prompt + answer) fed to the model
    pause_length = 1.0 if enable_pause else 0.0
    # The UI slider may deliver a float; range() needs an int.
    max_it = int(max_it)

    if not question.strip():
        question = "What do you know about Amsterdam?"

    prompt = format_chat_prompt(question)
    input_ids = tokenizer.encode(prompt, add_special_tokens=False)
    answer_start = find_answer_start(input_ids, assistant_marker_ids)
    if answer_start is None:
        yield render_html("Error", "Could not find Assistant marker in input.")
        return
    if answer_start >= max_len:
        # The prompt already fills the whole window, so truncation would
        # leave no positions for an answer.
        yield render_html("Error", "Question is too long: no room left for an answer.")
        return

    # Pad with mask tokens (or truncate) to exactly max_len positions.
    input_ids = (input_ids + [mask_token_id] * (max_len - len(input_ids)))[:max_len]
    ori_input_tokens = input_ids

    # Initial noising
    current_tokens, just_noised_indices = noisify_answer(
        input_ids, answer_start, tokenizer, threshold=1.0, clustering=clustering, noise_start=1.0
    )
    yield render_html(
        "Iteration 0 (initial noise)",
        highlight_tokens(current_tokens[answer_start:], answer_start, just_noised_indices, color="red"),
    )
    start = time.perf_counter()

    last_tokens = []  # sliding window of the three most recent generations
    prev_decoded = []
    iterations_run = 0  # stays defined even if the loop body never executes

    for i in range(max_it):
        iterations_run = i + 1
        generated_tokens, _ = generate_diffusion_text(current_tokens, top_p, top_k)
        # Always keep the original prompt; only the answer part is replaced.
        current_tokens = ori_input_tokens[:answer_start] + generated_tokens[answer_start:]

        # GREEN highlighting: compare to previous tokens
        new_decoded = tokenizer.convert_ids_to_tokens(current_tokens[answer_start:])
        diff_indices = {
            answer_start + j
            for j, tok in enumerate(new_decoded)
            if j >= len(prev_decoded) or tok != prev_decoded[j]
        }
        prev_decoded = new_decoded

        # Wait out whatever remains of the pause since the last display.
        time.sleep(max(pause_length - (time.perf_counter() - start), 0))

        yield render_html(
            f"Iteration {i+1}/{max_it} (after generation)",
            highlight_tokens(current_tokens[answer_start:], answer_start, diff_indices, color="green"),
        )
        time.sleep(pause_length)

        # Early stopping: three identical generations in a row
        last_tokens.append(current_tokens)
        del last_tokens[:-3]
        if len(last_tokens) == 3 and last_tokens[0] == last_tokens[1] == last_tokens[2]:
            yield render_html("Stopped early", f"After {i+1} iterations.")
            break

        # NOISING (skipped after the last iteration so the final answer is clean)
        if i < max_it - 1 and noising:
            threshold = get_noising_schedule(i, max_it, sharpness=sharpness)
            noised_answer, just_noised_indices = noisify_answer(
                current_tokens, answer_start, tokenizer,
                threshold=threshold, clustering=clustering, noise_start=noise_start
            )

            yield render_html(
                f"Iteration {i+1}/{max_it} (before noising)",
                highlight_tokens(current_tokens[answer_start:], answer_start, just_noised_indices, color="red"),
            )

            start = time.perf_counter()
            current_tokens = ori_input_tokens[:answer_start] + noised_answer[answer_start:]

    # Final output: cut the answer at the first EOS token, if there is one.
    answer_ids = current_tokens[answer_start:]
    try:
        final_ids = answer_ids[:answer_ids.index(eos_token_id)]
    except ValueError:
        final_ids = answer_ids

    final_output = tokenizer.decode(final_ids, skip_special_tokens=True)
    # Escape so generated markup is displayed as text, not interpreted.
    yield render_html(
        f"Final Output ({len(final_ids)} tokens after {iterations_run} iterations)",
        html.escape(final_output),
    )
def is_running_on_spaces():
    """Return True when the ``SPACE_ID`` environment variable is defined."""
    return "SPACE_ID" in os.environ
print("Loading model...")

# On Spaces the checkpoint is fetched from the Hugging Face Hub; otherwise
# it is read from a local file.
ckpt_path = (
    hf_hub_download(
        repo_id="ruurd/tini_model",
        filename="diffusion-model-8B.pth",
        token=os.getenv("HF_TOKEN"),
    )
    if is_running_on_spaces()
    else "diffusion-model-8B.pth"  # change to your actual local path
)

model, tokenizer = load_trained_model(checkpoint_path=ckpt_path)
print("✅ Model loaded.")

# Tokenizer-derived constants shared by the functions in this module.
eos_token_id = tokenizer.eos_token_id
vocab_size = len(tokenizer)
# Token ids of the assistant header; used to locate where the answer starts.
assistant_marker_ids = tokenizer.encode(
    "<|start_header_id|>assistant<|end_header_id|>\n", add_special_tokens=False
)
# First token id of the encoded text "MASK" serves as the mask/noise token.
mask_token_id = tokenizer.encode("MASK", add_special_tokens=False)[0]
# Gradio UI: one question box plus three controls, mapped positionally onto
# diffusion_chat(question, noising, enable_pause, max_it).
demo = gr.Interface(
    fn=diffusion_chat,
    inputs=[
        gr.Textbox(
            label="User Question",
            lines=2,
            placeholder="What do you know about Amsterdam?",
        ),
        gr.Checkbox(label="Enable intermediate noising", value=True),
        gr.Checkbox(label="Pause between iterations", value=False),
        gr.Slider(1, 512, value=64, step=1, label="Increase the maximum number of iterations."),
    ],
    outputs=gr.HTML(label="Diffusion Output"),
    title="LAD Chat",
    # `allow_flagging` is deprecated in Gradio 5 (which this file already
    # requires for `ssr_mode`); `flagging_mode` is its replacement.
    flagging_mode="never",
    live=False,  # ensures the Stop button appears properly
)

demo.launch(share=True, allowed_paths=["."], ssr_mode=False)
|