Spaces:

Ruurd
/

lad

Running on Zero

lad

File size: 9,671 Bytes

7252f98
 
 
 
 
 
 
9aaa660
 
42ed840
332db3a
 
 
 
 
 
 
 
 
bd9baef
 
332db3a
 
 
 
7252f98
 
 
332db3a
 
7252f98
 
 
2ba8b3f
d86917b
2ba8b3f
cfffc32
02f6e21
cfffc32
6c7f510
cfffc32
d86917b
 
 
13b1370
6c7f510
 
d86917b
6034d83
6c7f510
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
acc4845
2ba8b3f
6c7f510
 
 
92e70ff
db84545
7252f98
 
0e840df
 
db84545
b5f844d
dc427d9
b3de773
dc427d9
 
 
2ba8b3f
 
 
13b1370
7252f98
b1cf46e
4152853
 
 
 
 
 
 
b1cf46e
 
fb56411
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3f5293d
db84545
 
 
fb56411
3f5293d
fb56411
3f5293d
b1cf46e
7252f98
 
2736195
fb56411
2736195
3f5293d
fb56411
3f5293d
fb56411
f86092a
fb56411
 
 
 
fc90b53
7252f98
fb56411
 
12738e5
7252f98
db84545
8e98890
a494446
fb56411
 
 
 
 
 
8cb5f7a
d29da35
fb56411
8e98890
 
 
fb56411
 
8e98890
 
fb56411
d29da35
 
a3a4100
fb56411
 
9756472
3f5293d
a494446
fb56411
 
d29da35
7252f98
fb56411
 
 
 
d29da35
fb56411
7252f98
fb56411
d86917b
 
fb56411
d86917b
 
fb56411
d86917b
fb56411
d86917b
3f5293d
 
 
332db3a
 
fb56411
332db3a
 
20ff8b2
3f5293d
55b43fa
20ff8b2
 
 
 
 
3f5293d
 
 
7065c9f
db84545
 
 
 
 
8cb5f7a
db84545
800af7e
 
3f5293d
 
 
 
 
 
 
3f7f1a0
 
f7efac8

import gradio as gr
import torch
import numpy as np
import json
import time
from transformers import AutoTokenizer
import os
import importlib
from huggingface_hub import hf_hub_download
import spaces
from dotenv import load_dotenv
from infer import (
    load_trained_model,
    find_answer_start,
    get_noising_schedule,
    noisify_answer,
    generate_diffusion_text,
    filter_logits
)
from models import CustomTransformerModel
from model_config import CustomTransformerConfig

# Fall back to a .env file only when HF_TOKEN is not already present in the
# process environment (i.e. when running locally).
if "HF_TOKEN" not in os.environ:
    load_dotenv()

# Re-read after the optional .env load; the token is mandatory.
hf_token = os.environ.get("HF_TOKEN")
if hf_token is None:
    raise ValueError("HF_TOKEN is not set")

# Shared, unseeded NumPy generator used by the confidence-guided noising below.
rng = np.random.default_rng()

# Add new noising function
def _sample_low_confidence(candidates, how_many, confidences, answer_start, noise_clipping):
    """Draw up to `how_many` distinct positions from `candidates`, favouring low confidence."""
    take = min(how_many, len(candidates))
    if take <= 0:
        return []

    # Weight = (1 - confidence), floored at `noise_clipping` so that confident
    # tokens keep a non-zero chance of being re-noised. float64 keeps the
    # normalisation accurate even when the confidences arrive in half precision.
    weights = 1.0 - np.asarray(
        [confidences[pos - answer_start] for pos in candidates], dtype=np.float64
    )
    weights = np.clip(weights, a_min=noise_clipping, a_max=None)

    # rng.choice(replace=False) needs finite, non-negative weights with at least
    # `take` non-zero entries; otherwise fall back to uniform sampling.
    usable = (
        bool(np.all(np.isfinite(weights)))
        and not bool(np.any(weights < 0))
        and int(np.count_nonzero(weights)) >= take
    )
    probabilities = weights / weights.sum() if usable else None

    picked = rng.choice(candidates, size=take, replace=False, p=probabilities)
    return picked.tolist()


def confidence_guided_noising(input_ids, answer_start, confidences, noise_clipping, threshold=1.0, noise_start=1.0):
    """Mask a confidence-weighted subset of the answer tokens.

    Reads the module-level globals `eos_token_id`, `mask_token_id` and `rng`.

    Args:
        input_ids: Full token id sequence (prompt + answer); not modified.
        answer_start: Index of the first answer token in `input_ids`.
        confidences: Per-answer-token confidence, indexed relative to
            `answer_start` (entry 0 belongs to `input_ids[answer_start]`).
        noise_clipping: Lower bound applied to the `1 - confidence` weights.
        threshold: Fraction of the answer to noise at this step.
        noise_start: Additional scaling factor applied to `threshold`.

    Returns:
        `(noised, noised_indices)`: a copy of `input_ids` with the chosen
        positions replaced by `mask_token_id`, and the sorted list of those
        positions (absolute indices into `input_ids`).
    """
    noised = input_ids.copy()
    answer_len = len(input_ids) - answer_start
    num_to_noise = int(threshold * answer_len * noise_start)
    if num_to_noise <= 0:
        return noised, []

    answer_positions = range(answer_start, len(input_ids))
    eos_indices = [pos for pos in answer_positions if input_ids[pos] == eos_token_id]
    non_eos_indices = [pos for pos in answer_positions if input_ids[pos] != eos_token_id]

    total = len(eos_indices) + len(non_eos_indices)
    if total == 0:
        return noised, []

    # Split the budget proportionally between non-EOS and EOS positions.
    # Exact integer arithmetic: a float epsilon in the denominator would make
    # int() truncate one too low (e.g. 5 of 5 non-EOS tokens -> 4).
    num_non_eos_to_noise = num_to_noise * len(non_eos_indices) // total
    num_eos_to_noise = num_to_noise - num_non_eos_to_noise

    noised_indices = _sample_low_confidence(
        non_eos_indices, num_non_eos_to_noise, confidences, answer_start, noise_clipping
    )
    noised_indices += _sample_low_confidence(
        eos_indices, num_eos_to_noise, confidences, answer_start, noise_clipping
    )

    for idx in noised_indices:
        noised[idx] = mask_token_id

    return noised, sorted(noised_indices)

@spaces.GPU
def generate_diffusion_text(input_ids, top_p, top_k):
    """Run one model forward pass and sample a token id for each row of the output.

    This definition replaces the `generate_diffusion_text` imported from
    `infer` earlier in the module. Reads the module-level `model`.

    Args:
        input_ids: List of token ids, wrapped into a batch of one.
        top_p: Forwarded to `filter_logits` as its `top_k` keyword (see note below).
        top_k: Forwarded to `filter_logits` as its `top_p` keyword (see note below).

    Returns:
        `(sampled, conf)`: the sampled token ids as a list, and a NumPy array
        holding the (clamped) probability of each sampled token.
    """
    with torch.no_grad():
        batch = torch.tensor([input_ids], dtype=torch.long).to(model.device)
        with torch.amp.autocast('cuda', dtype=torch.float16):
            logits = model(input_ids=batch)["logits"]

        # NOTE: the keywords are deliberately crossed. The UI wires its
        # integer 1-1000 slider into `top_p` and its 0-1 slider into `top_k`,
        # so crossing them here hands filter_logits the values it expects.
        filtered = filter_logits(logits, top_k=top_p, top_p=top_k)
        bounded = filtered.clamp(min=-1e8, max=1e4)

        # Drop the batch dimension, then keep every probability strictly positive.
        probs = torch.clamp(
            torch.nn.functional.softmax(bounded, dim=-1)[0],
            min=1e-8,
            max=1.0,
        )
        assert torch.all(torch.isfinite(probs)), "Non-finite values in probs!"
        assert (probs >= 0).all(), "Negative probs!"

        draws = torch.multinomial(probs, num_samples=1)
        sampled = draws.squeeze(-1).tolist()

        # Confidence = probability assigned to the token that was actually drawn.
        rows = range(len(sampled))
        conf = probs[rows, sampled].cpu().numpy()
    return sampled, conf

def format_chat_prompt(question):
    """Wrap `question` in the chat template: system header, user turn, open assistant turn.

    Every segment, including the trailing assistant header, ends with a newline.
    """
    segments = [
        "<|begin_of_text|>",
        "<|start_header_id|>system<|end_header_id|>",
        "You are a helpful assistant.",
        "<|start_header_id|>user<|end_header_id|>",
        f"{question}",
        "<|start_header_id|>assistant<|end_header_id|>",
    ]
    return "\n".join(segments) + "\n"

def render_html(label, text):
    """Return an HTML snippet: bold `label`, a line break, then `text` in a pre-wrap div.

    Neither argument is escaped; `text` may already contain markup.
    """
    heading = f"<b>{label}</b><br>"
    body = f"<div style='white-space: pre-wrap; line-height:1.8'>{text}</div>"
    return heading + body

def highlight_tokens(tokens, color_indices=None, color="green"):
    """Render `tokens` as an HTML string, colouring the positions in `color_indices`.

    Reads the module-level `tokenizer` and `eos_token_id`.

    Args:
        tokens: Token strings (as produced by `tokenizer.convert_ids_to_tokens`).
        color_indices: Positions in `tokens` to wrap in a coloured `<span>`;
            `None` or empty means no highlighting.
        color: CSS colour used for the highlighted spans.

    Returns:
        The concatenated token text with EOS tokens skipped. Token text is
        HTML-escaped so that generated `<`, `>` and `&` characters are shown
        literally instead of being parsed as markup.
    """
    import html  # local import keeps this block self-contained

    # Set lookup instead of scanning the index list once per token.
    marked = set(color_indices) if color_indices else set()

    pieces = []
    for position, tok in enumerate(tokens):
        if tokenizer.convert_tokens_to_ids(tok) == eos_token_id:
            continue
        token_str = html.escape(tokenizer.convert_tokens_to_string([tok]), quote=False)
        if position in marked:
            pieces.append(f'<span style="color:{color}">{token_str}</span>')
        else:
            pieces.append(token_str)
    return "".join(pieces)

# --- Inference Wrapper ---
def diffusion_chat(question, max_it, pause_length, sharpness,
                   clustering, noise_start, use_confidence_noising,
                   noise_clipping, top_p, top_k):
    """Generator that runs the iterative denoise/re-noise loop and yields HTML frames.

    Reads the module-level `tokenizer`, `mask_token_id`, `eos_token_id` and
    `assistant_marker_ids`.

    Args:
        question: User question; a blank string falls back to a default question.
        max_it: Maximum number of denoising iterations.
        pause_length: Seconds to sleep after each displayed generation frame.
        sharpness: Forwarded to `get_noising_schedule`.
        clustering: Forwarded to `noisify_answer`.
        noise_start: Scaling factor forwarded to the noising functions.
        use_confidence_noising: If true, re-noise with `confidence_guided_noising`
            instead of `noisify_answer`.
        noise_clipping: Forwarded to `confidence_guided_noising`.
        top_p: Forwarded positionally to `generate_diffusion_text`.
        top_k: Forwarded positionally to `generate_diffusion_text`.

    Yields:
        HTML strings built with `render_html`: the initial noise, one frame
        after each generation, one frame marking the tokens about to be
        re-noised, an optional "Stopped early" frame, and the final output.
    """
    import html  # local imports keep this block self-contained
    from collections import deque

    seq_len = 256  # fixed total length (prompt + answer) fed to the model
    max_it = int(max_it)

    if question.strip() == "":
        question = "What do you know about the city of Amsterdam?"

    prompt = format_chat_prompt(question)
    input_ids = tokenizer.encode(prompt, add_special_tokens=False)
    answer_start = find_answer_start(input_ids, assistant_marker_ids)
    if answer_start is None:
        yield render_html("Error", "Could not find Assistant marker in input.")
        return

    # Pad with mask tokens up to seq_len (or truncate if the prompt is longer).
    input_ids = (input_ids + [mask_token_id] * (seq_len - len(input_ids)))[:seq_len]
    ori_input_tokens = input_ids

    current_tokens, just_noised_indices = noisify_answer(
        input_ids, answer_start, tokenizer, threshold=1.0, clustering=clustering, noise_start=1.0
    )
    initial_text = tokenizer.decode(current_tokens[answer_start:], skip_special_tokens=True)
    # Decoded model text is escaped so it cannot be interpreted as markup.
    yield render_html("Iteration 0 (initial noise)", html.escape(initial_text, quote=False))
    time.sleep(pause_length)

    recent_outputs = deque(maxlen=3)  # sliding window used for early stopping
    prev_tokens = []
    # The sequence the final answer is read from: always the latest *denoised*
    # tokens, never the re-noised input prepared for a following iteration.
    denoised_tokens = current_tokens
    iterations_run = 0

    for i in range(max_it):
        iterations_run = i + 1
        generated_tokens, confidences = generate_diffusion_text(current_tokens, top_p, top_k)
        # Keep the prompt fixed; only the answer region comes from the model.
        current_tokens = ori_input_tokens[:answer_start] + generated_tokens[answer_start:]
        denoised_tokens = current_tokens

        decoded = tokenizer.convert_ids_to_tokens(current_tokens[answer_start:])
        diff_indices = [
            j for j, tok in enumerate(decoded)
            if j >= len(prev_tokens) or tok != prev_tokens[j]
        ]
        prev_tokens = decoded

        yield render_html(f"Iteration {i+1}/{max_it} (after generation)",
                          highlight_tokens(decoded, diff_indices, color="green"))
        time.sleep(pause_length)

        # Early stopping: three identical outputs in a row.
        recent_outputs.append(current_tokens)
        if len(recent_outputs) == 3 and len(set(map(tuple, recent_outputs))) == 1:
            yield render_html("Stopped early", f"After {i+1} iterations.")
            break

        # Noising step
        threshold = get_noising_schedule(i, max_it, sharpness=sharpness)
        if use_confidence_noising:
            noised_answer, just_noised_indices = confidence_guided_noising(
                current_tokens, answer_start, confidences, noise_clipping,
                threshold=threshold, noise_start=noise_start
            )
        else:
            noised_answer, just_noised_indices = noisify_answer(
                current_tokens, answer_start, tokenizer,
                threshold=threshold, clustering=clustering, noise_start=noise_start
            )

        # `decoded` still describes current_tokens; mark what is about to be noised.
        red_indices = [j for j in range(len(decoded)) if (answer_start + j) in just_noised_indices]
        yield render_html(f"Iteration {i+1}/{max_it} (before noising)",
                          highlight_tokens(decoded, red_indices, color="red"))

        current_tokens = ori_input_tokens[:answer_start] + noised_answer[answer_start:]

    # Final output: cut at the first EOS token, if any.
    answer_ids = list(denoised_tokens[answer_start:])
    try:
        final_ids = answer_ids[:answer_ids.index(eos_token_id)]
    except ValueError:
        final_ids = answer_ids

    final_output = html.escape(tokenizer.decode(final_ids, skip_special_tokens=True), quote=False)
    yield render_html(f"Final Output ({len(final_ids)} tokens after {iterations_run} iterations)", final_output)


# --- Gradio Interface ---
# Download the checkpoint file from the Hugging Face Hub (authenticated with
# HF_TOKEN) and load the model together with its tokenizer.
print("Loading model...")
ckpt_path = hf_hub_download(
    repo_id="ruurd/tini_model",
    filename="diffusion-model-8B.pth",
    token=os.getenv("HF_TOKEN")
)
model, tokenizer = load_trained_model(checkpoint_path=ckpt_path)
print("✅ Model loaded.")

# Module-level values read as globals by the functions defined above.
vocab_size = len(tokenizer)
eos_token_id = tokenizer.eos_token_id
# First token id obtained by encoding the literal text 'MASK'; used as the
# placeholder written into noised / padded positions.
mask_token_id = tokenizer.encode('MASK', add_special_tokens=False)[0]
# Token ids of the assistant header; passed to find_answer_start to locate
# where the answer region begins.
assistant_marker_ids = tokenizer.encode("<|start_header_id|>assistant<|end_header_id|>", add_special_tokens=False)

# The inputs are passed positionally to diffusion_chat, so their order must
# match its parameter list.
demo = gr.Interface(
    fn=diffusion_chat,
    inputs=[
        gr.Textbox(label="User Question", lines=2, placeholder="What do you know about the city of Amsterdam?"),
        gr.Slider(1, 512, value=64, step=1, label="Number of iterations: ↑ = more iterations"),
        gr.Slider(0.01, 5, value=0.01, step=0.01, label="Pause between iteration ↑ = longer pause"),
        gr.Slider(1.0, 20.0, value=1.0, step=0.5, label="Noise decay sharpness: ↓ = more noise in later iterations"),
        gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Clustering: ↑ = more clustered noising"),
        gr.Slider(0.0, 1.0, value=0.2, step=0.05, label="Noise start fraction: ↑ = more noise"),
        gr.Checkbox(value=False, label="Use confidence-guided noising"),
        gr.Slider(0.01, 1.0, value=0.01, step=0.01, label="Noise clipping: ↓ = more confidence guidance"),
        # This integer slider lands in diffusion_chat's `top_p` parameter, but
        # its value reaches filter_logits as `top_k`, so it is labelled Top-k.
        gr.Slider(1, 1000, value = 100, step = 1, label = "Top-k: ↑ = more random answers"),
        # This 0-1 slider lands in diffusion_chat's `top_k` parameter, but its
        # value reaches filter_logits as `top_p`, so it is labelled Top-p.
        gr.Slider(0.0, 1.0, value = 0.9, step = 0.01, label = "Top-p: ↑ = more random answers")
    ],
    outputs=[gr.HTML(label="Diffusion Output")],
    title="Diffusion Language Model Chat",
    theme="default",
    description="This interface runs a diffusion-based language model to generate answers progressively."
)

demo.launch(share=True, allowed_paths=["."], ssr_mode=False)