Spaces:

PhilippSpohn
/

tokenprob

Running

App Files Files Community

PhilippSpohn committed on Feb 5

Commit

b9b96cc

0 Parent(s):

Initial commit: Token Probability Analyzer web application

Browse files

Files changed (7) hide show

.gitignore +38 -0
README.md +35 -0
app.py +98 -0
requirements.txt +4 -0
static/script.js +145 -0
static/style.css +195 -0
templates/index.html +60 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,38 @@

+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+# Virtual Environment
+venv/
+env/
+ENV/
+# IDE
+.idea/
+.vscode/
+*.swp
+*.swo
+# Misc
+.DS_Store
+.env
+.env.local
+.env.*.local

README.md ADDED Viewed

	@@ -0,0 +1,35 @@

+# Token Probability Analyzer
+A web application that analyzes token probabilities using various language models. This tool helps visualize and understand how language models predict tokens in a given text sequence.
+## Features
+- Support for multiple language models (GPT-2, TinyLlama, etc.)
+- Token-by-token probability analysis
+- Percentile scoring for token probabilities
+- Top-k predictions for each position
+- Joint and average log likelihood calculations
+## Setup
+1. Install the required dependencies:
+```bash
+pip install -r requirements.txt
+```
+2. Run the application:
+```bash
+python app.py
+```
+3. Open your browser and navigate to `http://localhost:5000`
+## Usage
+1. Select a language model from the dropdown menu
+2. Enter your text in the input field
+3. Click "Analyze" to see the token probabilities and predictions
+## Technical Details
+The application uses Flask for the backend and provides a simple web interface. It leverages the Hugging Face Transformers library to load and run various language models for token probability analysis.

app.py ADDED Viewed

	@@ -0,0 +1,98 @@

+from flask import Flask, render_template, request, jsonify
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+import torch.nn.functional as F
+from scipy.stats import percentileofscore
+app = Flask(__name__)
+DEFAULT_MODEL = "gpt2"
+model_cache = {}
+tokenizer_cache = {}
+def get_model_and_tokenizer(model_name):
+    if model_name not in model_cache:
+        trust_code = model_name == "microsoft/phi-1_5"
+        model_cache[model_name] = AutoModelForCausalLM.from_pretrained(
+            model_name, trust_remote_code=trust_code
+        )
+        tokenizer_cache[model_name] = AutoTokenizer.from_pretrained(
+            model_name, trust_remote_code=trust_code
+        )
+    return model_cache[model_name], tokenizer_cache[model_name]
+@app.route("/")
+def index():
+    return render_template(
+        "index.html",
+        models=[
+            DEFAULT_MODEL,
+            # "gpt2-medium",
+            # "gpt2-large",
+            # "gpt2-xl",
+            # "EleutherAI/pythia-1.4b",
+            # "facebook/opt-1.3b",
+            # "bigscience/bloom-1b7",
+            # "microsoft/phi-1_5",
+            "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+        ],
+    )
+@app.route("/analyze", methods=["POST"])
+def analyze():
+    data = request.get_json()
+    text = data["text"]
+    model_name = data["model"]
+    model, tokenizer = get_model_and_tokenizer(model_name)
+    model.eval()
+    with torch.no_grad():
+        inputs = tokenizer(text, return_tensors="pt")
+        outputs = model(**inputs)
+        logits = outputs.logits
+        input_ids = inputs["input_ids"][0]
+        tokens = tokenizer.convert_ids_to_tokens(input_ids)
+        log_probs = []
+        all_log_probs_list = []
+        top_k_predictions = []
+        for i in range(len(input_ids) - 1):
+            probs_at_position = F.log_softmax(logits[0, i, :], dim=-1)
+            all_log_probs_list.extend(probs_at_position.tolist())
+            top_k_values, top_k_indices = torch.topk(probs_at_position, 5)
+            top_k_tokens = tokenizer.convert_ids_to_tokens(top_k_indices)
+            top_k_predictions.append(
+                [
+                    {"token": t, "log_prob": v.item()}
+                    for t, v in zip(top_k_tokens, top_k_values)
+                ]
+            )
+            log_prob = probs_at_position[input_ids[i + 1]].item()
+            log_probs.append(log_prob)
+        percentiles = [percentileofscore(all_log_probs_list, lp) for lp in log_probs]
+        joint_log_likelihood = sum(log_probs)
+        average_log_likelihood = (
+            joint_log_likelihood / len(log_probs) if log_probs else 0
+        )
+    return jsonify({
+        "tokens": tokens,
+        "percentiles": percentiles,
+        "log_probs": log_probs,
+        "top_k_predictions": top_k_predictions,
+        "joint_log_likelihood": joint_log_likelihood,
+        "average_log_likelihood": average_log_likelihood,
+    })
+if __name__ == "__main__":
+    app.run(debug=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+flask
+transformers
+torch
+scipy

static/script.js ADDED Viewed

	@@ -0,0 +1,145 @@

+document.getElementById("analyze-button").addEventListener("click", async () => {
+    const text = document.getElementById("input-text").value;
+    const model = document.getElementById("model-select").value;
+    const response = await fetch("/analyze", {
+        method: "POST",
+        headers: {
+            "Content-Type": "application/json"
+        },
+        body: JSON.stringify({ text, model })
+    });
+    const data = await response.json();
+    const coloredTextDiv = document.getElementById("colored-text");
+    coloredTextDiv.innerHTML = "";
+    // Always add the first token
+    const firstToken = data.tokens[0];
+    const firstTokenSpan = document.createElement("span");
+    firstTokenSpan.classList.add("token");
+    // Handle special tokens and regular tokens differently
+    if (firstToken === "<s>" || firstToken === "<|endoftext|>") {
+        firstTokenSpan.style.backgroundColor = "#808080"; // Gray for special tokens
+        firstTokenSpan.textContent = "■";
+        tippy(firstTokenSpan, {
+            content: "<div><strong>Beginning of Sequence</strong></div>",
+            allowHTML: true,
+            theme: 'custom',
+            placement: 'top',
+            interactive: true
+        });
+    } else {
+        // Handle regular first token
+        firstTokenSpan.style.backgroundColor = "#808080"; // or any other color you prefer
+        firstTokenSpan.textContent = firstToken;
+        tippy(firstTokenSpan, {
+            content: `<div><strong>First Token</strong></div>`,
+            allowHTML: true,
+            theme: 'custom',
+            placement: 'top',
+            interactive: true
+        });
+    }
+    coloredTextDiv.appendChild(firstTokenSpan);
+    for (let index = 0; index < data.log_probs.length; index++) {
+        const token = data.tokens[index + 1];
+        const percentile = data.percentiles[index];
+        const logProb = data.log_probs[index];
+        const topKPredictions = data.top_k_predictions[index];
+        const color = getColor(data.log_probs, logProb);
+        const tokenSpan = document.createElement("span");
+        tokenSpan.classList.add("token");
+        tokenSpan.style.backgroundColor = color;
+        let displayToken = token;
+        let specialTokenDescription = "";
+        // Enhanced special token handling
+        if (token === "<s>" || token === "<|endoftext|>") {
+            displayToken = "■";
+            specialTokenDescription = "Beginning of Sequence";
+        } else if (token === "</s>" || token === "<|endoftext|>") {
+            displayToken = "■";
+            specialTokenDescription = "End of Sequence";
+        } else if (token === "<0x0A>") {
+            displayToken = "■";
+            specialTokenDescription = "Newline";
+        } else if (token.startsWith("<") && token.endsWith(">")) {
+            displayToken = "■";
+            specialTokenDescription = "Special Token: " + token;
+        } else {
+            // Clean up GPT-2 style tokens (Ġ and Ċ)
+            displayToken = displayToken
+                .replace(/\u2581/g, " ")  // Replace underscore token
+                .replace(/Ġ/g, " ")       // Replace GPT-2 space token
+                .replace(/Ċ/g, "\n");     // Replace GPT-2 newline token
+        }
+        tokenSpan.textContent = displayToken;
+        let tooltipContent = "";
+        if (specialTokenDescription) {
+            tooltipContent += `<div style="font-weight: bold; margin-bottom: 8px;">${specialTokenDescription}</div>`;
+        }
+        tooltipContent += `<div style="font-weight: bold; margin-bottom: 4px;">Top 5 Predictions:</div>`;
+        topKPredictions.forEach(pred => {
+            let predToken = pred.token;
+            if (predToken === "<0x0A>") {
+                predToken = "\\n";
+            } else if (predToken.startsWith("<") && predToken.endsWith(">")) {
+                predToken = "[SPECIAL]";
+            } else {
+                predToken = predToken
+                    .replace(/\u2581/g, " ")
+                    .replace(/Ġ/g, " ")
+                    .replace(/Ċ/g, "\n");
+            }
+            tooltipContent += `<div style="padding-left: 8px;">${predToken}: ${pred.log_prob.toFixed(4)}</div>`;
+        });
+        tooltipContent += `<div style="margin-top: 8px; border-top: 1px solid #555; padding-top: 8px;">
+            <div><strong>Stats:</strong></div>
+            <div style="padding-left: 8px;">Percentile: ${percentile.toFixed(2)}</div>
+            <div style="padding-left: 8px;">Log-Likelihood: ${logProb.toFixed(4)}</div>
+        </div>`;
+        tippy(tokenSpan, {
+            content: tooltipContent,
+            allowHTML: true,
+            theme: 'custom',
+            placement: 'top',
+            interactive: true
+        });
+        coloredTextDiv.appendChild(tokenSpan);
+        if (token === "<0x0A>") {
+          coloredTextDiv.appendChild(document.createElement("br"));
+        }
+    }
+    document.getElementById("joint-log-likelihood").textContent = data.joint_log_likelihood.toFixed(4);
+    document.getElementById("average-log-likelihood").textContent = data.average_log_likelihood.toFixed(4);
+});
+function getColor(allLogProbs, currentLogProb) {
+    const minLogProb = Math.min(...allLogProbs);
+    const maxLogProb = Math.max(...allLogProbs);
+    // Normalize to 0-1 range
+    let normalizedLogProb = (currentLogProb - minLogProb) / (maxLogProb - minLogProb);
+    normalizedLogProb = Math.max(0, Math.min(1, normalizedLogProb)); // Clamp
+    // Optional: Apply a power transformation (adjust the exponent as needed)
+    const power = 0.7; // Example: Less than 1 emphasizes differences at lower end
+    normalizedLogProb = Math.pow(normalizedLogProb, power);
+    const hue = normalizedLogProb * 120; // 0 (red) to 120 (green)
+    return `hsl(${hue}, 100%, 50%)`;
+}

static/style.css ADDED Viewed

	@@ -0,0 +1,195 @@

+:root {
+    --primary-color: #2563eb;
+    --primary-hover: #1d4ed8;
+    --background-color: #f8fafc;
+    --card-background: #ffffff;
+    --text-primary: #1e293b;
+    --text-secondary: #64748b;
+    --border-color: #e2e8f0;
+    --token-hover: #f1f5f9;
+}
+* {
+    margin: 0;
+    padding: 0;
+    box-sizing: border-box;
+}
+body {
+    font-family: 'Inter', sans-serif;
+    background-color: var(--background-color);
+    color: var(--text-primary);
+    line-height: 1.5;
+}
+.container {
+    max-width: 1200px;
+    margin: 0 auto;
+    padding: 2rem;
+}
+header {
+    text-align: center;
+    margin-bottom: 2rem;
+}
+h1 {
+    font-size: 2.5rem;
+    font-weight: 600;
+    color: var(--text-primary);
+    margin-bottom: 0.5rem;
+}
+.subtitle {
+    color: var(--text-secondary);
+    font-size: 1.1rem;
+}
+.control-panel {
+    background-color: var(--card-background);
+    border-radius: 12px;
+    padding: 1.5rem;
+    box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1);
+    margin-bottom: 2rem;
+}
+.input-group {
+    margin-bottom: 1.5rem;
+}
+label {
+    display: block;
+    margin-bottom: 0.5rem;
+    font-weight: 500;
+    color: var(--text-primary);
+}
+.styled-select {
+    width: 100%;
+    padding: 0.75rem;
+    border: 1px solid var(--border-color);
+    border-radius: 6px;
+    font-size: 1rem;
+    background-color: white;
+    cursor: pointer;
+}
+textarea {
+    width: 100%;
+    min-height: 120px;
+    padding: 0.75rem;
+    border: 1px solid var(--border-color);
+    border-radius: 6px;
+    font-size: 1rem;
+    font-family: inherit;
+    resize: vertical;
+}
+.primary-button {
+    background-color: var(--primary-color);
+    color: white;
+    border: none;
+    padding: 0.75rem 1.5rem;
+    border-radius: 6px;
+    font-size: 1rem;
+    font-weight: 500;
+    cursor: pointer;
+    transition: background-color 0.2s;
+}
+.primary-button:hover {
+    background-color: var(--primary-hover);
+}
+.output-panel {
+    background-color: var(--card-background);
+    border-radius: 12px;
+    padding: 1.5rem;
+    box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1);
+}
+.output-section {
+    margin-bottom: 2rem;
+}
+h2 {
+    font-size: 1.5rem;
+    margin-bottom: 1rem;
+    color: var(--text-primary);
+}
+.token-display {
+    background-color: white;
+    border-radius: 8px;
+    padding: 1rem;
+    line-height: 1.3;
+    min-height: 100px;
+    font-size: 1rem;
+    white-space: pre-wrap;
+}
+.token {
+    padding: 0;
+    border-radius: 0;
+    margin: 0;
+    cursor: pointer;
+    transition: background-color 0.15s;
+    display: inline;
+}
+.token:hover {
+    background-color: rgba(0, 0, 0, 0.05) !important;
+}
+.stats-grid {
+    display: grid;
+    grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
+    gap: 1rem;
+}
+.stat-card {
+    background-color: white;
+    padding: 1rem;
+    border-radius: 8px;
+    border: 1px solid var(--border-color);
+}
+.stat-label {
+    font-size: 0.875rem;
+    color: var(--text-secondary);
+    margin-bottom: 0.5rem;
+}
+.stat-value {
+    font-size: 1.25rem;
+    font-weight: 600;
+    color: var(--text-primary);
+}
+/* Tippy custom theme */
+.tippy-box[data-theme~='custom'] {
+    background-color: white;
+    color: var(--text-primary);
+    border: 1px solid var(--border-color);
+    box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
+    border-radius: 8px;
+    font-size: 0.875rem;
+}
+.tippy-box[data-theme~='custom'] .tippy-content {
+    padding: 1rem;
+}
+@media (max-width: 768px) {
+    .container {
+        padding: 1rem;
+    }
+    h1 {
+        font-size: 2rem;
+    }
+    .stats-grid {
+        grid-template-columns: 1fr;
+    }
+}

templates/index.html ADDED Viewed

	@@ -0,0 +1,60 @@

+<!DOCTYPE html>
+<html>
+<head>
+    <title>LLM Token Visualization</title>
+    <link rel="stylesheet" href="{{ url_for('static', filename='style.css') }}">
+    <script src="https://unpkg.com/@popperjs/core@2"></script>
+    <script src="https://unpkg.com/tippy.js@6"></script>
+    <link rel="stylesheet" href="https://unpkg.com/tippy.js@6/themes/light.css">
+    <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600&display=swap" rel="stylesheet">
+</head>
+<body>
+    <div class="container">
+        <header>
+            <h1>LLM Token Visualization</h1>
+            <p class="subtitle">Analyze how language models process and predict text</p>
+        </header>
+        <div class="control-panel">
+            <div class="input-group">
+                <label for="model-select">Model:</label>
+                <select id="model-select" class="styled-select">
+                    {% for model in models %}
+                        <option value="{{ model }}">{{ model }}</option>
+                    {% endfor %}
+                </select>
+            </div>
+            <div class="input-group">
+                <label for="input-text">Text to Analyze:</label>
+                <textarea id="input-text" placeholder="Enter your text here..."></textarea>
+            </div>
+            <button id="analyze-button" class="primary-button">Analyze</button>
+        </div>
+        <div id="output" class="output-panel">
+            <div class="output-section">
+                <h2>Token Analysis</h2>
+                <div id="colored-text" class="token-display"></div>
+            </div>
+            <div class="stats-section">
+                <h2>Statistics</h2>
+                <div class="stats-grid">
+                    <div class="stat-card">
+                        <div class="stat-label">Joint Log-Likelihood</div>
+                        <div class="stat-value" id="joint-log-likelihood">-</div>
+                    </div>
+                    <div class="stat-card">
+                        <div class="stat-label">Average Log-Likelihood</div>
+                        <div class="stat-value" id="average-log-likelihood">-</div>
+                    </div>
+                </div>
+            </div>
+        </div>
+    </div>
+    <script src="{{ url_for('static', filename='script.js') }}"></script>
+</body>
+</html>