Update index.html
Browse files- index.html +243 -17
index.html
CHANGED
@@ -1,19 +1,245 @@
|
|
1 |
-
<!
|
2 |
<html>
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
</html>
|
|
|
1 |
+
<!DOCTYPE html>
<html>
<head>
  <meta charset="UTF-8">
  <title>SmolVLM Benchmark Demo</title>
  <style>
    body { font-family: Arial, sans-serif; margin: 20px; }
    fieldset { margin-bottom: 20px; padding: 10px; }
    legend { font-weight: bold; }
    label { display: block; margin-top: 5px; }
    input, select { margin-bottom: 5px; width: 100%; max-width: 400px; }
    table { border-collapse: collapse; margin-top: 20px; width: 100%; max-width: 600px; }
    th, td { border: 1px solid #ccc; padding: 8px; text-align: left; }
    button { padding: 10px 20px; }
  </style>
</head>
<body>
  <h1>SmolVLM Benchmark Demo</h1>

  <!-- Model Options: model id plus per-component ONNX dtype selection. -->
  <fieldset id="model-options">
    <legend>Model Options</legend>
    <label for="model-id">Select Model ID:</label>
    <select id="model-id">
      <option value="hf-internal-testing/tiny-random-Idefics3ForConditionalGeneration">hf-internal-testing/tiny-random-Idefics3ForConditionalGeneration</option>
      <option value="HuggingFaceTB/SmolVLM-256M-Instruct" selected>HuggingFaceTB/SmolVLM-256M-Instruct</option>
      <option value="HuggingFaceTB/SmolVLM-500M-Instruct">HuggingFaceTB/SmolVLM-500M-Instruct</option>
      <option value="HuggingFaceTB/SmolVLM-Instruct">HuggingFaceTB/SmolVLM-Instruct</option>
    </select>

    <label for="decoder-dtype">Decoder (decoder_model_merged) dtype:</label>
    <select id="decoder-dtype">
      <option value="fp32">fp32</option>
      <option value="fp16">fp16</option>
      <option value="q8">q8</option>
      <option value="q4">q4</option>
      <option value="q4f16">q4f16</option>
    </select>

    <label for="embed-dtype">Embed Tokens dtype:</label>
    <select id="embed-dtype">
      <option value="fp32">fp32</option>
      <option value="fp16">fp16</option>
      <option value="q8">q8</option>
      <option value="q4">q4</option>
      <option value="q4f16">q4f16</option>
    </select>

    <label for="vision-dtype">Vision Encoder dtype:</label>
    <select id="vision-dtype">
      <option value="fp32">fp32</option>
      <option value="fp16">fp16</option>
      <option value="q8">q8</option>
      <option value="q4">q4</option>
      <option value="q4f16">q4f16</option>
    </select>
  </fieldset>

  <!-- Hardware Options: ONNX Runtime execution backend. -->
  <fieldset id="hardware-options">
    <legend>Hardware Options</legend>
    <label for="device">Select Device:</label>
    <select id="device">
      <option value="wasm">wasm</option>
      <option value="webgpu" selected>webgpu</option>
    </select>
  </fieldset>

  <!-- Benchmark Options: input image and run parameters. -->
  <fieldset id="benchmark-options">
    <legend>Benchmark Options</legend>
    <label for="image-url">Image URL:</label>
    <input type="text" id="image-url" value="https://huggingface.co/spaces/merve/chameleon-7b/resolve/main/bee.jpg">

    <label for="do-split">Do Image Splitting (do_image_splitting)</label>
    <input type="checkbox" id="do-split" checked>

    <label for="max-tokens">Number of Tokens to Generate:</label>
    <input type="number" id="max-tokens" value="128">

    <label for="num-runs">Number of Runs:</label>
    <input type="number" id="num-runs" value="5">
  </fieldset>

  <button id="start-benchmark">Start Benchmark</button>

  <!-- Populated by the benchmark script with a results table. -->
  <div id="results"></div>
|
88 |
+
|
89 |
+
<script type="module">
|
90 |
+
import {
|
91 |
+
AutoProcessor,
|
92 |
+
AutoModelForVision2Seq,
|
93 |
+
load_image,
|
94 |
+
TextStreamer,
|
95 |
+
} from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.4.2";
|
96 |
+
|
97 |
+
class SmolVLM {
|
98 |
+
static model = null;
|
99 |
+
static processor = null;
|
100 |
+
static model_id = null;
|
101 |
+
|
102 |
+
static async getInstance(modelId, dtypeSettings, device) {
|
103 |
+
this.processor ??= await AutoProcessor.from_pretrained(modelId);
|
104 |
+
this.model ??= await AutoModelForVision2Seq.from_pretrained(modelId, {
|
105 |
+
dtype: {
|
106 |
+
embed_tokens: dtypeSettings.embed,
|
107 |
+
vision_encoder: dtypeSettings.vision,
|
108 |
+
decoder_model_merged: dtypeSettings.decoder,
|
109 |
+
},
|
110 |
+
device: device,
|
111 |
+
});
|
112 |
+
return [this.processor, this.model];
|
113 |
+
}
|
114 |
+
}
|
115 |
+
|
116 |
+
async function runBenchmark() {
|
117 |
+
document.getElementById("model-options").disabled = true;
|
118 |
+
document.getElementById("hardware-options").disabled = true;
|
119 |
+
|
120 |
+
const resultsDiv = document.getElementById("results");
|
121 |
+
resultsDiv.innerHTML = "<p>Loading model and running benchmark...</p>";
|
122 |
+
|
123 |
+
const modelId = document.getElementById("model-id").value;
|
124 |
+
let decoderDefault = "fp32", embedDefault = "fp32", visionDefault = "fp32";
|
125 |
+
const decoder_dtype = document.getElementById("decoder-dtype").value || decoderDefault;
|
126 |
+
const embed_dtype = document.getElementById("embed-dtype").value || embedDefault;
|
127 |
+
const vision_dtype = document.getElementById("vision-dtype").value || visionDefault;
|
128 |
+
const device = document.getElementById("device").value;
|
129 |
+
const imageUrl = document.getElementById("image-url").value;
|
130 |
+
const maxTokens = parseInt(document.getElementById("max-tokens").value) || 128;
|
131 |
+
const numRuns = parseInt(document.getElementById("num-runs").value) || 5;
|
132 |
+
const doImageSplitting = document.getElementById("do-split").checked;
|
133 |
+
|
134 |
+
document.getElementById("decoder-dtype").value = decoder_dtype;
|
135 |
+
document.getElementById("embed-dtype").value = embed_dtype;
|
136 |
+
document.getElementById("vision-dtype").value = vision_dtype;
|
137 |
+
|
138 |
+
const image = await load_image(imageUrl);
|
139 |
+
const dtypeSettings = {
|
140 |
+
decoder: decoder_dtype,
|
141 |
+
embed: embed_dtype,
|
142 |
+
vision: vision_dtype,
|
143 |
+
};
|
144 |
+
|
145 |
+
// Pre-run warmup (compiling shaders, initialization) with max_new_tokens: 1.
|
146 |
+
try {
|
147 |
+
const [processor, model] = await SmolVLM.getInstance(modelId, dtypeSettings, device);
|
148 |
+
const messages = [{
|
149 |
+
role: "user",
|
150 |
+
content: [
|
151 |
+
{ type: "image" },
|
152 |
+
{ type: "text", text: "Can you describe this image?" },
|
153 |
+
],
|
154 |
+
}];
|
155 |
+
const text = processor.apply_chat_template(messages, { add_generation_prompt: true });
|
156 |
+
const inputs = await processor(text, [image], { do_image_splitting: doImageSplitting });
|
157 |
+
await model.generate({
|
158 |
+
...inputs,
|
159 |
+
max_new_tokens: 1,
|
160 |
+
});
|
161 |
+
} catch (e) {
|
162 |
+
resultsDiv.innerHTML = "<p>Error during warmup: " + e.toString() + "</p>";
|
163 |
+
return;
|
164 |
+
}
|
165 |
+
|
166 |
+
// Benchmark runs using streaming generation.
|
167 |
+
let totalTime = 0;
|
168 |
+
let totalTps = 0;
|
169 |
+
let runsResults = [];
|
170 |
+
|
171 |
+
for (let i = 0; i < numRuns; ++i) {
|
172 |
+
try {
|
173 |
+
const [processor, model] = await SmolVLM.getInstance(modelId, dtypeSettings, device);
|
174 |
+
const messages = [{
|
175 |
+
role: "user",
|
176 |
+
content: [
|
177 |
+
{ type: "image" },
|
178 |
+
{ type: "text", text: "Can you describe this image?" },
|
179 |
+
],
|
180 |
+
}];
|
181 |
+
const text = processor.apply_chat_template(messages, { add_generation_prompt: true });
|
182 |
+
const inputs = await processor(text, [image], { do_image_splitting: doImageSplitting });
|
183 |
+
|
184 |
+
// Initialize streaming measurement variables.
|
185 |
+
let startTime, numTokens = 0, tps = 0;
|
186 |
+
const token_callback_function = () => {
|
187 |
+
// Initialize startTime on the first token.
|
188 |
+
startTime = startTime || performance.now();
|
189 |
+
tps = (numTokens++ / (performance.now() - startTime)) * 1000;
|
190 |
+
};
|
191 |
+
// Optional callback function (here we do nothing with intermediate outputs).
|
192 |
+
const callback_function = (output) => {};
|
193 |
+
|
194 |
+
// Create a new streamer with the callbacks.
|
195 |
+
const streamer = new TextStreamer(processor.tokenizer, {
|
196 |
+
skip_prompt: true,
|
197 |
+
skip_special_tokens: true,
|
198 |
+
callback_function,
|
199 |
+
token_callback_function,
|
200 |
+
});
|
201 |
+
|
202 |
+
// Run generation with the streamer.
|
203 |
+
const generateStartTime = performance.now();
|
204 |
+
await model.generate({
|
205 |
+
...inputs,
|
206 |
+
max_new_tokens: maxTokens,
|
207 |
+
min_new_tokens: maxTokens,
|
208 |
+
streamer,
|
209 |
+
});
|
210 |
+
|
211 |
+
// Calculate elapsed time from when the first token arrived.
|
212 |
+
const endTime = performance.now();
|
213 |
+
const elapsed = endTime - generateStartTime;
|
214 |
+
|
215 |
+
totalTime += elapsed;
|
216 |
+
totalTps += tps;
|
217 |
+
runsResults.push({
|
218 |
+
run: i + 1,
|
219 |
+
time: elapsed.toFixed(2),
|
220 |
+
tps: tps.toFixed(2)
|
221 |
+
});
|
222 |
+
} catch (e) {
|
223 |
+
runsResults.push({ run: i + 1, time: "Error", tps: "Error" });
|
224 |
+
}
|
225 |
+
}
|
226 |
+
|
227 |
+
const avgTime = (totalTime / numRuns).toFixed(2);
|
228 |
+
const avgTps = (totalTps / numRuns).toFixed(2);
|
229 |
+
|
230 |
+
let tableHtml = "<h2>Benchmark Results</h2>";
|
231 |
+
tableHtml += "<table>";
|
232 |
+
tableHtml += "<tr><th>Run</th><th>Execution Time (ms)</th><th>Tokens per Second</th></tr>";
|
233 |
+
runsResults.forEach(r => {
|
234 |
+
tableHtml += `<tr><td>${r.run}</td><td>${r.time}</td><td>${r.tps}</td></tr>`;
|
235 |
+
});
|
236 |
+
tableHtml += `<tr><td><strong>Average</strong></td><td><strong>${avgTime}</strong></td><td><strong>${avgTps}</strong></td></tr>`;
|
237 |
+
tableHtml += "</table>";
|
238 |
+
|
239 |
+
resultsDiv.innerHTML = tableHtml;
|
240 |
+
}
|
241 |
+
|
242 |
+
document.getElementById("start-benchmark").addEventListener("click", runBenchmark);
|
243 |
+
</script>
|
244 |
+
</body>
|
245 |
</html>
|