// Uploaded by akhaliq (HF Staff): "Upload index.js with huggingface_hub"
// Commit e949992 (verified)
import {
AutoProcessor,
AutoModelForImageTextToText,
load_image,
TextStreamer,
} from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.7.2';
// Global variables
// Shared module state, written by initializeModel() and read by generateCaption().

// Processor returned by AutoProcessor.from_pretrained(); null until initializeModel() assigns it.
let processor = null;
// Model returned by AutoModelForImageTextToText.from_pretrained(); null until initializeModel() assigns it.
let model = null;
// Set to true by initializeModel() after a load; reset to false by the device
// radio handler when the selected device no longer matches currentDevice.
let isModelLoaded = false;
// Device value recorded by initializeModel() from getSelectedDevice(); starts as 'cpu'.
let currentDevice = 'cpu';
/**
 * DOM elements
 *
 * Every node the script touches is looked up once, when the module is
 * evaluated, and exposed under a descriptive key. Single nodes are resolved
 * by id; the tab buttons, tab panels and device radios are NodeLists.
 */
const elements = (() => {
  const byId = (id) => document.getElementById(id);
  const all = (selector) => document.querySelectorAll(selector);

  return {
    imageUrl: byId('image-url'),
    loadUrlBtn: byId('load-url-btn'),
    fileInput: byId('file-input'),
    uploadArea: byId('upload-area'),
    uploadBtn: byId('upload-btn'),
    previewSection: byId('preview-section'),
    previewImage: byId('preview-image'),
    customPrompt: byId('custom-prompt'),
    loadingSection: byId('loading-section'),
    loadingText: byId('loading-text'),
    progressFill: byId('progress-fill'),
    outputSection: byId('output-section'),
    outputContent: byId('output-content'),
    copyBtn: byId('copy-btn'),
    errorSection: byId('error-section'),
    errorMessage: byId('error-message'),
    tabBtns: all('.tab-btn'),
    tabContents: all('.tab-content'),
    deviceRadios: all('input[name="device"]'),
  };
})();
/**
 * Initialize model
 *
 * Loads the processor and model for the device currently selected in the UI
 * and stores them in the module-level `processor` / `model` variables.
 * Returns immediately when a model is already loaded for that device.
 *
 * Module state (`processor`, `model`, `currentDevice`, `isModelLoaded`) is
 * only committed after BOTH loads succeed, so a failed load can never leave
 * a stale model flagged as "loaded" for a device it was not created for.
 *
 * @returns {Promise<void>} Resolves once the model is ready.
 * @throws Rethrows whatever error the processor/model load produced, after
 *   logging it and showing a user-facing error message.
 */
async function initializeModel() {
  const targetDevice = getSelectedDevice();
  if (isModelLoaded && currentDevice === targetDevice) {
    return;
  }

  // Whatever is currently held does not match the requested device; make sure
  // a failure below cannot leave it marked as usable.
  isModelLoaded = false;

  try {
    showLoading('Loading model...');

    const modelId = 'onnx-community/FastVLM-0.5B-ONNX';
    const modelOptions = {
      dtype: {
        embed_tokens: 'fp16',
        vision_encoder: 'q4',
        decoder_model_merged: 'q4',
      },
    };
    // Only 'webgpu' is forwarded explicitly; any other selection leaves the
    // `device` option unset.
    if (targetDevice === 'webgpu') {
      modelOptions.device = 'webgpu';
    }

    updateLoadingText('Loading processor...');
    const loadedProcessor = await AutoProcessor.from_pretrained(modelId);

    updateLoadingText('Loading model...');
    const loadedModel = await AutoModelForImageTextToText.from_pretrained(modelId, modelOptions);

    // Commit everything together now that nothing else can fail.
    processor = loadedProcessor;
    model = loadedModel;
    currentDevice = targetDevice;
    isModelLoaded = true;
  } catch (error) {
    console.error('Model initialization error:', error);
    showError('Failed to load model. Please try again.');
    throw error;
  } finally {
    hideLoading();
  }
}
/**
 * Generate caption for image
 *
 * Loads the model if necessary, builds a single-turn chat prompt (the custom
 * prompt from the UI, or a default description request), runs generation
 * with up to 512 new tokens and sampling disabled, streams partial text into
 * the output element, and finally replaces it with the decoded result.
 *
 * Errors are reported through the UI and the console; they are not rethrown.
 *
 * @param {string} imageUrl - URL (remote or object URL) passed to `load_image`.
 * @returns {Promise<void>}
 */
async function generateCaption(imageUrl) {
  try {
    hideError();
    showLoading('Processing image...');

    if (!isModelLoaded) {
      try {
        await initializeModel();
      } catch {
        // initializeModel() has already logged the failure and shown its own
        // "failed to load model" message; do not overwrite it with the
        // image-related message from the outer handler.
        return;
      }
      // initializeModel() hides the loading panel when it finishes, so bring
      // it back for the remaining steps of this run.
      showLoading('Processing image...');
    }

    // Prepare prompt
    const customPromptText = elements.customPrompt.value.trim();
    const promptContent = customPromptText || 'Describe this image in detail.';
    const messages = [
      {
        role: 'user',
        content: `<image>${promptContent}`,
      },
    ];
    const prompt = processor.apply_chat_template(messages, {
      add_generation_prompt: true,
    });

    updateLoadingText('Loading image...');
    const image = await load_image(imageUrl);

    updateLoadingText('Processing inputs...');
    const inputs = await processor(image, prompt, {
      add_special_tokens: false,
    });

    updateLoadingText('Generating caption...');
    elements.outputContent.textContent = '';
    showOutput();

    const outputs = await model.generate({
      ...inputs,
      max_new_tokens: 512,
      do_sample: false,
      streamer: new TextStreamer(processor.tokenizer, {
        skip_prompt: true,
        skip_special_tokens: false,
        // Append each streamed chunk so the user sees progress immediately.
        callback_function: (text) => {
          elements.outputContent.textContent += text;
        },
      }),
    });

    // Decode only what follows the prompt: the slice starts at the length of
    // the input ids along their last dimension.
    const decoded = processor.batch_decode(
      outputs.slice(null, [inputs.input_ids.dims.at(-1), null]),
      { skip_special_tokens: true },
    );
    // Replace the streamed text (which kept special tokens) with the clean decode.
    elements.outputContent.textContent = decoded[0];
  } catch (error) {
    console.error('Caption generation error:', error);
    showError('Failed to generate caption. Please check your image URL and try again.');
  } finally {
    hideLoading();
  }
}
// Helper functions
/**
 * Reads the value of the checked "device" radio input.
 *
 * @returns {string} The checked radio's value, or 'cpu' when none is checked.
 */
function getSelectedDevice() {
  const checkedRadio = document.querySelector('input[name="device"]:checked');
  if (!checkedRadio) {
    return 'cpu';
  }
  return checkedRadio.value;
}
/**
 * Makes the loading section visible, sets its status text, and puts the
 * progress bar at 50%.
 *
 * @param {string} text - Status message to display.
 */
function showLoading(text) {
  const { loadingSection, loadingText, progressFill } = elements;
  loadingSection.style.display = 'block';
  loadingText.textContent = text;
  progressFill.style.width = '50%';
}
/**
 * Updates the loading status text and moves the progress bar to the width
 * associated with that status. Unrecognised statuses fall back to 50%.
 *
 * @param {string} text - Status message to display.
 */
function updateLoadingText(text) {
  elements.loadingText.textContent = text;

  // A Map is used instead of an object literal so that labels which happen to
  // match inherited object keys (e.g. 'constructor') use the fallback rather
  // than resolving to a prototype member.
  const progressByStatus = new Map([
    ['Loading processor...', '30%'],
    ['Loading model...', '60%'],
    ['Loading image...', '70%'],
    ['Processing inputs...', '80%'],
    ['Generating caption...', '90%'],
  ]);
  elements.progressFill.style.width = progressByStatus.get(text) ?? '50%';
}
/**
 * Hides the loading section and resets the progress bar to 0%.
 */
function hideLoading() {
  const { loadingSection, progressFill } = elements;
  loadingSection.style.display = 'none';
  progressFill.style.width = '0%';
}
/**
 * Makes the output section visible.
 */
function showOutput() {
  const { style: outputStyle } = elements.outputSection;
  outputStyle.display = 'block';
}
/**
 * Hides the output section.
 */
function hideOutput() {
  const { style: outputStyle } = elements.outputSection;
  outputStyle.display = 'none';
}
/**
 * Makes the error section visible and sets its message text.
 *
 * @param {string} message - Text to show in the error message element.
 */
function showError(message) {
  const { errorSection, errorMessage } = elements;
  errorSection.style.display = 'block';
  errorMessage.textContent = message;
}
/**
 * Hides the error section. The message text itself is left untouched.
 */
function hideError() {
  const { style: errorStyle } = elements.errorSection;
  errorStyle.display = 'none';
}
/**
 * Points the preview image at the given URL and reveals the preview section.
 *
 * @param {string} url - Value assigned to the preview image's `src`.
 */
function showPreview(url) {
  const { previewImage, previewSection } = elements;
  previewImage.src = url;
  previewSection.style.display = 'block';
}
// Event listeners
// "Load URL" button: preview and caption the image at the typed URL, or
// report an error when the field is empty after trimming.
elements.loadUrlBtn.addEventListener('click', async () => {
  const typedUrl = elements.imageUrl.value.trim();

  if (typedUrl) {
    showPreview(typedUrl);
    await generateCaption(typedUrl);
  } else {
    showError('Please enter a valid image URL');
  }
});
// Upload area: clicking opens the file picker; dragging toggles the
// 'dragover' class; dropped or picked files are handed to handleFiles().
{
  const { uploadArea, fileInput } = elements;

  // Adds the highlight class when `isActive` is true, removes it otherwise.
  const setDragHighlight = (isActive) => {
    uploadArea.classList.toggle('dragover', isActive);
  };

  uploadArea.addEventListener('click', () => {
    fileInput.click();
  });

  uploadArea.addEventListener('dragover', (event) => {
    event.preventDefault();
    setDragHighlight(true);
  });

  uploadArea.addEventListener('dragleave', () => {
    setDragHighlight(false);
  });

  uploadArea.addEventListener('drop', (event) => {
    event.preventDefault();
    setDragHighlight(false);
    handleFiles(event.dataTransfer.files);
  });

  fileInput.addEventListener('change', (event) => {
    handleFiles(event.target.files);
  });
}
/**
 * Accepts a list of picked/dropped files, previews the first one if it is an
 * image, and arms the upload button with an object URL for it.
 *
 * The object URL created for a previous selection is revoked once the new
 * preview is in place, so repeated selections do not keep every earlier file
 * alive in memory.
 *
 * @param {FileList|File[]} files - Files from a drop event or file input.
 * @returns {Promise<void>}
 */
async function handleFiles(files) {
  if (!files || files.length === 0) {
    return;
  }

  const file = files[0];
  if (!file.type.startsWith('image/')) {
    showError('Please select a valid image file');
    return;
  }

  const previousUrl = elements.uploadBtn.dataset.imageUrl;
  const url = URL.createObjectURL(file);

  showPreview(url);
  elements.uploadBtn.disabled = false;
  elements.uploadBtn.dataset.imageUrl = url;

  // Release the blob behind the superseded selection; the preview and the
  // upload button now both point at the new URL.
  if (previousUrl) {
    URL.revokeObjectURL(previousUrl);
  }
}
// "Upload" button: caption the image whose URL handleFiles() stored on the
// button's dataset; do nothing when no URL has been stored.
elements.uploadBtn.addEventListener('click', async () => {
  const { imageUrl: storedUrl } = elements.uploadBtn.dataset;
  if (!storedUrl) {
    return;
  }
  await generateCaption(storedUrl);
});
// "Copy" button: write the caption text to the clipboard, flash "Copied!"
// for two seconds, then restore the icon + label markup.
// Clipboard failures (permission denied, or `navigator.clipboard` missing)
// are logged and surfaced through the error section instead of being dropped
// as an unhandled rejection.
elements.copyBtn.addEventListener('click', async () => {
  const caption = elements.outputContent.textContent;

  try {
    await navigator.clipboard.writeText(caption);
  } catch (error) {
    console.error('Clipboard write error:', error);
    showError('Failed to copy caption to clipboard.');
    return;
  }

  elements.copyBtn.textContent = 'Copied!';

  // Setting textContent above removed the button's SVG icon, so the original
  // markup is written back once the confirmation has been shown.
  const copyButtonMarkup = [
    '',
    '<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor">',
    '<rect x="9" y="9" width="13" height="13" rx="2" ry="2"></rect>',
    '<path d="M5 15H4a2 2 0 0 1-2-2V4a2 2 0 0 1 2-2h9a2 2 0 0 1 2 2v1"></path>',
    '</svg>',
    'Copy Caption',
    '',
  ].join('\n');

  setTimeout(() => {
    elements.copyBtn.innerHTML = copyButtonMarkup;
  }, 2000);
});
// Tab switching
// Clicking a tab button deactivates every button and panel, then activates
// the clicked button and the panel whose id is "<data-tab>-tab".
for (const tabButton of elements.tabBtns) {
  tabButton.addEventListener('click', () => {
    const panelId = `${tabButton.dataset.tab}-tab`;

    for (const otherButton of elements.tabBtns) {
      otherButton.classList.remove('active');
    }
    for (const panel of elements.tabContents) {
      panel.classList.remove('active');
    }

    tabButton.classList.add('active');
    document.getElementById(panelId).classList.add('active');
  });
}
// Device selection
// When the device selection changes away from the one the loaded model was
// created for, clear the loaded flag so the next caption request reloads it.
for (const deviceRadio of elements.deviceRadios) {
  deviceRadio.addEventListener('change', () => {
    if (!isModelLoaded) {
      return;
    }
    if (currentDevice === getSelectedDevice()) {
      return;
    }
    isModelLoaded = false;
  });
}