import * as React from "react";
import { useState, useRef, useEffect } from "react";
import { useVLMContext } from "../context/useVLMContext";
import { drawBoundingBoxesOnCanvas } from "./BoxAnnotator";

const MODES = ["File"] as const;
type Mode = typeof MODES[number];

const EXAMPLE_VIDEO_URL = "https://huggingface.co/Quazim0t0/yolov8-onnx/resolve/main/sample.mp4";
const EXAMPLE_PROMPT = "Describe the video";

function isImageFile(file: File) {
  return file.type.startsWith("image/");
}

function isVideoFile(file: File) {
  return file.type.startsWith("video/");
}

function denormalizeBox(box: number[], width: number, height: number) {
  // If all values are between 0 and 1, treat as normalized
  if (box.length === 4 && box.every(v => v >= 0 && v <= 1)) {
    return [
      box[0] * width,
      box[1] * height,
      box[2] * width,
      box[3] * height
    ];
  }
  return box;
}
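// Example (illustrative values): denormalizeBox([0.1, 0.2, 0.5, 0.8], 640, 480)
// returns [64, 96, 320, 384]; boxes already in pixel space are passed through unchanged.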
// Robust output parser: prefer strict JSON, then fall back to extracting raw [x1, y1, x2, y2] arrays.
function extractAllBoundingBoxes(output: string): { label: string, bbox_2d: number[] }[] {
  // Try to parse as JSON first
  try {
    const parsed = JSON.parse(output);
    if (Array.isArray(parsed)) {
      const result: { label: string, bbox_2d: number[] }[] = [];
      for (const obj of parsed) {
        if (obj && obj.label && Array.isArray(obj.bbox_2d)) {
          if (Array.isArray(obj.bbox_2d[0])) {
            // Multiple boxes under a single label
            for (const arr of obj.bbox_2d) {
              if (Array.isArray(arr) && arr.length === 4) {
                result.push({ label: obj.label, bbox_2d: arr });
              }
            }
          } else if (obj.bbox_2d.length === 4) {
            result.push({ label: obj.label, bbox_2d: obj.bbox_2d });
          }
        }
      }
      if (result.length > 0) return result;
    }
  } catch {
    // Not valid JSON; fall through to the regex fallback below
  }
  // Fallback: extract all [x1, y1, x2, y2] arrays from the string
  const boxRegex = /\[\s*([0-9.]+)\s*,\s*([0-9.]+)\s*,\s*([0-9.]+)\s*,\s*([0-9.]+)\s*\]/g;
  const boxes: { label: string, bbox_2d: number[] }[] = [];
  let match;
  while ((match = boxRegex.exec(output)) !== null) {
    const arr = [parseFloat(match[1]), parseFloat(match[2]), parseFloat(match[3]), parseFloat(match[4])];
    boxes.push({ label: '', bbox_2d: arr });
  }
  return boxes;
}
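// Illustrative inputs this parser handles (made-up strings, not captured model output):
//   '[{"label": "person", "bbox_2d": [10, 20, 110, 220]}]'   -> one labeled box
//   'boxes at [10, 20, 110, 220] and [5, 5, 50, 50]'         -> two unlabeled boxes via the regex fallback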
// NOTE: You must install onnxruntime-web:
//   npm install onnxruntime-web
// @ts-ignore
import * as ort from 'onnxruntime-web';
// If you still get type errors, add a global.d.ts with: declare module 'onnxruntime-web';

// YOLOv8 ONNX model URL (replace with your own exported model if needed):
const YOLOV8_ONNX_URL = "https://huggingface.co/Quazim0t0/yolov8-onnx/resolve/main/yolov8n.onnx";

// YOLOv8 input size, used for preprocessing and for scaling boxes back onto the canvas
const YOLOV8_INPUT_WIDTH = 640;
const YOLOV8_INPUT_HEIGHT = 480;

// Load the ONNX model once and cache the session
let yoloSession: ort.InferenceSession | null = null;
// Busy flag to prevent concurrent YOLOv8 inferences
let isYoloBusy = false;
async function loadYoloModel() {
  if (!yoloSession) {
    yoloSession = await ort.InferenceSession.create(YOLOV8_ONNX_URL);
  }
  return yoloSession;
}
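// Setup note (an assumption about your build, not something this file requires): some bundlers need
// onnxruntime-web to be told where its .wasm assets live (e.g. via ort.env.wasm.wasmPaths) before the
// first InferenceSession.create() call; see the onnxruntime-web docs for your setup.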
// COCO class names for YOLOv8
const YOLO_CLASSES: string[] = [
  "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
  "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
  "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
  "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
  "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
  "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed",
  "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven",
  "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"
];
// Preprocess a video frame into a YOLOv8 input tensor of shape [1, 3, 480, 640] (NCHW)
function preprocessFrameToTensor(video: HTMLVideoElement): ort.Tensor {
  const width = YOLOV8_INPUT_WIDTH;
  const height = YOLOV8_INPUT_HEIGHT;
  const canvas = document.createElement('canvas');
  canvas.width = width;
  canvas.height = height;
  const ctx = canvas.getContext('2d');
  if (!ctx) throw new Error('Could not get 2D context');
  ctx.drawImage(video, 0, 0, width, height);
  const imageData = ctx.getImageData(0, 0, width, height);
  const { data } = imageData;
  // Convert RGBA pixels to a planar Float32Array [1, 3, 480, 640], normalized to [0, 1]
  const floatData = new Float32Array(1 * 3 * height * width);
  for (let i = 0; i < width * height; i++) {
    floatData[i] = data[i * 4] / 255;                          // R plane
    floatData[i + width * height] = data[i * 4 + 1] / 255;     // G plane
    floatData[i + 2 * width * height] = data[i * 4 + 2] / 255; // B plane
  }
  return new ort.Tensor('float32', floatData, [1, 3, height, width]);
}
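// Assumption: the model behind YOLOV8_ONNX_URL was exported for a 640x480 input. The code above simply
// stretches the frame to that size; a stock 640x640 YOLOv8 export would normally expect a letterboxed
// square input, so adjust YOLOV8_INPUT_WIDTH/HEIGHT (and the export) to match your model.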
// Convert the detector output into labeled boxes.
// Assumes output shape [1, num_detections, 6] with rows of [x1, y1, x2, y2, score, classId]
// in input-pixel coordinates (i.e. a model exported with NMS already applied).
function postprocessYoloOutput(output: ort.Tensor) {
  const data = output.data as Float32Array;
  const numDetections = output.dims[1];
  const results = [];
  for (let i = 0; i < numDetections; i++) {
    const offset = i * 6;
    const x1 = data[offset];
    const y1 = data[offset + 1];
    const x2 = data[offset + 2];
    const y2 = data[offset + 3];
    const score = data[offset + 4];
    const classId = Math.round(data[offset + 5]);
    if (score < 0.2) continue; // confidence threshold; adjust as needed
    results.push({
      bbox: [x1, y1, x2, y2],
      label: YOLO_CLASSES[classId] || `class_${classId}`,
      score
    });
  }
  return results;
}
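// Note: the [1, num_detections, 6] layout assumed above corresponds to an export with NMS baked in.
// A plain `yolov8n.onnx` export usually emits a raw [1, 84, 8400] head instead, which would need
// anchor decoding and non-max suppression before boxes like these can be produced.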
// Helper type guard for annotation
function hasAnnotation(obj: any): obj is { annotation: string } {
  return typeof obj === 'object' && obj !== null && 'annotation' in obj && typeof obj.annotation === 'string';
}
export default function MultiSourceCaptioningView() {
  const [mode, setMode] = useState<Mode>("File");
  const [videoUrl] = useState<string>(EXAMPLE_VIDEO_URL);
  const [prompt, setPrompt] = useState<string>(EXAMPLE_PROMPT);
  const [processing, setProcessing] = useState(false);
  const [error, setError] = useState<string | null>(null);
  const [uploadedFile, setUploadedFile] = useState<File | null>(null);
  const [uploadedUrl, setUploadedUrl] = useState<string>("");
  const [videoProcessing, setVideoProcessing] = useState(false);
  const [imageProcessed, setImageProcessed] = useState(false);
  const [exampleProcessing, setExampleProcessing] = useState(false);
  const [debugOutput, setDebugOutput] = useState<string>("");
  const [canvasDims, setCanvasDims] = useState<{ w: number, h: number } | null>(null);
  const [videoDims, setVideoDims] = useState<{ w: number, h: number } | null>(null);
  const [inferenceStatus, setInferenceStatus] = useState<string>("");
  const [showProcessingVideo, setShowProcessingVideo] = useState(false);

  const videoRef = useRef<HTMLVideoElement | null>(null);
  const overlayVideoRef = useRef<HTMLVideoElement | null>(null);
  const processingVideoRef = useRef<HTMLVideoElement | null>(null);
  const canvasRef = useRef<HTMLCanvasElement | null>(null);
  const imageRef = useRef<HTMLImageElement | null>(null);
  const boxHistoryRef = useRef<any[]>([]);
  // Latest YOLOv8 results (each detection may optionally carry a FastVLM annotation)
  const lastYoloBoxesRef = useRef<any[]>([]);

  const { isLoaded, isLoading, error: modelError, runInference } = useVLMContext();

  // Single flag that tells the processing loops whether they should keep running
  const processingLoopRef = useRef(false);
  const processVideoLoop = async () => {
    if (!processingLoopRef.current) return;
    if (isYoloBusy) {
      // An inference is already running; skip this frame and try again on the next tick
      requestAnimationFrame(processVideoLoop);
      return;
    }
    await yoloDetectionLoop(); // one YOLOv8 + FastVLM pass for the current frame
    // Schedule the next frame as soon as possible
    requestAnimationFrame(processVideoLoop);
  };

  const processExampleLoop = async () => {
    while (processingLoopRef.current) {
      await yoloDetectionLoop(); // one YOLOv8 + FastVLM pass for the current frame
      await new Promise(res => setTimeout(res, 1000));
    }
  };
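  // yoloDetectionLoop (defined below) also reschedules itself via requestAnimationFrame, so the two
  // wrappers above mainly kick off the first pass; the example loop additionally throttles passes to
  // roughly one per second.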
  // Alternative YOLOv8 backend API endpoint (unused in the in-browser ONNX path):
  // const YOLOV8_API_URL = "https://YOUR_YOLOV8_BACKEND_URL_HERE/detect";

  // Keep the visible overlay video in sync with the main video (play/pause/seek/time)
  useEffect(() => {
    const main = videoRef.current;
    const overlay = overlayVideoRef.current;
    if (!main || !overlay) return;
    // Sync play/pause
    const onPlay = () => { if (overlay.paused) overlay.play(); };
    const onPause = () => { if (!overlay.paused) overlay.pause(); };
    // Sync seeking and time
    const onSeekOrTime = () => {
      if (Math.abs(main.currentTime - overlay.currentTime) > 0.05) {
        overlay.currentTime = main.currentTime;
      }
    };
    main.addEventListener('play', onPlay);
    main.addEventListener('pause', onPause);
    main.addEventListener('seeked', onSeekOrTime);
    main.addEventListener('timeupdate', onSeekOrTime);
    // Clean up
    return () => {
      main.removeEventListener('play', onPlay);
      main.removeEventListener('pause', onPause);
      main.removeEventListener('seeked', onSeekOrTime);
      main.removeEventListener('timeupdate', onSeekOrTime);
    };
  }, [videoRef, overlayVideoRef, uploadedUrl, videoUrl, mode]);

  useEffect(() => {
    if ((mode === "File") && processingVideoRef.current) {
      processingVideoRef.current.play().catch(() => {});
    }
  }, [mode, videoUrl, uploadedUrl]);
  // For video frames, boxes come from YOLOv8; prompt-based box extraction is only used for still images.
  const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
    const file = e.target.files?.[0] || null;
    setUploadedFile(file);
    setUploadedUrl(file ? URL.createObjectURL(file) : "");
    setError(null);
    setImageProcessed(false);
    setVideoProcessing(false);
    setExampleProcessing(false);
  };

  // Uploaded video: start the processing loop once the model is loaded and processing is enabled
  useEffect(() => {
    if (mode !== "File" || !isLoaded || !uploadedFile || !isVideoFile(uploadedFile) || !videoProcessing) return;
    processVideoLoop();
  }, [mode, isLoaded, prompt, runInference, uploadedFile, videoProcessing]);

  // Example video: start the processing loop once the model is loaded and processing is enabled
  useEffect(() => {
    if (mode !== "File" || uploadedFile || !isLoaded || !exampleProcessing) return;
    processExampleLoop();
  }, [mode, isLoaded, prompt, runInference, uploadedFile, exampleProcessing]);
  // File mode: process uploaded image (only on button click)
  const handleProcessImage = async () => {
    if (!isLoaded || !uploadedFile || !isImageFile(uploadedFile) || !imageRef.current || !canvasRef.current) return;
    const img = imageRef.current;
    const canvas = canvasRef.current;
    canvas.width = img.naturalWidth;
    canvas.height = img.naturalHeight;
    setCanvasDims({ w: canvas.width, h: canvas.height });
    setVideoDims({ w: img.naturalWidth, h: img.naturalHeight });
    const ctx = canvas.getContext("2d");
    if (!ctx) return;
    ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
    setProcessing(true);
    setError(null);
    setInferenceStatus("Running inference...");
    await runInference(img, prompt, (output: string) => {
      setDebugOutput(output);
      setInferenceStatus("Inference complete.");
      ctx.drawImage(img, 0, 0, canvas.width, canvas.height);
      const boxes = extractAllBoundingBoxes(output);
      console.log("Model output:", output);
      console.log("Extracted boxes:", boxes);
      console.log("Canvas size:", canvas.width, canvas.height);
      if (boxes.length > 0) {
        const [x1, y1, x2, y2] = boxes[0].bbox_2d;
        console.log("First box coords:", x1, y1, x2, y2);
      }
      if (boxes.length === 0) setInferenceStatus("No boxes detected or model output invalid.");
      if (Array.isArray(boxes) && boxes.length > 0) {
        const scaleX = canvas.width / img.naturalWidth;
        const scaleY = canvas.height / img.naturalHeight;
        drawBoundingBoxesOnCanvas(ctx, boxes, { scaleX, scaleY });
      }
      setImageProcessed(true);
    });
    setProcessing(false);
  };
  // File mode: process uploaded video frames (start/stop)
  const handleToggleVideoProcessing = () => {
    setVideoProcessing((prev: boolean) => {
      const next = !prev;
      // Always stop all loops before starting
      processingLoopRef.current = false;
      setTimeout(() => {
        if (next) {
          processingLoopRef.current = true;
          processVideoLoop();
        }
      }, 50);
      return next;
    });
  };

  // Handle start/stop for example video processing
  const handleToggleExampleProcessing = () => {
    setExampleProcessing((prev: boolean) => {
      const next = !prev;
      // Always stop all loops before starting
      processingLoopRef.current = false;
      setTimeout(() => {
        if (next) {
          processingLoopRef.current = true;
          processVideoLoop();
        }
      }, 50);
      return next;
    });
  };
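  // In both toggle handlers, the loop flag is cleared first and the 50 ms delay gives any in-flight
  // iteration a chance to see processingLoopRef.current === false before a fresh loop starts.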
  // Test draw box function
  const handleTestDrawBox = () => {
    if (!canvasRef.current) return;
    const canvas = canvasRef.current;
    const ctx = canvas.getContext("2d");
    if (!ctx) return;
    ctx.clearRect(0, 0, canvas.width, canvas.height);
    ctx.strokeStyle = "#FF00FF";
    ctx.lineWidth = 4;
    ctx.strokeRect(40, 40, Math.max(40, canvas.width / 4), Math.max(40, canvas.height / 4));
    ctx.font = "20px Arial";
    ctx.fillStyle = "#FF00FF";
    ctx.fillText("Test Box", 50, 35);
  };
  useEffect(() => {
    const draw = () => {
      const overlayVideo = overlayVideoRef.current;
      const canvas = canvasRef.current;
      if (!overlayVideo || !canvas) return;
      const displayWidth = overlayVideo.clientWidth;
      const displayHeight = overlayVideo.clientHeight;
      canvas.width = displayWidth;
      canvas.height = displayHeight;
      const ctx = canvas.getContext("2d");
      if (!ctx) return;
      ctx.clearRect(0, 0, canvas.width, canvas.height);
      const now = Date.now();
      const boxHistory = boxHistoryRef.current.filter((b: any) => now - b.timestamp < 2000);
      if (boxHistory.length > 0) {
        // Draw every box; bbox_2d may be a single [x1, y1, x2, y2] or an array of such boxes
        const denormalizedBoxes: any[] = [];
        for (const b of boxHistory) {
          if (Array.isArray(b.bbox_2d) && Array.isArray(b.bbox_2d[0])) {
            // Multiple boxes per label
            for (const arr of b.bbox_2d) {
              if (Array.isArray(arr) && arr.length === 4) {
                denormalizedBoxes.push({
                  ...b,
                  bbox_2d: denormalizeBox(arr, displayWidth, displayHeight)
                });
              }
            }
          } else if (Array.isArray(b.bbox_2d) && b.bbox_2d.length === 4) {
            // Single box
            denormalizedBoxes.push({
              ...b,
              bbox_2d: denormalizeBox(b.bbox_2d, displayWidth, displayHeight)
            });
          }
        }
        drawBoundingBoxesOnCanvas(ctx, denormalizedBoxes, { color: "#FF00FF", lineWidth: 4, font: "20px Arial", scaleX: 1, scaleY: 1 });
      }
    };
    draw();
    const interval = setInterval(draw, 100);
    // Redraw on window resize
    const handleResize = () => draw();
    window.addEventListener('resize', handleResize);
    return () => {
      clearInterval(interval);
      window.removeEventListener('resize', handleResize);
    };
  }, [overlayVideoRef, canvasRef]);
  // Drawing loop: draws the latest YOLOv8 boxes every frame
  useEffect(() => {
    let running = true;
    function drawLoop() {
      if (!running) return;
      const overlayVideo = overlayVideoRef.current;
      const canvas = canvasRef.current;
      const processingVideo = processingVideoRef.current;
      if (canvas && overlayVideo && processingVideo) {
        // Set canvas size to match the visible video
        canvas.width = overlayVideo.clientWidth;
        canvas.height = overlayVideo.clientHeight;
        const ctx = canvas.getContext('2d');
        if (ctx) {
          ctx.clearRect(0, 0, canvas.width, canvas.height);
          // Draw all YOLOv8 boxes from the last detection
          const yoloBoxes = lastYoloBoxesRef.current;
          yoloBoxes.forEach((obj: any) => {
            // Scale from YOLOv8 input size to canvas size
            const scaleX = canvas.width / YOLOV8_INPUT_WIDTH;
            const scaleY = canvas.height / YOLOV8_INPUT_HEIGHT;
            const [x1, y1, x2, y2] = obj.bbox;
            const drawX = x1 * scaleX;
            const drawY = y1 * scaleY;
            const drawW = (x2 - x1) * scaleX;
            const drawH = (y2 - y1) * scaleY;
            ctx.strokeStyle = '#00FFFF';
            ctx.lineWidth = 5;
            ctx.strokeRect(drawX, drawY, drawW, drawH);
            ctx.font = 'bold 22px Arial';
            // Draw YOLOv8 label and confidence
            const yoloLabel = obj.label || '';
            const yoloScore = obj.score !== undefined ? ` ${(obj.score * 100).toFixed(1)}%` : '';
            const yoloText = `${yoloLabel}${yoloScore}`;
            ctx.fillStyle = 'rgba(0,0,0,0.7)';
            const yoloTextWidth = ctx.measureText(yoloText).width + 8;
            ctx.fillRect(drawX - 4, drawY - 24, yoloTextWidth, 26);
            ctx.fillStyle = '#00FFFF';
            ctx.fillText(yoloText, drawX, drawY - 4);
            // Draw FastVLM annotation below the box if available
            if (hasAnnotation(obj)) {
              ctx.font = 'bold 18px Arial';
              ctx.fillStyle = 'rgba(0,0,0,0.7)';
              const annTextWidth = ctx.measureText(obj.annotation).width + 8;
              ctx.fillRect(drawX - 4, drawY + drawH + 4, annTextWidth, 24);
              ctx.fillStyle = '#00FFFF';
              ctx.fillText(obj.annotation, drawX, drawY + drawH + 22);
            }
          });
        }
      }
      requestAnimationFrame(drawLoop);
    }
    drawLoop();
    return () => { running = false; };
  }, [overlayVideoRef, canvasRef, processingVideoRef]);
  // YOLOv8 detection loop: runs as fast as possible, updates lastYoloBoxesRef, and triggers FastVLM annotation in the background
  const yoloDetectionLoop = async () => {
    if (!processingLoopRef.current) return;
    if (isYoloBusy) {
      requestAnimationFrame(yoloDetectionLoop);
      return;
    }
    isYoloBusy = true;
    try {
      const processingVideo = processingVideoRef.current;
      if (!processingVideo || processingVideo.paused || processingVideo.ended || processingVideo.videoWidth === 0) {
        // Nothing to process right now; the finally block clears the busy flag and reschedules
        return;
      }
      // Run YOLOv8 detection
      const session = await loadYoloModel();
      const inputTensor = preprocessFrameToTensor(processingVideo);
      const feeds: Record<string, ort.Tensor> = {};
      feeds[session.inputNames[0]] = inputTensor;
      const results = await session.run(feeds);
      const output = results[session.outputNames[0]];
      const detections = postprocessYoloOutput(output);
      lastYoloBoxesRef.current = detections;
      // Run FastVLM on the full frame (wait for YOLOv8 to finish)
      await runInference(processingVideo, prompt, (output: string) => {
        setDebugOutput(output);
      });
    } catch (err) {
      console.error('YOLOv8+FastVLM error:', err);
    } finally {
      isYoloBusy = false;
      requestAnimationFrame(yoloDetectionLoop);
    }
  };
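  // Note: detections stored in lastYoloBoxesRef never carry an `annotation` field in this version, so
  // the per-box FastVLM captions in the draw loop only appear if annotations are attached elsewhere;
  // the full-frame FastVLM output goes to the debug panel via setDebugOutput.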
  // Restart the processing loop whenever the video source or a processing toggle changes
  useEffect(() => {
    // Stop any running loop first
    processingLoopRef.current = false;
    // Start the loop for the correct video after refs update
    setTimeout(() => {
      if (videoProcessing && uploadedFile && isVideoFile(uploadedFile)) {
        processingLoopRef.current = true;
        yoloDetectionLoop();
      } else if (exampleProcessing && !uploadedFile) {
        processingLoopRef.current = true;
        yoloDetectionLoop();
      }
    }, 100);
    // eslint-disable-next-line
  }, [uploadedFile, videoProcessing, exampleProcessing]);
  return (
    <div className="absolute inset-0 text-white">
      <div className="fixed top-0 left-0 w-full bg-gray-900 text-white text-center py-2 z-50">
        {isLoading ? "Loading model..." : isLoaded ? "Model loaded" : modelError ? `Model error: ${modelError}` : "Model not loaded"}
      </div>
      <div className="text-center text-sm text-blue-300 mt-2">{inferenceStatus}</div>
      <div className="flex flex-col items-center justify-center h-full w-full">
        {/* Mode Selector */}
        <div className="mb-6">
          <div className="flex space-x-4">
            {MODES.map((m) => (
              <button
                key={m}
                className={`px-6 py-2 rounded-lg font-semibold transition-all duration-200 ${
                  mode === m ? "bg-blue-600 text-white" : "bg-gray-700 text-gray-300 hover:bg-blue-500"
                }`}
                onClick={() => setMode(m)}
              >
                {m}
              </button>
            ))}
          </div>
        </div>
        {/* Mode Content */}
        <div className="w-full max-w-2xl flex-1 flex flex-col items-center justify-center">
          {mode === "File" && (
            <div className="w-full text-center flex flex-col items-center">
              <div className="mb-4 w-full max-w-xl">
                <label className="block text-left mb-2 font-medium">Detection Prompt:</label>
                <textarea
                  className="w-full p-2 rounded-lg text-black"
                  rows={3}
                  value={prompt}
                  onChange={(e) => setPrompt(e.target.value)}
                />
              </div>
              <div className="mb-4 w-full max-w-xl">
                <input
                  type="file"
                  accept="image/*,video/*"
                  onChange={handleFileChange}
                  className="block w-full text-sm text-gray-300 file:mr-4 file:py-2 file:px-4 file:rounded-lg file:border-0 file:text-sm file:font-semibold file:bg-blue-600 file:text-white hover:file:bg-blue-700"
                />
              </div>
              {/* Toggle visibility of the processing video */}
<div className="mb-2 w-full max-w-xl flex justify-end"> | |
<button | |
className={`px-4 py-1 rounded bg-gray-700 text-white text-xs font-semibold ${showProcessingVideo ? 'bg-blue-600' : ''}`} | |
onClick={() => setShowProcessingVideo(v => !v)} | |
type="button" | |
> | |
{showProcessingVideo ? 'Hide' : 'Show'} Processed Video | |
</button> | |
</div> | |
{/* Show uploaded image */} | |
{uploadedFile && isImageFile(uploadedFile) && ( | |
<div className="relative w-full max-w-xl"> | |
<img | |
ref={imageRef} | |
src={uploadedUrl} | |
alt="Uploaded" | |
className="w-full rounded-lg shadow-lg mb-2" | |
style={{ background: "#222" }} | |
/> | |
<canvas | |
ref={canvasRef} | |
className="absolute top-0 left-0 w-full h-full pointer-events-none" | |
style={{ zIndex: 10, pointerEvents: "none" }} | |
/> | |
<button | |
className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold" | |
onClick={handleProcessImage} | |
disabled={processing} | |
> | |
{processing ? "Processing..." : imageProcessed ? "Reprocess Image" : "Process Image"} | |
</button> | |
</div> | |
)} | |
{/* Show uploaded video */} | |
{uploadedFile && isVideoFile(uploadedFile) && ( | |
<div className="relative w-full max-w-xl" style={{ position: 'relative' }}> | |
{/* Visible overlay video for user */} | |
<video | |
ref={overlayVideoRef} | |
src={uploadedUrl} | |
controls | |
autoPlay | |
loop | |
muted | |
playsInline | |
className="w-full rounded-lg shadow-lg mb-2" | |
style={{ background: "#222", display: "block" }} | |
crossOrigin="anonymous" | |
onLoadedMetadata={(e: React.SyntheticEvent<HTMLVideoElement, Event>) => { | |
if (canvasRef.current) { | |
canvasRef.current.width = e.currentTarget.clientWidth; | |
canvasRef.current.height = e.currentTarget.clientHeight; | |
} | |
}} | |
onResize={() => { | |
if (canvasRef.current && overlayVideoRef.current) { | |
canvasRef.current.width = overlayVideoRef.current.clientWidth; | |
canvasRef.current.height = overlayVideoRef.current.clientHeight; | |
} | |
}} | |
/> | |
{/* Canvas overlay */} | |
<canvas | |
ref={canvasRef} | |
style={{ | |
position: "absolute", | |
top: 0, | |
left: 0, | |
width: "100%", | |
height: "100%", | |
zIndex: 100, | |
pointerEvents: "none", | |
display: "block" | |
}} | |
width={overlayVideoRef.current?.clientWidth || 640} | |
height={overlayVideoRef.current?.clientHeight || 480} | |
/> | |
{/* Hidden or visible processing video for FastVLM/canvas */} | |
<video | |
ref={processingVideoRef} | |
src={uploadedUrl} | |
autoPlay | |
loop | |
muted | |
playsInline | |
crossOrigin="anonymous" | |
style={{ display: showProcessingVideo ? "block" : "none", width: "100%", marginTop: 8, borderRadius: 8, boxShadow: '0 2px 8px #0004' }} | |
onLoadedData={e => { e.currentTarget.play().catch(() => {}); }} | |
/> | |
<button | |
className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold" | |
onClick={handleToggleVideoProcessing} | |
> | |
{videoProcessing ? "Stop Processing" : "Start Processing"} | |
</button> | |
</div> | |
)} | |
{/* Show example video if no file uploaded */} | |
{!uploadedFile && ( | |
<div className="relative w-full max-w-xl" style={{ position: 'relative' }}> | |
{/* Visible overlay video for user */} | |
<video | |
ref={overlayVideoRef} | |
src={EXAMPLE_VIDEO_URL} | |
controls | |
autoPlay | |
loop | |
muted | |
playsInline | |
className="w-full rounded-lg shadow-lg mb-2" | |
style={{ background: "#222", display: "block" }} | |
crossOrigin="anonymous" | |
/> | |
{/* Canvas overlay */} | |
<canvas | |
ref={canvasRef} | |
style={{ | |
position: "absolute", | |
top: 0, | |
left: 0, | |
width: "100%", | |
height: "100%", | |
zIndex: 100, | |
pointerEvents: "none", | |
display: "block" | |
}} | |
width={overlayVideoRef.current?.clientWidth || 640} | |
height={overlayVideoRef.current?.clientHeight || 480} | |
/> | |
{/* Hidden or visible processing video for FastVLM/canvas */} | |
<video | |
ref={processingVideoRef} | |
src={EXAMPLE_VIDEO_URL} | |
autoPlay | |
loop | |
muted | |
playsInline | |
crossOrigin="anonymous" | |
style={{ display: showProcessingVideo ? "block" : "none", width: "100%", marginTop: 8, borderRadius: 8, boxShadow: '0 2px 8px #0004' }} | |
onLoadedData={e => { e.currentTarget.play().catch(() => {}); }} | |
/> | |
<button | |
className="mt-4 px-6 py-2 rounded-lg bg-blue-600 text-white font-semibold" | |
onClick={handleToggleExampleProcessing} | |
> | |
{exampleProcessing ? "Stop Processing" : "Start Processing"} | |
</button> | |
</div> | |
)} | |
{processing && <div className="text-blue-400 mt-2">Processing frame...</div>} | |
{error && <div className="text-red-400 mt-2">Error: {error}</div>} | |
<button | |
className="mt-4 px-6 py-2 rounded-lg bg-gray-600 text-white font-semibold" | |
onClick={handleTestDrawBox} | |
> | |
Test Draw Box | |
</button> | |
<div className="mt-2 p-2 bg-gray-800 rounded text-xs"> | |
<div>Canvas: {canvasDims ? `${canvasDims.w}x${canvasDims.h}` : "-"} | Video: {videoDims ? `${videoDims.w}x${videoDims.h}` : "-"}</div> | |
<div>Raw Model Output:</div> | |
<pre className="overflow-x-auto max-h-32 whitespace-pre-wrap">{debugOutput}</pre> | |
</div> | |
</div> | |
)} | |
</div> | |
</div> | |
</div> | |
); | |
} |