Spaces:

Yuxihenry
/

SpatialTrackerV2

Running on Zero

App Files Files Community

xiaoyuxi committed on Jun 23

Commit

bf11931

1 Parent(s): 07b4d20

vggt

Browse files

Files changed (2) hide show

.gitattributes +1 -0
app.py +312 -4

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.mp4 filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -1,6 +1,314 @@
-import os
 import gradio as gr
-hf_token = os.getenv("HF_TOKEN")
-demo = gr.load(name="Yuxihenry/SpatialTrackerV2_Backend", hf_token=hf_token, src="spaces")
-demo.launch()

 import gradio as gr
+import os
+import json
+import numpy as np
+import cv2
+import base64
+from typing import List, Tuple
+# Backend Space ID ("owner/name"); the handlers below pass it to gr.load as f"spaces/{BACKEND_SPACE_URL}"
+BACKEND_SPACE_URL = "Yuxihenry/SpatialTrackerV2_Backend"  # Replace with your own backend Space ID if needed
+hf_token = os.getenv("HF_TOKEN")  # Read from the HF_TOKEN environment variable (None when unset); do not hard-code a token here
+def numpy_to_base64(arr):
+    """Encode the raw bytes of a numpy array as a base64 text string.
+
+    Only the buffer returned by ``arr.tobytes()`` is encoded; the array's
+    shape and dtype are not part of the result.
+    """
+    raw_bytes = arr.tobytes()
+    encoded = base64.b64encode(raw_bytes)
+    return encoded.decode('utf-8')
+def base64_to_numpy(b64_str, shape, dtype):
+    """Rebuild a numpy array from a base64 string of its raw bytes.
+
+    The decoded buffer is interpreted with ``dtype`` and reshaped to
+    ``shape``; both must be supplied by the caller because they are not
+    carried in ``b64_str``.
+    """
+    raw_bytes = base64.b64decode(b64_str)
+    flat = np.frombuffer(raw_bytes, dtype=dtype)
+    return flat.reshape(shape)
+def base64_to_image(b64_str):
+    """Decode a base64-encoded image file into an RGB numpy array.
+
+    Returns ``None`` when ``b64_str`` is empty/``None`` or when any decoding
+    step raises; in the latter case the error is printed.
+    """
+    if b64_str:
+        try:
+            encoded_bytes = base64.b64decode(b64_str)
+            byte_array = np.frombuffer(encoded_bytes, np.uint8)
+            # cv2.imdecode yields BGR channel order, so swap to RGB before returning
+            bgr_image = cv2.imdecode(byte_array, cv2.IMREAD_COLOR)
+            return cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
+        except Exception as e:
+            print(f"Error converting base64 to image: {e}")
+    return None
+def get_video_name(video_path):
+    """Return the file name of ``video_path`` with its last extension removed."""
+    file_name = os.path.basename(video_path)
+    stem, _extension = os.path.splitext(file_name)
+    return stem
+def handle_video_upload(video):
+    """Send a video to the backend Space and return its first-frame data.
+
+    Args:
+        video: Value forwarded unchanged to the backend ``upload_video_api``;
+            ``None`` when no video is set.
+
+    Returns:
+        A 6-tuple ``(original_image_state, display_image, selected_points,
+        grid_size, vo_points, fps)``. ``display_image`` is the backend's
+        base64 image decoded with ``base64_to_image``. When ``video`` is
+        ``None`` or the backend call raises, the fallback
+        ``(None, None, [], 50, 756, 3)`` is returned instead.
+    """
+    # One fallback for both "no video" and "backend error" so callers can
+    # always unpack six values (the None branch used to return only three).
+    fallback = (None, None, [], 50, 756, 3)
+    if video is None:
+        return fallback
+    try:
+        # Load backend API
+        backend_api = gr.load(f"spaces/{BACKEND_SPACE_URL}", hf_token=hf_token)
+        # Call backend upload API
+        original_image_state, display_image_b64, selected_points, grid_size_val, vo_points_val, fps_val = backend_api.upload_video_api(video)
+        # Convert base64 image back to numpy array
+        display_image = base64_to_image(display_image_b64)
+        return original_image_state, display_image, selected_points, grid_size_val, vo_points_val, fps_val
+    except Exception as e:
+        print(f"Error in handle_video_upload: {e}")
+        return fallback
+def select_point(original_img: str, sel_pix: list, point_type: str, evt: gr.SelectData):
+    """Forward a click on the interactive frame to the backend ``select_point_api``.
+
+    The click coordinates are taken from ``evt.index[0]`` and ``evt.index[1]``.
+    Returns ``(display_image, selected_points)``; the image is the backend's
+    base64 result decoded with ``base64_to_image``. With no original image the
+    result is ``(None, [])``; on any error it is ``(None, sel_pix)``.
+    """
+    if original_img is None:
+        return None, []
+    try:
+        remote = gr.load(f"spaces/{BACKEND_SPACE_URL}", hf_token=hf_token)
+        api_call = remote.select_point_api
+        click_first, click_second = evt.index[0], evt.index[1]
+        frame_b64, updated_points = api_call(
+            original_img, sel_pix, point_type, click_first, click_second
+        )
+        # The backend sends the annotated frame as base64; decode it for gr.Image
+        annotated_frame = base64_to_image(frame_b64)
+        return annotated_frame, updated_points
+    except Exception as e:
+        print(f"Error in select_point: {e}")
+        return None, sel_pix
+def reset_points(original_img: str, sel_pix):
+    """Ask the backend ``reset_points_api`` to reset the selected points.
+
+    Returns ``(display_image, selected_points)`` as produced by the backend,
+    with the image decoded from base64. When there is no original image, or
+    when the backend call fails, ``(None, [])`` is returned.
+    """
+    if original_img is None:
+        return None, []
+    try:
+        remote = gr.load(f"spaces/{BACKEND_SPACE_URL}", hf_token=hf_token)
+        frame_b64, cleared_points = remote.reset_points_api(original_img, sel_pix)
+        # The backend sends the frame as base64; decode it for gr.Image
+        refreshed_frame = base64_to_image(frame_b64)
+        return refreshed_frame, cleared_points
+    except Exception as e:
+        print(f"Error in reset_points: {e}")
+        return None, []
+def launch_viz(grid_size, vo_points, fps, original_image_state):
+    """Run the backend ``run_tracker_api`` and return its two results.
+
+    Returns ``(viz_iframe_html, track_video_path)`` exactly as the backend
+    provides them, or ``(None, None)`` when there is no image state or the
+    backend call raises.
+    """
+    if original_image_state is None:
+        return None, None
+    try:
+        remote = gr.load(f"spaces/{BACKEND_SPACE_URL}", hf_token=hf_token)
+        iframe_html, video_path = remote.run_tracker_api(
+            grid_size, vo_points, fps, original_image_state
+        )
+    except Exception as e:
+        print(f"Error in launch_viz: {e}")
+        return None, None
+    return iframe_html, video_path
+def clear_all():
+    """Return cleared values: ``None``, ``None`` and a new empty list.
+
+    This function only returns values; it does not delete any files itself.
+    """
+    cleared_video = None
+    cleared_frame = None
+    cleared_points = []
+    return cleared_video, cleared_frame, cleared_points
+def update_tracker_model(vo_points):
+    """Placeholder handler: ignores ``vo_points`` and always returns ``None``."""
+    return None  # No output needed
+# Single entry point for a new video value, whether uploaded manually or picked from the examples
+def handle_video_change(video):
+    """Return the six UI values for the current video.
+
+    With a video, the values come from ``handle_video_upload``; with ``None``
+    the defaults ``(None, None, [], 50, 756, 3)`` are returned.
+    """
+    if video is not None:
+        # Delegate first-frame extraction to the upload handler
+        image_state, frame, points, grid_val, vo_val, fps_val = handle_video_upload(video)
+        return image_state, frame, points, grid_val, vo_val, fps_val
+    return None, None, [], 50, 756, 3
+# Build UI: custom CSS, two-column layout, then event bindings to the handlers defined above
+with gr.Blocks(css="""
+    #advanced_settings .wrap {
+        font-size: 14px !important;
+    }
+    #advanced_settings .gr-slider {
+        font-size: 13px !important;
+    }
+    #advanced_settings .gr-slider .gr-label {
+        font-size: 13px !important;
+        margin-bottom: 5px !important;
+    }
+    #advanced_settings .gr-slider .gr-info {
+        font-size: 12px !important;
+    }
+    #point_label_radio .gr-radio-group {
+        flex-direction: row !important;
+        gap: 15px !important;
+    }
+    #point_label_radio .gr-radio-group label {
+        margin-right: 0 !important;
+        margin-bottom: 0 !important;
+    }
+    /* Style for example videos label */
+    .gr-examples .gr-label {
+        font-weight: bold !important;
+        font-size: 16px !important;
+    }
+    /* Simple horizontal scroll for examples */
+    .gr-examples .gr-table-wrapper {
+        overflow-x: auto !important;
+        overflow-y: hidden !important;
+    }
+    .gr-examples .gr-table {
+        display: flex !important;
+        flex-wrap: nowrap !important;
+        min-width: max-content !important;
+    }
+    .gr-examples .gr-table tbody {
+        display: flex !important;
+        flex-direction: row !important;
+        flex-wrap: nowrap !important;
+    }
+    .gr-examples .gr-table tbody tr {
+        display: flex !important;
+        flex-direction: column !important;
+        min-width: 150px !important;
+        margin-right: 10px !important;
+    }
+    .gr-examples .gr-table tbody tr td {
+        text-align: center !important;
+        padding: 5px !important;
+    }
+""") as demo:
+    # Initialize states inside Blocks
+    selected_points = gr.State([])
+    original_image_state = gr.State()  # Store original image in state
+    # Header with usage instructions
+    with gr.Row():
+        gr.Markdown("""
+        # ✨ SpaTrackV2 Frontend (Client)
+        <div style='background-color: #e6f3ff; padding: 20px; border-radius: 10px; margin: 10px 0;'>
+        <h2 style='color: #0066cc; margin-bottom: 15px;'>Instructions:</h2>
+        <ol style='font-size: 20px; line-height: 1.6;'>
+            <li>🎬 Upload a video or select from examples below</li>
+            <li>🎯 Select positive points (green) and negative points (red) on the first frame</li>
+            <li>⚡ Click 'Run Tracker and Visualize' when done</li>
+            <li>🔍 Iterative 3D result will be shown in the visualization</li>
+        </ol>
+        <p style='font-size: 22px;'>❗ This frontend connects to a private backend Space for processing</p>
+        </div>
+        """)
+    with gr.Row():
+        # Left column: video input, interactive frame, point controls and tracker settings
+        with gr.Column(scale=1):
+            video_input = gr.Video(label="Upload Video", format="mp4", height=300)
+            # Move Interactive Frame and 2D Tracking under video upload
+            with gr.Row():
+                display_image = gr.Image(type="numpy", label="📸 Interactive Frame", height=250)
+                track_video = gr.Video(label="🎯 2D Tracking Result", height=250)
+            with gr.Row():
+                fg_bg_radio = gr.Radio(choices=['positive_point', 'negative_point'],
+                                       label='Point label',
+                                       value='positive_point',
+                                       elem_id="point_label_radio")
+                reset_button = gr.Button("Reset points")
+                clear_button = gr.Button("Clear All", variant="secondary")
+            # Slider defaults (50 / 756 / 3) match the fallback values returned by handle_video_change
+            with gr.Accordion("⚙️ Advanced Settings", open=True, elem_id="advanced_settings"):
+                grid_size = gr.Slider(minimum=10, maximum=100, value=50, step=1,
+                                      label="Grid Size", info="Size of the tracking grid")
+                vo_points = gr.Slider(minimum=256, maximum=4096, value=756, step=50,
+                                      label="VO Points", info="Number of points for solving camera pose")
+                fps_slider = gr.Slider(minimum=1, maximum=10, value=3, step=1,
+                                      label="FPS", info="FPS of the output video")
+            viz_button = gr.Button("🚀 Run Tracker and Visualize", variant="primary", size="lg")
+        # Right column: example videos and the 3D visualization iframe
+        with gr.Column(scale=2):
+            # Add example videos using gr.Examples
+            examples_component = gr.Examples(
+                examples=[
+                    "examples/kiss.mp4",
+                    "examples/backpack.mp4",
+                    "examples/pillow.mp4",
+                    "examples/hockey.mp4",
+                    "examples/drifting.mp4",
+                    "examples/ken_block_0.mp4",
+                    "examples/ball.mp4",
+                    "examples/kitchen.mp4",
+                    "examples/ego_teaser.mp4",
+                    "examples/ego_kc1.mp4",
+                    "examples/vertical_place.mp4",
+                    "examples/robot_unitree.mp4",
+                    "examples/droid_robot.mp4",
+                    "examples/robot_2.mp4",
+                    "examples/cinema_0.mp4",
+                ],
+                inputs=[video_input],
+                label="📁 Example Videos",
+                examples_per_page=20  # Show all examples on one page to enable scrolling
+            )
+            # Initialize with the template interface showing "Interactive 3D Tracking"
+            # NOTE(review): the iframe src expects _viz/viz_template.html to be served by this app — confirm the file exists and is allowed
+            viz_iframe = gr.HTML("""
+                                <div style='border: 3px solid #667eea; border-radius: 10px; overflow: hidden; box-shadow: 0 8px 32px rgba(102, 126, 234, 0.3);'>
+                                    <iframe id="viz_iframe" src="/gradio_api/file=_viz/viz_template.html" width="100%" height="950px" style="border:none;"></iframe>
+                                </div>
+                                """)
+            # Simple description below the visualization
+            gr.HTML("""
+            <div style='text-align: center; margin-top: 15px; color: #666; font-size: 14px;'>
+                🎮 Interactive 3D visualization adapted from <a href="https://tapip3d.github.io/" target="_blank" style="color: #667eea;">TAPIP3D</a>
+            </div>
+            """)
+    # Bind events
+    # A new video value refreshes the stored state, the frame, the points and all three sliders
+    video_input.change(
+        handle_video_change,
+        inputs=[video_input],
+        outputs=[original_image_state, display_image, selected_points, grid_size, vo_points, fps_slider]
+    )
+    reset_button.click(reset_points,
+                     inputs=[original_image_state, selected_points],
+                     outputs=[display_image, selected_points])
+    # original_image_state is not an output here.
+    # NOTE(review): presumably it is reset through video_input.change once the video is cleared — confirm
+    clear_button.click(clear_all,
+                      outputs=[video_input, display_image, selected_points])
+    display_image.select(select_point,
+                      inputs=[original_image_state, selected_points, fg_bg_radio],
+                      outputs=[display_image, selected_points])
+    # Update tracker model when vo_points changes
+    # (update_tracker_model currently returns None and no outputs are bound, so nothing is updated)
+    vo_points.change(update_tracker_model,
+                    inputs=[vo_points],
+                    outputs=[])
+    viz_button.click(launch_viz,
+                    inputs=[grid_size, vo_points, fps_slider, original_image_state],
+                    outputs=[viz_iframe, track_video],
+                    )
+# Launch the demo only when this file is executed as a script (not when imported)
+if __name__ == "__main__":
+    demo.launch()