import os
from typing import TypeVar

import gradio as gr
import numpy as np
import supervision as sv
from PIL import Image
from rfdetr import RFDETRNano, RFDETRSmall, RFDETRMedium, RFDETRBase, RFDETRLarge
from rfdetr.detr import RFDETR
from rfdetr.util.coco_classes import COCO_CLASSES
from tqdm import tqdm

from utils.image import calculate_resolution_wh
from utils.video import create_directory, generate_unique_name

# Annotation helpers accept both PIL images and numpy arrays.
ImageType = TypeVar("ImageType", Image.Image, np.ndarray)

MARKDOWN = """
# RF-DETR 🔥

[`[code]`](https://github.com/roboflow/rf-detr)
[`[blog]`](https://blog.roboflow.com/rf-detr)
[`[notebook]`](https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/how-to-finetune-rf-detr-on-detection-dataset.ipynb)

RF-DETR is a real-time, transformer-based object detection model architecture developed
by [Roboflow](https://roboflow.com/) and released under the Apache 2.0 license.
"""

IMAGE_PROCESSING_EXAMPLES = [
    ['https://media.roboflow.com/supervision/image-examples/people-walking.png', 0.3, 1024, "medium"],
    ['https://media.roboflow.com/supervision/image-examples/vehicles.png', 0.3, 1024, "medium"],
    ['https://media.roboflow.com/supervision/image-examples/motorbike.png', 0.3, 1024, "medium"],
    ['https://media.roboflow.com/notebooks/examples/dog-2.jpeg', 0.5, 512, "nano"],
    ['https://media.roboflow.com/notebooks/examples/dog-3.jpeg', 0.5, 512, "nano"],
    ['https://media.roboflow.com/supervision/image-examples/basketball-1.png', 0.5, 512, "nano"],
]
VIDEO_PROCESSING_EXAMPLES = [
    ["videos/people-walking.mp4", 0.3, 1024, "medium"],
    ["videos/vehicles.mp4", 0.3, 1024, "medium"],
]

COLOR = sv.ColorPalette.from_hex([
    "#ffff00", "#ff9b00", "#ff8080", "#ff66b2", "#ff66ff", "#b266ff",
    "#9999ff", "#3399ff", "#66ffff", "#33ff99", "#66ff66", "#99ff00"
])

MAX_VIDEO_LENGTH_SECONDS = 5
VIDEO_SCALE_FACTOR = 0.5
VIDEO_TARGET_DIRECTORY = "tmp"

create_directory(directory_path=VIDEO_TARGET_DIRECTORY)


def detect_and_annotate(
    model: RFDETR,
    image: ImageType,
    confidence: float,
) -> ImageType:
    detections = model.predict(image, threshold=confidence)

    # Scale annotation text and line thickness to the input resolution.
    resolution_wh = calculate_resolution_wh(image)
    text_scale = sv.calculate_optimal_text_scale(resolution_wh=resolution_wh) - 0.2
    thickness = sv.calculate_optimal_line_thickness(resolution_wh=resolution_wh)

    bbox_annotator = sv.BoxAnnotator(color=COLOR, thickness=thickness)
    label_annotator = sv.LabelAnnotator(
        color=COLOR,
        text_color=sv.Color.BLACK,
        text_scale=text_scale
    )

    labels = [
        f"{COCO_CLASSES[class_id]} {score:.2f}"
        for class_id, score
        in zip(detections.class_id, detections.confidence)
    ]

    annotated_image = image.copy()
    annotated_image = bbox_annotator.annotate(annotated_image, detections)
    annotated_image = label_annotator.annotate(annotated_image, detections, labels)
    return annotated_image


def load_model(resolution: int, checkpoint: str) -> RFDETR:
    if checkpoint == "nano":
        return RFDETRNano(resolution=resolution)
    if checkpoint == "small":
        return RFDETRSmall(resolution=resolution)
    if checkpoint == "medium":
        return RFDETRMedium(resolution=resolution)
    if checkpoint == "base":
        return RFDETRBase(resolution=resolution)
    if checkpoint == "large":
        return RFDETRLarge(resolution=resolution)
    raise ValueError(
        f"Unknown checkpoint: {checkpoint}. "
        "Expected one of: nano, small, medium, base, large."
    )


def adjust_resolution(checkpoint: str, resolution: int) -> int:
    # Round the requested resolution to the nearest multiple supported by the
    # selected checkpoint (32 for nano/small/medium, 56 for base/large).
    if checkpoint in {"nano", "small", "medium"}:
        divisor = 32
    elif checkpoint in {"base", "large"}:
        divisor = 56
    else:
        raise ValueError(f"Unknown checkpoint: {checkpoint}")

    remainder = resolution % divisor
    if remainder == 0:
        return resolution
    lower = resolution - remainder
    upper = lower + divisor
    if resolution - lower < upper - resolution:
        return lower
    else:
        return upper


def image_processing_inference(
    input_image: Image.Image,
    confidence: float,
    resolution: int,
    checkpoint: str
):
    resolution = adjust_resolution(checkpoint=checkpoint, resolution=resolution)
    model = load_model(resolution=resolution, checkpoint=checkpoint)
    return detect_and_annotate(model=model, image=input_image, confidence=confidence)


def video_processing_inference(
    input_video: str,
    confidence: float,
    resolution: int,
    checkpoint: str,
):
    resolution = adjust_resolution(checkpoint=checkpoint, resolution=resolution)
    model = load_model(resolution=resolution, checkpoint=checkpoint)

    name = generate_unique_name()
    output_video = os.path.join(VIDEO_TARGET_DIRECTORY, f"{name}.mp4")

    video_info = sv.VideoInfo.from_video_path(input_video)
    video_info.width = int(video_info.width * VIDEO_SCALE_FACTOR)
    video_info.height = int(video_info.height * VIDEO_SCALE_FACTOR)

    # Cap processing at MAX_VIDEO_LENGTH_SECONDS to keep the demo responsive.
    total = min(video_info.total_frames, video_info.fps * MAX_VIDEO_LENGTH_SECONDS)
    frames_generator = sv.get_video_frames_generator(input_video, end=total)

    with sv.VideoSink(output_video, video_info=video_info) as sink:
        for frame in tqdm(frames_generator, total=total):
            annotated_frame = detect_and_annotate(
                model=model,
                image=frame,
                confidence=confidence,
            )
            annotated_frame = sv.scale_image(annotated_frame, VIDEO_SCALE_FACTOR)
            sink.write_frame(annotated_frame)

    return output_video


with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    with gr.Tab("Image"):
        with gr.Row():
            image_processing_input_image = gr.Image(
                label="Upload image",
                image_mode='RGB',
                type='pil',
                height=600
            )
            image_processing_output_image = gr.Image(
                label="Output image",
                image_mode='RGB',
                type='pil',
                height=600
            )
        with gr.Row():
            with gr.Column():
                image_processing_confidence_slider = gr.Slider(
                    label="Confidence",
                    minimum=0.0,
                    maximum=1.0,
                    step=0.05,
                    value=0.5,
                )
                image_processing_resolution_slider = gr.Slider(
                    label="Inference resolution",
                    minimum=224,
                    maximum=2240,
                    step=1,
                    value=896,
                )
                image_processing_checkpoint_dropdown = gr.Dropdown(
                    label="Checkpoint",
                    choices=["nano", "small", "medium"],
                    value="medium"
                )
            with gr.Column():
                image_processing_submit_button = gr.Button("Submit", variant="primary")

        gr.Examples(
            fn=image_processing_inference,
            examples=IMAGE_PROCESSING_EXAMPLES,
            inputs=[
                image_processing_input_image,
                image_processing_confidence_slider,
                image_processing_resolution_slider,
                image_processing_checkpoint_dropdown
            ],
            outputs=image_processing_output_image,
        )

        image_processing_submit_button.click(
            image_processing_inference,
            inputs=[
                image_processing_input_image,
                image_processing_confidence_slider,
                image_processing_resolution_slider,
                image_processing_checkpoint_dropdown
            ],
            outputs=image_processing_output_image,
        )

    with gr.Tab("Video"):
        with gr.Row():
            video_processing_input_video = gr.Video(
                label='Upload video',
                height=600
            )
            video_processing_output_video = gr.Video(
                label='Output video',
                height=600
            )
        with gr.Row():
            with gr.Column():
                video_processing_confidence_slider = gr.Slider(
                    label="Confidence",
                    minimum=0.0,
                    maximum=1.0,
                    step=0.05,
                    value=0.5,
                )
                video_processing_resolution_slider = gr.Slider(
                    label="Inference resolution",
                    minimum=560,
                    maximum=1120,
                    step=56,
                    value=728,
                )
                video_processing_checkpoint_dropdown = gr.Dropdown(
                    label="Checkpoint",
                    choices=["nano", "small", "medium"],
                    value="medium"
                )
            with gr.Column():
                video_processing_submit_button = gr.Button("Submit", variant="primary")

        gr.Examples(
            fn=video_processing_inference,
            examples=VIDEO_PROCESSING_EXAMPLES,
            inputs=[
                video_processing_input_video,
                video_processing_confidence_slider,
                video_processing_resolution_slider,
                video_processing_checkpoint_dropdown
            ],
            outputs=video_processing_output_video
        )

        video_processing_submit_button.click(
            video_processing_inference,
            inputs=[
                video_processing_input_video,
                video_processing_confidence_slider,
                video_processing_resolution_slider,
                video_processing_checkpoint_dropdown
            ],
            outputs=video_processing_output_video
        )

demo.launch(debug=False, show_error=True)