Spaces:

Laudando-Associates-LLC
/

d-fine-demo

Sleeping

File size: 5,651 Bytes

import gradio as gr
from transformers import AutoModel, AutoProcessor
from PIL import Image, ImageDraw, ImageFont
import logging
from datasets import load_dataset

# Configure the root logger so INFO-level progress messages (e.g. model
# loading) are emitted, then create this module's own named logger.
logging.basicConfig(level="INFO")
logger = logging.getLogger(__name__)

class DFineDemo:
    """Gradio demo that compares ground-truth annotations with D-FINE predictions.

    On construction the demo loads the shared processor, every D-FINE model
    variant, and the test split of the pucks dataset. ``launch`` then builds a
    Blocks UI in which a sample image is picked from a gallery and the selected
    model's detections are drawn next to the pre-annotated image.
    """

    # Font used for the confidence labels drawn on the prediction image.
    _FONT_NAME = "DejaVuSans-Bold.ttf"
    _FONT_SIZE = 24

    def __init__(self):
        """Load the processor, all model variants, and the sample images.

        NOTE: ``trust_remote_code=True`` allows the Hub repositories to run
        their own Python code while loading; keep it only for trusted repos.
        """
        self.processor = AutoProcessor.from_pretrained("Laudando-Associates-LLC/d-fine", trust_remote_code=True)

        # Display name (shown in the radio selector) -> Hub repository id.
        self.model_variants = {
            "D-FINE Nano": "Laudando-Associates-LLC/d-fine-nano",
            "D-FINE Small": "Laudando-Associates-LLC/d-fine-small",
            "D-FINE Medium": "Laudando-Associates-LLC/d-fine-medium",
            "D-FINE Large": "Laudando-Associates-LLC/d-fine-large",
            "D-FINE X-Large": "Laudando-Associates-LLC/d-fine-xlarge"
        }

        # All variants are loaded eagerly so switching models needs no reload.
        logger.info("Loading all D-FINE model variants into memory...")
        self.models = {
            name: AutoModel.from_pretrained(repo, trust_remote_code=True)
            for name, repo in self.model_variants.items()
        }

        # Gallery label -> {"input": raw image, "annotated": ground-truth image}.
        dataset = load_dataset("Laudando-Associates-LLC/pucks", split="test")
        self.image_cache = {
            f"Test Image {i+1}": {
                "input": example["image"],
                "annotated": example["annotated_image"]
            }
            for i, example in enumerate(dataset)
        }
        # Ordered labels; the gallery index maps back to a label through this list.
        self.image_labels = list(self.image_cache.keys())

    @classmethod
    def _load_font(cls):
        """Return the label font, falling back to Pillow's built-in font."""
        try:
            return ImageFont.truetype(cls._FONT_NAME, size=cls._FONT_SIZE)
        except OSError:
            # The TrueType file is not installed on every host; a missing font
            # must not make every inference request fail.
            logger.warning("Font %s not available; using Pillow's default font.", cls._FONT_NAME)
            return ImageFont.load_default()

    def run_inference(self, input_image, model_name, threshold):
        """Run the chosen model on ``input_image`` and build the slider output.

        Args:
            input_image: The image to run detection on (a PIL image taken from
                the sample gallery). ``None`` leaves the output unchanged.
            model_name: Key of ``self.models`` selecting the D-FINE variant.
            threshold: Confidence threshold forwarded to the model as
                ``conf_threshold``.

        Returns:
            A ``gr.update`` for the image slider whose value is the pair
            ``(annotated_image, predicted_image)``.
        """
        if input_image is None:
            # Nothing has been selected (e.g. the dataset is empty).
            return gr.update()

        # Find the ground-truth image that belongs to the selected input; if the
        # input is not one of the cached samples, show the input itself instead.
        annotated = next(
            (
                pair["annotated"]
                for pair in self.image_cache.values()
                if pair["input"] == input_image
            ),
            input_image,
        )

        # Predict on a copy so the cached sample image is never drawn on.
        image = input_image.copy()
        inputs = self.processor(image)
        outputs = self.models[model_name](**inputs, conf_threshold=threshold)

        draw = ImageDraw.Draw(image)
        font = self._load_font()
        for result in outputs:
            for box, score in zip(result["boxes"], result["scores"]):
                x1, y1, x2, y2 = box.tolist()
                draw.rectangle([x1, y1, x2, y2], outline="blue", width=5)
                # Put the score just above the box, clamped to the image top.
                draw.text((x1, max(0, y1 - 25)), f"{score:.2f}", fill="blue", font=font)

        # Return: (annotated_image, predicted_image)
        return gr.update(value=(annotated, image), slider_position=50, format="png", type="pil")

    def select_image(self, evt: gr.SelectData):
        """Return the input image for the gallery item that was clicked.

        The ``gr.SelectData`` annotation is what lets Gradio pass the selection
        event to this handler, so it must be kept as-is.

        Returns:
            The selected sample's input image, or a no-op ``gr.update()`` when
            the event carries no index.
        """
        if evt is None or evt.index is None:
            return gr.update()
        label = self.image_labels[evt.index]
        return self.image_cache[label]["input"]

    def launch(self):
        """Build the Blocks interface, wire its events, and start the server."""
        with gr.Blocks(theme=gr.themes.Ocean()) as demo:
            gr.Markdown("""
            ## D-FINE Detection Demo

            This demo compares annotated ground truth data (in **red**) and model predictions (in **blue**).  
            Use the **slider** to visually compare both views:  
            - The **left image** shows the annotated labels.  
            - The **right image** displays predictions from the selected D-FINE model, with each bounding box and its confidence score.

            📂 **Training Dataset**: All D-FINE variants were trained on the [L&A Pucks Dataset](https://huggingface.co/datasets/Laudando-Associates-LLC/pucks) available on Hugging Face.
            """)

            output = gr.ImageSlider(type="pil", label="Detected Output", height=500, width=880, slider_position=50, format="png")

            with gr.Row():
                model_selector = gr.Radio(
                    choices=list(self.model_variants.keys()),
                    label="Choose D-FINE model",
                    value="D-FINE Nano"
                )
                # NOTE(review): 0.955 is not a multiple of the 0.05 step;
                # presumably chosen so 0.95 stays selectable despite float
                # rounding - confirm before changing.
                threshold_slider = gr.Slider(
                    minimum=0.1,
                    maximum=0.955,
                    value=0.4,
                    step=0.05,
                    label="Confidence Threshold"
                )
                run_btn = gr.Button("Run Detection")

            # The state must hold the input image itself (what run_inference
            # expects), not the whole {"input", "annotated"} cache entry.
            # With an empty dataset there is nothing to preselect.
            initial_image = (
                self.image_cache[self.image_labels[0]]["input"]
                if self.image_labels
                else None
            )
            selected_image = gr.State(value=initial_image)

            gr.Markdown("### Select a sample image below:")

            gallery = gr.Gallery(
                value=[(pair["input"], label) for label, pair in self.image_cache.items()],
                label=None,
                show_label=False,
                columns=[3],
                object_fit="cover",
                height="auto",
                allow_preview=False
            )

            # Clicking a thumbnail stores that sample's input image in the state.
            gallery.select(
                fn=self.select_image,
                inputs=[],
                outputs=selected_image
            )

            run_btn.click(
                fn=self.run_inference,
                inputs=[selected_image, model_selector, threshold_slider],
                outputs=output
            )

            gr.Markdown("### Citation")

            gr.Markdown("""
            If you use **D-FINE** or its methods in your work, please cite the following BibTeX entry:

            ```latex
            @misc{peng2024dfine,
                title={D-FINE: Redefine Regression Task in DETRs as Fine-grained Distribution Refinement},
                author={Yansong Peng and Hebei Li and Peixi Wu and Yueyi Zhang and Xiaoyan Sun and Feng Wu},
                year={2024},
                eprint={2410.13842},
                archivePrefix={arXiv},
                primaryClass={cs.CV}
            }
            ```
            """)

        demo.launch()


if __name__ == "__main__":
    # Script entry point: construct the demo (which loads the models and the
    # sample dataset) and immediately serve the interface.
    DFineDemo().launch()