---
license: apache-2.0
---
|
|
|
> **Note:** This is a work in progress.
|
|
|
|
|
# Docling Layout Model
|
|
|
`docling-layout-heron` is the layout model of the [Docling project](https://github.com/docling-project/docling).
|
|
|
This model uses the [RT-DETRv2](https://github.com/lyuwenyu/RT-DETR/tree/main/rtdetrv2_pytorch) architecture and has been trained from scratch on a variety of document datasets.
|
|
|
|
|
# Inference code example
|
|
|
Prerequisites:

```bash
pip install transformers Pillow torch requests
```

Prediction:
|
|
|
```python
|
from io import BytesIO

import requests
import torch
from PIL import Image
from transformers import RTDetrImageProcessor, RTDetrV2ForObjectDetection

# Maps the integer class ids predicted by the model to layout label names.
classes_map = {
    0: "Caption",
    1: "Footnote",
    2: "Formula",
    3: "List-item",
    4: "Page-footer",
    5: "Page-header",
    6: "Picture",
    7: "Section-header",
    8: "Table",
    9: "Text",
    10: "Title",
    11: "Document Index",
    12: "Code",
    13: "Checkbox-Selected",
    14: "Checkbox-Unselected",
    15: "Form",
    16: "Key-Value Region",
}
image_url = "https://huggingface.co/spaces/ds4sd/SmolDocling-256M-Demo/resolve/main/example_images/annual_rep_14.png"
model_name = "ds4sd/docling-layout-heron"
threshold = 0.6  # score threshold handed to the post-processing step

# Download the image.
# Use a timeout and an explicit status check so that a stalled connection or
# an HTTP error page fails loudly instead of being handed to PIL as "image"
# bytes. Reading `response.content` (rather than the raw socket stream) also
# lets requests undo any transfer/content encoding first.
response = requests.get(image_url, timeout=30)
response.raise_for_status()
image = Image.open(BytesIO(response.content))
image = image.convert("RGB")

# Initialize the model
image_processor = RTDetrImageProcessor.from_pretrained(model_name)
model = RTDetrV2ForObjectDetection.from_pretrained(model_name)

# Run the prediction pipeline (inference only, so no gradients are tracked)
inputs = image_processor(images=[image], return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
results = image_processor.post_process_object_detection(
    outputs,
    # PIL reports size as (width, height); reverse it to (height, width).
    target_sizes=torch.tensor([image.size[::-1]]),
    threshold=threshold,
)

# Get the results: one entry per input image, each holding parallel
# "scores", "labels" and "boxes" sequences.
for result in results:
    for score, label_id, box in zip(
        result["scores"], result["labels"], result["boxes"]
    ):
        score = round(score.item(), 2)
        label = classes_map[label_id.item()]
        box = [round(i, 2) for i in box.tolist()]
        print(f"{label}:{score} {box}")
|
```
|
|
|
|
|
# References
|
|
|
```
@techreport{Docling,
  author = {Deep Search Team},
  month = {8},
  title = {Docling Technical Report},
  url = {https://arxiv.org/abs/2408.09869v4},
  eprint = {2408.09869},
  doi = {10.48550/arXiv.2408.09869},
  version = {1.0.0},
  year = {2024}
}

@misc{lv2024rtdetrv2improvedbaselinebagoffreebies,
  title={RT-DETRv2: Improved Baseline with Bag-of-Freebies for Real-Time Detection Transformer},
  author={Wenyu Lv and Yian Zhao and Qinyao Chang and Kui Huang and Guanzhong Wang and Yi Liu},
  year={2024},
  eprint={2407.17140},
  archivePrefix={arXiv},
  primaryClass={cs.CV},
  url={https://arxiv.org/abs/2407.17140},
}
```