Spaces:
Sleeping
Sleeping
import logging | |
import os | |
import time | |
import numpy as np | |
import pandas as pd | |
from pathlib import Path | |
from dawsonia import io | |
from dawsonia import digitize | |
from dawsonia.ml import ml | |
import pooch | |
import gradio as gr | |
import yaml | |
from gradio_modal import Modal | |
logger = logging.getLogger(__name__)

# Max number of images a user can upload at once (overridable through the
# MAX_IMAGES environment variable).
MAX_IMAGES = int(os.environ.get("MAX_IMAGES", 5))

# Setup the cache directory to point to the directory where the example images
# are located. The images must lie in the cache directory because otherwise they
# have to be reuploaded when drag-and-dropped to the input image widget.
GRADIO_CACHE = ".gradio_cache"
DATA_CACHE = os.path.join(GRADIO_CACHE, "data")
EXAMPLES_DIRECTORY = os.path.join(os.getcwd(), "examples")

# Example books: name -> keyword arguments for `pooch.retrieve` (download URL
# and the expected checksum of the downloaded archive).
PIPELINES: dict[str, dict[str, str]] = {
    "bjuröklubb": dict(
        url="https://git.smhi.se/ai-for-obs/data/-/raw/688c04f13e8e946962792fe4b4e0ded98800b154/raw_zarr/BJUR%C3%96KLUBB/DAGBOK_Bjur%C3%B6klubb_Station_Jan-Dec_1928.zarr.zip",
        known_hash="sha256:6d87b7f79836ae6373cfab11260fe28787d93fe16199fefede6697ccd750f71a",
    )
}

# Only an already-set, *different* GRADIO_CACHE_DIR is overridden; an unset
# variable is left unset.
if os.environ.get("GRADIO_CACHE_DIR", GRADIO_CACHE) != GRADIO_CACHE:
    os.environ["GRADIO_CACHE_DIR"] = GRADIO_CACHE
    # The '%s' placeholder needs its argument, otherwise the log record shows
    # a literal '%s' instead of the directory.
    logger.warning(
        "Setting GRADIO_CACHE_DIR to '%s' (overriding a previous value).",
        GRADIO_CACHE,
    )
def run_dawsonia(
    table_fmt_config_override, batch_image_gallery, book, progress=gr.Progress()
):
    """Digitize the selected pages of ``book`` and yield the Gradio outputs.

    This is a generator used as a Gradio event handler.

    Args:
        table_fmt_config_override: Text of the custom table-format editor.
            NOTE(review): currently not used by this function; the format is
            always taken from ``book.table_format``.
        batch_image_gallery: Content of the image gallery; only its length is
            used, to decide how many pages are digitized.
        book: The book object stored in the UI state. Its ``table_format``,
            ``station_name`` and ``_name`` attributes are read here.
        progress: Gradio progress tracker.

    Yields:
        A pair ``(collection, gr.skip())`` where ``collection`` is currently
        always an empty list.

    Raises:
        ValueError: If no book or no pages have been selected. Because this is
            a generator, the error surfaces when iteration starts.
    """
    if book is None or not batch_image_gallery:
        raise ValueError("You need to select / upload the pages to digitize")

    progress(0, desc="Dawsonia: starting")

    model_path = Path("data/models/dawsonia/2024-07-02")
    output_path = Path(GRADIO_CACHE, "output")

    print("Dawsonia: digitizing", book)
    table_fmt = book.table_format

    output_path_book = output_path / book.station_name / book._name
    output_path_book.mkdir(exist_ok=True, parents=True)
    # NOTE(review): the directory name is spelled "probablities"; left as is
    # because other code may rely on this exact path -- confirm.
    (output_path_book / "probablities").mkdir(exist_ok=True)

    # One dict per verified table: column name -> empty object array with one
    # slot per row of the table format.
    init_data: list[dict[str, np.ndarray]] = [
        {
            key: np.empty(len(table_fmt.rows), dtype="O")
            for key in table_fmt.columns[table_idx]
        }
        for table_idx in table_fmt.preproc.idx_tables_size_verify
    ]

    for page_number in range(len(batch_image_gallery)):
        output_path_page = output_path_book / str(page_number)
        # Called for its side effects; the return value is not used.
        digitize.digitize_page_and_write_output(
            book,
            init_data,
            # NOTE(review): the offset 3 and the date below are hard-coded;
            # presumably they should follow the "First page" input and the
            # book's real dates -- confirm.
            page_number=page_number + 3,
            date_str="2022-02-02",
            model_path=model_path,
            model_predict=ml.model_predict,
            prob_thresh=0.5,
            output_path_page=output_path_page,
            output_text_fmt=True,
            debug=True,
        )

    collection = []

    time.sleep(1)
    gr.Info("Pages were successfully digitized ✨")
    yield collection, gr.skip()
def all_example_images() -> list[str]:
    """
    Return the example image path of every configured pipeline.

    Each path is ``<EXAMPLES_DIRECTORY>/<pipeline name>.png``, listed in the
    iteration order of ``PIPELINES``.
    """
    image_paths = []
    for pipeline_name in PIPELINES:
        file_name = f"{pipeline_name}.png"
        image_paths.append(os.path.join(EXAMPLES_DIRECTORY, file_name))
    return image_paths
def get_selected_example_image(
    first_page, last_page, event: gr.SelectData
) -> tuple[list, io.Book] | None:
    """
    Load the example book that corresponds to the selected example image.

    The pipeline name is the selected image's original file name without its
    extension. If it is a key of ``PIPELINES``, the book is fetched with
    ``pooch`` into ``DATA_CACHE`` and opened with ``io.read_book``.

    Args:
        first_page: First page index to read.
        last_page: End of the page range. It is passed to ``range`` and is
            therefore exclusive. NOTE(review): the UI labels it "Last page" --
            confirm whether it should be inclusive.
        event: Gradio selection event; ``event.value["image"]["orig_name"]``
            is read.

    Returns:
        ``(images, book)`` where ``images`` holds ``book.read_image(pg)`` for
        every page in the range, or ``None`` when the selected image does not
        match any known pipeline.
    """
    orig_name = event.value["image"]["orig_name"]
    # `Path.stem` copes with names holding several dots or no extension,
    # where unpacking `orig_name.split(".")` into two names raised ValueError.
    name = Path(orig_name).stem
    if name not in PIPELINES:
        return None

    book_path = pooch.retrieve(**PIPELINES[name], path=DATA_CACHE)
    _first, _last, book = io.read_book(book_path)
    book._name = name
    book.size_cell = [1.0, 1.0, 1.0, 1.0]
    return [book.read_image(pg) for pg in range(first_page, last_page)], book
# Example table-format configuration (TOML) for the table-format editor.
# This is a plain string: the previous parenthesised form ended with a trailing
# comma, which silently turned the value into a 1-tuple.
table_fmt_config_override_placeholder = """\
[default]
version = 0

# Default values, but wrote explicitly here. See PreprocConfig class
[default.preproc]
table_modif = true
corr_rotate = true
row_idx_unit = "HOURS"
idx_tables_size_verify = [0, 1]

[version.0]
columns = [
    [
        "term_på_baro",
        "barom",
        "torra_term",
        "våta_term",
        "moln_slag_lägre",
        "moln_mängd_lägre",
        "moln_slag_medel",
        "moln_slag_högre"
    ],
    [
        "moln_het_sol_dimma_nederbörd_total",
        "vind_riktning",
        "vind_beaufort",
        "vind_m_sek",
        "sikt",
        "sjögang",
        "maximi_term",
        "minimi_term",
        "nederbörd_mängd",
        "nederbörd_slag"
    ]
]
name_idx = "tid"
rows = [2, 8, 14, 19, 21]
tables = [
    [5, 8],
    [5, 10],
    [3, 1],
    [4, 2],
    [4, 5]
]
"""
# "Upload" tab: lets the user pick an example book or upload page images,
# optionally edit the table format, and start the digitization.
with gr.Blocks() as submit:
    gr.Markdown("# Upload")
    gr.Markdown(
        "Select or upload the image you want to transcribe. "
        # Show the configured limit instead of a hard-coded "five".
        f"You can upload up to {MAX_IMAGES} images at a time."
    )

    # Per-session state: the loaded book and the result of the last run.
    batch_book_state = gr.State()
    collection_submit_state = gr.State()

    with gr.Group():
        with gr.Row(equal_height=True):
            with gr.Column(scale=5):
                batch_image_gallery = gr.Gallery(
                    file_types=["image"],
                    label="Image to digitize",
                    interactive=True,
                    object_fit="scale-down",
                    scale=10,
                )

            with gr.Column(scale=2):
                first_page = gr.Number(3, label="First page of the book", precision=0)
                last_page = gr.Number(4, label="Last page of the book", precision=0)
                examples = gr.Gallery(
                    all_example_images(),
                    label="Examples",
                    interactive=False,
                    allow_preview=False,
                    object_fit="scale-down",
                    min_width=250,
                )

    # Hidden popup holding the table-format editor.
    with Modal(visible=False) as edit_table_fmt_modal:
        with gr.Column():
            gr.Markdown(
                "## Table format configuration\n"
                "Write a custom table format, overriding the default one. "
                "Close [x] the popup when you are done."
            )
            table_fmt_config_override = gr.Code("", language="python")
            gr.HTML(
                (
                    "<a href='https://dawsonia.readthedocs.io/en/latest/user_guide/misc.html#table-formats' target='_blank'>"
                    "Read the docs for the table-formats spec"
                    "</a>. "
                ),
                padding=False,
                elem_classes="pipeline-help",
            )

    with gr.Row():
        run_button = gr.Button("Digitize", variant="primary", scale=0, min_width=200)
        edit_table_fmt_button = gr.Button(
            "Edit table format", variant="secondary", scale=0, min_width=200
        )

    # All events interactions below

    # Selecting an example loads its pages into the gallery and stores the book.
    examples.select(
        get_selected_example_image,
        (first_page, last_page),
        (batch_image_gallery, batch_book_state),
    )

    def validate_images(images):
        """Clear the gallery and warn when more than MAX_IMAGES were uploaded."""
        if images is not None and len(images) > MAX_IMAGES:
            gr.Warning(f"Maximum images you can upload is set to: {MAX_IMAGES}")
            return gr.update(value=None)
        return images

    # Enforce the upload limit; without this listener `validate_images` was
    # never called and MAX_IMAGES had no effect.
    batch_image_gallery.upload(
        validate_images,
        inputs=batch_image_gallery,
        outputs=batch_image_gallery,
    )

    run_button.click(
        fn=run_dawsonia,
        inputs=[table_fmt_config_override, batch_image_gallery, batch_book_state],
        outputs=[collection_submit_state, batch_image_gallery],
    )

    edit_table_fmt_button.click(lambda: Modal(visible=True), None, edit_table_fmt_modal)