Spaces:
Running
Running
Ashwin V. Mohanan
committed on
Commit
·
4433b8c
1
Parent(s):
ee45a15
Refactor event listener functions
Browse files- app/tabs/submit.py +1 -255
- app/tabs/submit_functions.py +259 -0
app/tabs/submit.py
CHANGED
@@ -1,251 +1,13 @@
|
|
1 |
-
import json
|
2 |
import logging
|
3 |
-
import os
|
4 |
from pathlib import Path
|
5 |
-
import shutil
|
6 |
-
import warnings
|
7 |
|
8 |
-
from PIL import Image
|
9 |
-
from dawsonia import io
|
10 |
-
from dawsonia import digitize
|
11 |
-
from dawsonia.ml import ml
|
12 |
-
from dawsonia.typing import Probability
|
13 |
import gradio as gr
|
14 |
from gradio_modal import Modal
|
15 |
-
import numpy as np
|
16 |
-
from numpy.typing import NDArray
|
17 |
-
import pandas as pd
|
18 |
-
import pooch
|
19 |
|
20 |
-
from .
|
21 |
|
22 |
logger = logging.getLogger(__name__)
|
23 |
|
24 |
-
# Max number of images a user can upload at once
|
25 |
-
MAX_IMAGES = int(os.environ.get("MAX_IMAGES", 5))
|
26 |
-
|
27 |
-
# Setup the cache directory to point to the directory where the example images
|
28 |
-
# are located. The images must lay in the cache directory because otherwise they
|
29 |
-
# have to be reuploaded when drag-and-dropped to the input image widget.
|
30 |
-
GRADIO_CACHE = os.getenv("GRADIO_CACHE_DIR", ".gradio_cache")
|
31 |
-
DATA_CACHE = os.path.join(GRADIO_CACHE, "data")
|
32 |
-
EXAMPLES_DIRECTORY = os.path.join(os.getcwd(), "examples")
|
33 |
-
|
34 |
-
# Example books
|
35 |
-
PIPELINES: dict[str, dict[str, str]] = {
|
36 |
-
"bjuröklubb": dict(
|
37 |
-
url="https://git.smhi.se/ai-for-obs/data/-/raw/688c04f13e8e946962792fe4b4e0ded98800b154/raw_zarr/BJUR%C3%96KLUBB/DAGBOK_Bjur%C3%B6klubb_Station_Jan-Dec_1928.zarr.zip",
|
38 |
-
known_hash="sha256:6d87b7f79836ae6373cfab11260fe28787d93fe16199fefede6697ccd750f71a",
|
39 |
-
),
|
40 |
-
"härnösand": dict(
|
41 |
-
url="https://git.smhi.se/ai-for-obs/data/-/raw/688c04f13e8e946962792fe4b4e0ded98800b154/raw_zarr/H%C3%84RN%C3%96SAND/DAGBOK_H%C3%A4rn%C3%B6sand_Station_1934.zarr.zip",
|
42 |
-
known_hash="sha256:a58fdb6521214d0bd569c9325ce78d696738de28ce6ec869cde0d46616b697f2",
|
43 |
-
),
|
44 |
-
}
|
45 |
-
|
46 |
-
|
47 |
-
def run_dawsonia(
|
48 |
-
table_fmt_config_override,
|
49 |
-
first_page,
|
50 |
-
last_page,
|
51 |
-
prob_thresh,
|
52 |
-
book: io.Book,
|
53 |
-
book_path,
|
54 |
-
gallery,
|
55 |
-
progress=gr.Progress(),
|
56 |
-
):
|
57 |
-
if book is None:
|
58 |
-
raise ValueError("You need to select / upload the pages to digitize")
|
59 |
-
|
60 |
-
progress(0, desc="Dawsonia: starting")
|
61 |
-
|
62 |
-
model_path = Path("data/models/dawsonia/2024-07-02")
|
63 |
-
output_path = Path("output")
|
64 |
-
output_path.mkdir(exist_ok=True)
|
65 |
-
|
66 |
-
print("Dawsonia: digitizing", book)
|
67 |
-
table_fmt = book.table_format
|
68 |
-
|
69 |
-
final_output_path_book = output_path / book.station_name
|
70 |
-
output_path_book = Path(book_path).parent / "output"
|
71 |
-
output_path_book.mkdir(exist_ok=True, parents=True)
|
72 |
-
(output_path_book / "probablities").mkdir(exist_ok=True)
|
73 |
-
|
74 |
-
init_data: list[dict[str, NDArray]] = [
|
75 |
-
{
|
76 |
-
key: np.empty(len(table_fmt.rows), dtype="O")
|
77 |
-
for key in table_fmt.columns[table_idx]
|
78 |
-
}
|
79 |
-
for table_idx in table_fmt.preproc.idx_tables_size_verify
|
80 |
-
]
|
81 |
-
|
82 |
-
collection = []
|
83 |
-
images = []
|
84 |
-
|
85 |
-
with warnings.catch_warnings():
|
86 |
-
warnings.simplefilter("ignore", FutureWarning)
|
87 |
-
for page_number, im_from_gallery in zip(range(first_page, last_page), gallery):
|
88 |
-
output_path_page = output_path_book / str(page_number)
|
89 |
-
gr.Info(f"Digitizing {page_number = }")
|
90 |
-
|
91 |
-
if (
|
92 |
-
not (output_path_book / str(page_number))
|
93 |
-
.with_suffix(".parquet")
|
94 |
-
.exists()
|
95 |
-
):
|
96 |
-
digitize.digitize_page_and_write_output(
|
97 |
-
book,
|
98 |
-
init_data,
|
99 |
-
page_number=page_number,
|
100 |
-
date_str=f"0000-page-{page_number}",
|
101 |
-
model_path=model_path,
|
102 |
-
model_predict=ml.model_predict,
|
103 |
-
prob_thresh=prob_thresh,
|
104 |
-
output_path_page=output_path_page,
|
105 |
-
output_text_fmt=False,
|
106 |
-
debug=False,
|
107 |
-
)
|
108 |
-
progress_value = (page_number - first_page) / max(1, last_page - first_page)
|
109 |
-
|
110 |
-
if results := read_page(
|
111 |
-
output_path_book,
|
112 |
-
str(page_number),
|
113 |
-
prob_thresh,
|
114 |
-
progress,
|
115 |
-
progress_value,
|
116 |
-
table_fmt.preproc.idx_tables_size_verify,
|
117 |
-
): # , im_from_gallery[0])
|
118 |
-
page, im = results
|
119 |
-
collection.append(page)
|
120 |
-
images.append(im)
|
121 |
-
else:
|
122 |
-
gr.Info(f"No tables detected in {page_number = }")
|
123 |
-
|
124 |
-
if final_output_path_book.exists():
|
125 |
-
shutil.rmtree(final_output_path_book)
|
126 |
-
|
127 |
-
shutil.copytree(output_path_book, final_output_path_book)
|
128 |
-
gr.Info("Pages were succesfully digitized ✨")
|
129 |
-
|
130 |
-
# yield collection, images
|
131 |
-
yield collection, gr.skip()
|
132 |
-
|
133 |
-
|
134 |
-
def read_page(
|
135 |
-
output_path_book: Path,
|
136 |
-
prefix: str,
|
137 |
-
prob_thresh: float,
|
138 |
-
progress,
|
139 |
-
progress_value,
|
140 |
-
idx_tables_size_verify: list[int],
|
141 |
-
im_path_from_gallery: str = "",
|
142 |
-
):
|
143 |
-
stats = digitize.Statistics.from_json(
|
144 |
-
(output_path_book / "statistics" / prefix).with_suffix(".json")
|
145 |
-
)
|
146 |
-
print(stats)
|
147 |
-
progress(progress_value, desc=f"Dawsonia: {stats!s:.50}")
|
148 |
-
if stats.tables_detected > 0:
|
149 |
-
values_df = pd.read_parquet((output_path_book / prefix).with_suffix(".parquet"))
|
150 |
-
prob_df = pd.read_parquet(
|
151 |
-
(output_path_book / "probablities" / prefix).with_suffix(".parquet")
|
152 |
-
)
|
153 |
-
table_meta = json.loads(
|
154 |
-
(output_path_book / "table_meta" / prefix).with_suffix(".json").read_text()
|
155 |
-
)
|
156 |
-
with Image.open(
|
157 |
-
image_path := (output_path_book / "pages" / prefix).with_suffix(".webp")
|
158 |
-
) as im:
|
159 |
-
width = im.width
|
160 |
-
height = im.height
|
161 |
-
|
162 |
-
values_array = values_df.values.flatten()
|
163 |
-
prob_array = prob_df.values.flatten()
|
164 |
-
# FIXME: hardcoded. Use idx_tables_size_verify and reconstruct bbox_array
|
165 |
-
bbox_array = np.hstack(table_meta["table_positions"][:2]).reshape(-1, 4)\
|
166 |
-
|
167 |
-
cells = [
|
168 |
-
make_cell(value, bbox)
|
169 |
-
for value, prob, bbox in zip(values_array, prob_array, bbox_array)
|
170 |
-
if prob > prob_thresh
|
171 |
-
]
|
172 |
-
|
173 |
-
return Page(width, height, cells, im_path_from_gallery or str(image_path)), im
|
174 |
-
|
175 |
-
|
176 |
-
def make_cell(value: str, bbox: NDArray[np.int64]):
|
177 |
-
y, x, h, w = bbox
|
178 |
-
xmin, ymin = x - w // 2, y - h // 2
|
179 |
-
xmax, ymax = x + w // 2, y + h // 2
|
180 |
-
polygon = (xmin, ymin), (xmax, ymin), (xmax, ymax), (xmin, ymax), (xmin, ymin)
|
181 |
-
return TableCell(polygon, text_x=x - w // 4, text_y=y, text=value)
|
182 |
-
|
183 |
-
|
184 |
-
def all_example_images() -> list[str]:
|
185 |
-
"""
|
186 |
-
Get paths to all example images.
|
187 |
-
"""
|
188 |
-
examples = [
|
189 |
-
os.path.join(EXAMPLES_DIRECTORY, f"{pipeline}.png") for pipeline in PIPELINES
|
190 |
-
]
|
191 |
-
return examples
|
192 |
-
|
193 |
-
|
194 |
-
def get_selected_example_image(
|
195 |
-
first_page, last_page, event: gr.SelectData
|
196 |
-
) -> tuple[list[Image.Image], io.Book, str, str, str] | None:
|
197 |
-
"""
|
198 |
-
Get the name of the pipeline that corresponds to the selected image.
|
199 |
-
"""
|
200 |
-
orig_name = event.value["image"]["orig_name"]
|
201 |
-
# for name, details in PIPELINES.items():
|
202 |
-
name, _ext = orig_name.split(".")
|
203 |
-
|
204 |
-
station_tf = Path("table_formats", name).with_suffix(".toml")
|
205 |
-
|
206 |
-
if (last_page - first_page) > MAX_IMAGES:
|
207 |
-
raise ValueError(f"Maximum images you can digitize is set to: {MAX_IMAGES}")
|
208 |
-
|
209 |
-
if name in PIPELINES:
|
210 |
-
book_path = pooch.retrieve(**PIPELINES[name], path=DATA_CACHE)
|
211 |
-
first, last, book = io.read_book(book_path)
|
212 |
-
book._name = name
|
213 |
-
book.size_cell = [1.0, 1.0, 1.0, 1.0]
|
214 |
-
return (
|
215 |
-
[book.read_image(pg) for pg in range(first_page, last_page)],
|
216 |
-
book,
|
217 |
-
book_path,
|
218 |
-
station_tf.name,
|
219 |
-
station_tf.read_text(),
|
220 |
-
)
|
221 |
-
|
222 |
-
def get_uploaded_image(
|
223 |
-
first_page:int, last_page:int, table_fmt_filename:str, filename: str
|
224 |
-
) -> tuple[list[NDArray], io.Book, str, str] | None:
|
225 |
-
|
226 |
-
name, _ext = filename.split(".")
|
227 |
-
station_tf = Path("table_formats", table_fmt_filename)
|
228 |
-
if not station_tf.exists():
|
229 |
-
station_tf = Path("table_formats", "bjuröklubb.toml")
|
230 |
-
|
231 |
-
first, last, book = io.read_book(Path(filename))
|
232 |
-
book._name = name
|
233 |
-
book.size_cell = [1.0, 1.0, 1.0, 1.0]
|
234 |
-
return (
|
235 |
-
[book.read_page(pg) for pg in range(first_page, last_page)],
|
236 |
-
book,
|
237 |
-
filename,
|
238 |
-
station_tf.read_text(),
|
239 |
-
)
|
240 |
-
|
241 |
-
def overwrite_table_format_file(book: io.Book, book_path, table_fmt: str):
|
242 |
-
name = book.station_name
|
243 |
-
table_fmt_dir = Path("table_formats")
|
244 |
-
(table_fmt_dir / name).with_suffix(".toml").write_text(table_fmt)
|
245 |
-
book.table_format = io.read_specific_table_format(table_fmt_dir, Path(book_path))
|
246 |
-
gr.Info(f"Overwritten table format file for {name}")
|
247 |
-
return book
|
248 |
-
|
249 |
|
250 |
with gr.Blocks() as submit:
|
251 |
gr.Markdown(
|
@@ -293,22 +55,6 @@ with gr.Blocks() as submit:
|
|
293 |
|
294 |
# upload_file_true_path = gr.Textbox(visible=False)
|
295 |
|
296 |
-
def move_uploaded_file(uploaded, table_fmt_filename):
|
297 |
-
current_directory = Path(uploaded).parent
|
298 |
-
|
299 |
-
# Define the target directory where you want to save the uploaded files
|
300 |
-
target_directory = current_directory / table_fmt_filename.removesuffix(".toml")
|
301 |
-
os.makedirs(target_directory, exist_ok=True)
|
302 |
-
|
303 |
-
# Move the uploaded file to the target directory
|
304 |
-
true_path = Path(target_directory / Path(uploaded).name)
|
305 |
-
# if true_path.exists():
|
306 |
-
# true_path.unlink()
|
307 |
-
|
308 |
-
shutil.copy2(uploaded, true_path)
|
309 |
-
print(f"Copy created", true_path)
|
310 |
-
return str(true_path)
|
311 |
-
|
312 |
upload_button = gr.Button(value="Upload", min_width=200)
|
313 |
|
314 |
with Modal(visible=False) as edit_table_fmt_modal:
|
|
|
|
|
1 |
import logging
|
|
|
2 |
from pathlib import Path
|
|
|
|
|
3 |
|
|
|
|
|
|
|
|
|
|
|
4 |
import gradio as gr
|
5 |
from gradio_modal import Modal
|
|
|
|
|
|
|
|
|
6 |
|
7 |
+
from .submit_functions import all_example_images, get_selected_example_image, move_uploaded_file, get_uploaded_image, run_dawsonia, overwrite_table_format_file
|
8 |
|
9 |
logger = logging.getLogger(__name__)
|
10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
with gr.Blocks() as submit:
|
13 |
gr.Markdown(
|
|
|
55 |
|
56 |
# upload_file_true_path = gr.Textbox(visible=False)
|
57 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
upload_button = gr.Button(value="Upload", min_width=200)
|
59 |
|
60 |
with Modal(visible=False) as edit_table_fmt_modal:
|
app/tabs/submit_functions.py
ADDED
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
from pathlib import Path
|
4 |
+
import shutil
|
5 |
+
import warnings
|
6 |
+
|
7 |
+
from PIL import Image
|
8 |
+
from dawsonia import io
|
9 |
+
from dawsonia import digitize
|
10 |
+
from dawsonia.ml import ml
|
11 |
+
import gradio as gr
|
12 |
+
import numpy as np
|
13 |
+
from numpy.typing import NDArray
|
14 |
+
import pandas as pd
|
15 |
+
import pooch
|
16 |
+
|
17 |
+
from .visualizer import Page, TableCell
|
18 |
+
|
19 |
+
# Maximum number of images a user can upload at once; can be overridden with
# the MAX_IMAGES environment variable.
MAX_IMAGES = int(os.environ.get("MAX_IMAGES", 5))

# Set up the cache directory to point to the directory where the example
# images are located. The images must lie in the cache directory because
# otherwise they have to be re-uploaded when drag-and-dropped onto the input
# image widget.
GRADIO_CACHE = os.environ.get("GRADIO_CACHE_DIR", ".gradio_cache")
DATA_CACHE = os.path.join(GRADIO_CACHE, "data")
EXAMPLES_DIRECTORY = os.path.join(os.getcwd(), "examples")

# Example books, keyed by station name. Each entry holds the keyword
# arguments (download URL and expected checksum) that are unpacked into
# ``pooch.retrieve``.
PIPELINES: dict[str, dict[str, str]] = {
    "bjuröklubb": {
        "url": "https://git.smhi.se/ai-for-obs/data/-/raw/688c04f13e8e946962792fe4b4e0ded98800b154/raw_zarr/BJUR%C3%96KLUBB/DAGBOK_Bjur%C3%B6klubb_Station_Jan-Dec_1928.zarr.zip",
        "known_hash": "sha256:6d87b7f79836ae6373cfab11260fe28787d93fe16199fefede6697ccd750f71a",
    },
    "härnösand": {
        "url": "https://git.smhi.se/ai-for-obs/data/-/raw/688c04f13e8e946962792fe4b4e0ded98800b154/raw_zarr/H%C3%84RN%C3%96SAND/DAGBOK_H%C3%A4rn%C3%B6sand_Station_1934.zarr.zip",
        "known_hash": "sha256:a58fdb6521214d0bd569c9325ce78d696738de28ce6ec869cde0d46616b697f2",
    },
}
|
40 |
+
|
41 |
+
|
42 |
+
def run_dawsonia(
    table_fmt_config_override,
    first_page,
    last_page,
    prob_thresh,
    book: io.Book,
    book_path,
    gallery,
    progress=gr.Progress(),
):
    """Digitize the selected pages of ``book`` and yield the digitized pages.

    For each page number in ``range(first_page, last_page)`` (capped by the
    number of entries in ``gallery``), the page is digitized unless a
    ``<page_number>.parquet`` file already exists in the working output
    directory. The stored results are then loaded with ``read_page``. Finally
    the working output directory is copied to ``output/<book.station_name>``,
    replacing any previous copy.

    Args:
        table_fmt_config_override: Not used by this function.
        first_page: First page number to process (inclusive).
        last_page: Page number at which to stop (exclusive).
        prob_thresh: Probability threshold forwarded to the digitizer and to
            ``read_page``.
        book: The book to digitize.
        book_path: Path of the book file; results are written to an
            ``output`` directory next to it.
        gallery: Gallery contents. Only its length is used: it bounds the
            number of pages processed.
        progress: Gradio progress tracker.

    Yields:
        One ``(pages, gr.skip())`` tuple, where ``pages`` holds the page
        object of every page for which ``read_page`` returned a result.

    Raises:
        ValueError: If ``book`` is ``None``.
    """
    if book is None:
        raise ValueError("You need to select / upload the pages to digitize")

    progress(0, desc="Dawsonia: starting")

    model_path = Path("data/models/dawsonia/2024-07-02")
    output_path = Path("output")
    output_path.mkdir(exist_ok=True)

    print("Dawsonia: digitizing", book)
    table_fmt = book.table_format

    final_output_path_book = output_path / book.station_name
    output_path_book = Path(book_path).parent / "output"
    output_path_book.mkdir(exist_ok=True, parents=True)
    # The directory name is misspelled, but read_page reads from the same
    # spelling, so it must stay as is.
    (output_path_book / "probablities").mkdir(exist_ok=True)

    init_data: list[dict[str, NDArray]] = [
        {
            key: np.empty(len(table_fmt.rows), dtype="O")
            for key in table_fmt.columns[table_idx]
        }
        for table_idx in table_fmt.preproc.idx_tables_size_verify
    ]

    collection = []
    # Loop-invariant denominator for the progress fraction; never zero.
    page_span = max(1, last_page - first_page)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", FutureWarning)
        # The gallery items themselves are not needed; zipping only stops the
        # loop once the gallery runs out of entries.
        for page_number, _im_from_gallery in zip(range(first_page, last_page), gallery):
            output_path_page = output_path_book / str(page_number)
            gr.Info(f"Digitizing {page_number = }")

            # Skip digitization when results for this page already exist.
            if not output_path_page.with_suffix(".parquet").exists():
                digitize.digitize_page_and_write_output(
                    book,
                    init_data,
                    page_number=page_number,
                    date_str=f"0000-page-{page_number}",
                    model_path=model_path,
                    model_predict=ml.model_predict,
                    prob_thresh=prob_thresh,
                    output_path_page=output_path_page,
                    output_text_fmt=False,
                    debug=False,
                )
            progress_value = (page_number - first_page) / page_span

            results = read_page(
                output_path_book,
                str(page_number),
                prob_thresh,
                progress,
                progress_value,
                table_fmt.preproc.idx_tables_size_verify,
            )
            if results:
                # The image returned alongside the page is not yielded, so it
                # is not collected.
                page, _im = results
                collection.append(page)
            else:
                gr.Info(f"No tables detected in {page_number = }")

    if final_output_path_book.exists():
        shutil.rmtree(final_output_path_book)

    shutil.copytree(output_path_book, final_output_path_book)
    gr.Info("Pages were successfully digitized ✨")

    yield collection, gr.skip()
|
127 |
+
|
128 |
+
|
129 |
+
def read_page(
    output_path_book: Path,
    prefix: str,
    prob_thresh: float,
    progress,
    progress_value,
    idx_tables_size_verify: list[int],
    im_path_from_gallery: str = "",
):
    """Load the stored digitization results of one page.

    Reads ``statistics/<prefix>.json`` from ``output_path_book`` and reports
    it through ``progress``. If at least one table was detected, it also reads
    ``<prefix>.parquet``, ``probablities/<prefix>.parquet``,
    ``table_meta/<prefix>.json`` and ``pages/<prefix>.webp``.

    Args:
        output_path_book: Directory that holds the digitization output.
        prefix: File stem of the page (the page number as a string).
        prob_thresh: Only cells whose probability is strictly greater than
            this value are kept.
        progress: Callable used to report progress.
        progress_value: Progress fraction passed to ``progress``.
        idx_tables_size_verify: Currently unused (see the FIXME below).
        im_path_from_gallery: Image path to store in the page; when empty,
            the path of the ``.webp`` page image is used instead.

    Returns:
        A ``(Page, image)`` tuple, or ``None`` when no tables were detected.
    """
    stats_file = (output_path_book / "statistics" / prefix).with_suffix(".json")
    stats = digitize.Statistics.from_json(stats_file)
    print(stats)
    progress(progress_value, desc=f"Dawsonia: {stats!s:.50}")

    if not stats.tables_detected > 0:
        return None

    values_df = pd.read_parquet((output_path_book / prefix).with_suffix(".parquet"))
    prob_file = (output_path_book / "probablities" / prefix).with_suffix(".parquet")
    prob_df = pd.read_parquet(prob_file)
    meta_file = (output_path_book / "table_meta" / prefix).with_suffix(".json")
    table_meta = json.loads(meta_file.read_text())

    image_path = (output_path_book / "pages" / prefix).with_suffix(".webp")
    with Image.open(image_path) as im:
        width, height = im.width, im.height

    values_array = values_df.values.flatten()
    prob_array = prob_df.values.flatten()
    # FIXME: hardcoded. Use idx_tables_size_verify and reconstruct bbox_array
    bbox_array = np.hstack(table_meta["table_positions"][:2]).reshape(-1, 4)

    cells = []
    for value, prob, bbox in zip(values_array, prob_array, bbox_array):
        if prob > prob_thresh:
            cells.append(make_cell(value, bbox))

    # NOTE(review): `im` is returned after its `with` block has exited, so
    # its underlying file is presumably already closed -- confirm callers
    # do not read pixel data from it.
    return Page(width, height, cells, im_path_from_gallery or str(image_path)), im
|
169 |
+
|
170 |
+
|
171 |
+
def make_cell(value: str, bbox: NDArray[np.int64]):
    """Build the ``TableCell`` for one digitized value.

    ``bbox`` is unpacked as ``(y, x, h, w)``. The outline is a closed
    rectangle extending ``w // 2`` to either side of ``x`` and ``h // 2`` to
    either side of ``y``. The text anchor sits at ``(x - w // 4, y)``.
    """
    center_y, center_x, box_h, box_w = bbox
    half_w, half_h = box_w // 2, box_h // 2

    left, top = center_x - half_w, center_y - half_h
    right, bottom = center_x + half_w, center_y + half_h

    # Five points: the first corner is repeated to close the outline.
    outline = (
        (left, top),
        (right, top),
        (right, bottom),
        (left, bottom),
        (left, top),
    )
    return TableCell(outline, text_x=center_x - box_w // 4, text_y=center_y, text=value)
|
177 |
+
|
178 |
+
|
179 |
+
def all_example_images() -> list[str]:
    """Return the path of the example image of every pipeline.

    Each path is ``<EXAMPLES_DIRECTORY>/<pipeline name>.png``, listed in the
    iteration order of ``PIPELINES``.
    """
    image_paths = []
    for pipeline in PIPELINES:
        image_paths.append(os.path.join(EXAMPLES_DIRECTORY, f"{pipeline}.png"))
    return image_paths
|
187 |
+
|
188 |
+
|
189 |
+
def get_selected_example_image(
    first_page, last_page, event: gr.SelectData
) -> tuple[list[Image.Image], io.Book, str, str, str] | None:
    """Load the example book that corresponds to the selected example image.

    The pipeline name is the original file name of the selected image without
    its extension.

    Args:
        first_page: First page number to read (inclusive).
        last_page: Page number at which to stop (exclusive).
        event: Gradio selection event; its value must carry
            ``["image"]["orig_name"]``.

    Returns:
        ``None`` when the name is not a key of ``PIPELINES``. Otherwise a
        tuple of the page images, the book, the path of the downloaded book,
        the table-format file name, and the table-format file contents.

    Raises:
        ValueError: If more than ``MAX_IMAGES`` pages are requested.
    """
    orig_name = event.value["image"]["orig_name"]
    # os.path.splitext copes with names that have no dot or several dots;
    # unpacking ``orig_name.split(".")`` raised ValueError for those.
    name, _ext = os.path.splitext(orig_name)

    station_tf = Path("table_formats", name).with_suffix(".toml")

    if (last_page - first_page) > MAX_IMAGES:
        raise ValueError(f"Maximum images you can digitize is set to: {MAX_IMAGES}")

    if name not in PIPELINES:
        return None

    # Downloads the book on first use; later calls reuse the copy in DATA_CACHE.
    book_path = pooch.retrieve(**PIPELINES[name], path=DATA_CACHE)
    _first, _last, book = io.read_book(book_path)
    book._name = name
    book.size_cell = [1.0, 1.0, 1.0, 1.0]
    return (
        [book.read_image(pg) for pg in range(first_page, last_page)],
        book,
        book_path,
        station_tf.name,
        station_tf.read_text(),
    )
|
216 |
+
|
217 |
+
def move_uploaded_file(uploaded, table_fmt_filename):
    """Copy an uploaded file into a subdirectory named after its table format.

    The subdirectory is created next to ``uploaded`` and is named after
    ``table_fmt_filename`` with any ``.toml`` suffix removed. Despite the
    function name, the file is copied with ``shutil.copy2`` and the original
    stays in place.

    Args:
        uploaded: Path of the uploaded file.
        table_fmt_filename: File name of the table format.

    Returns:
        The path of the copy, as a string.
    """
    source = Path(uploaded)
    destination_dir = source.parent / table_fmt_filename.removesuffix(".toml")
    destination_dir.mkdir(parents=True, exist_ok=True)

    destination = destination_dir / source.name
    shutil.copy2(uploaded, destination)
    print("Copy created", destination)
    return str(destination)
|
232 |
+
|
233 |
+
def get_uploaded_image(
    first_page: int, last_page: int, table_fmt_filename: str, filename: str
) -> tuple[list[NDArray], io.Book, str, str] | None:
    """Read the requested pages of an uploaded book.

    Args:
        first_page: First page number to read (inclusive).
        last_page: Page number at which to stop (exclusive).
        table_fmt_filename: File name looked up inside ``table_formats``;
            ``bjuröklubb.toml`` is used when that file does not exist.
        filename: Path of the uploaded book.

    Returns:
        A tuple of the pages read, the book, ``filename``, and the contents
        of the table-format file.
    """
    # os.path.splitext strips only the last extension and never raises;
    # unpacking ``filename.split(".")`` raised ValueError for every path
    # without exactly one dot (for example ``book.zarr.zip``).
    # The name keeps any directory part of ``filename``, as before.
    name, _ext = os.path.splitext(filename)

    station_tf = Path("table_formats", table_fmt_filename)
    if not station_tf.exists():
        station_tf = Path("table_formats", "bjuröklubb.toml")

    # NOTE(review): unlike get_selected_example_image, the page range is not
    # checked against MAX_IMAGES here, and pages are read with `read_page`
    # rather than `read_image` -- confirm both are intended.
    _first, _last, book = io.read_book(Path(filename))
    book._name = name
    book.size_cell = [1.0, 1.0, 1.0, 1.0]
    return (
        [book.read_page(pg) for pg in range(first_page, last_page)],
        book,
        filename,
        station_tf.read_text(),
    )
|
251 |
+
|
252 |
+
def overwrite_table_format_file(book: io.Book, book_path, table_fmt: str):
    """Replace the book's table-format file and reload the format.

    Writes ``table_fmt`` to ``table_formats/<book.station_name>.toml``,
    re-reads the table format for ``book_path``, stores it on ``book``, and
    shows a Gradio info message.

    Args:
        book: Book whose table format is replaced.
        book_path: Path of the book file.
        table_fmt: New contents of the table-format file.

    Returns:
        The same ``book`` object, with ``table_format`` updated.
    """
    station = book.station_name
    formats_dir = Path("table_formats")

    target_file = (formats_dir / station).with_suffix(".toml")
    target_file.write_text(table_fmt)

    reloaded_format = io.read_specific_table_format(formats_dir, Path(book_path))
    book.table_format = reloaded_format
    gr.Info(f"Overwritten table format file for {station}")
    return book
|
259 |
+
|