Spaces:
Running
Running
Ashwin V. Mohanan
committed on
Commit
·
d5ea0f1
1
Parent(s):
108c965
Sync on digitize and read from final output directory
Browse files- app/tabs/submit_functions.py +65 -27
- app/tabs/visualizer.py +1 -57
- app/tabs/visualizer_functions.py +62 -0
app/tabs/submit_functions.py
CHANGED
@@ -14,7 +14,7 @@ from numpy.typing import NDArray
|
|
14 |
import pandas as pd
|
15 |
import pooch
|
16 |
|
17 |
-
from .
|
18 |
|
19 |
# Max number of images a user can upload at once
|
20 |
MAX_IMAGES = int(os.environ.get("MAX_IMAGES", 5))
|
@@ -79,7 +79,7 @@ def run_dawsonia(
|
|
79 |
|
80 |
with warnings.catch_warnings():
|
81 |
warnings.simplefilter("ignore", FutureWarning)
|
82 |
-
for page_number
|
83 |
output_path_page = output_path_book / str(page_number)
|
84 |
gr.Info(f"Digitizing {page_number = }")
|
85 |
|
@@ -100,32 +100,63 @@ def run_dawsonia(
|
|
100 |
output_text_fmt=False,
|
101 |
debug=False,
|
102 |
)
|
|
|
|
|
103 |
progress_value = (page_number - first_page) / max(1, last_page - first_page)
|
104 |
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
)
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
|
|
|
|
123 |
gr.Info("Pages were succesfully digitized ✨")
|
124 |
|
125 |
# yield collection, images
|
126 |
yield collection, gr.skip()
|
127 |
|
128 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
129 |
def read_page(
|
130 |
output_path_book: Path,
|
131 |
prefix: str,
|
@@ -156,9 +187,12 @@ def read_page(
|
|
156 |
|
157 |
values_array = values_df.values.flatten()
|
158 |
prob_array = prob_df.values.flatten()
|
159 |
-
# FIXME: hardcoded. Use idx_tables_size_verify and reconstruct bbox_array
|
160 |
-
|
161 |
-
|
|
|
|
|
|
|
162 |
cells = [
|
163 |
make_cell(value, bbox)
|
164 |
for value, prob, bbox in zip(values_array, prob_array, bbox_array)
|
@@ -199,7 +233,9 @@ def get_selected_example_image(
|
|
199 |
station_tf = Path("table_formats", name).with_suffix(".toml")
|
200 |
|
201 |
if (last_page - first_page) > MAX_IMAGES:
|
202 |
-
|
|
|
|
|
203 |
|
204 |
if name in PIPELINES:
|
205 |
book_path = pooch.retrieve(**PIPELINES[name], path=DATA_CACHE)
|
@@ -214,6 +250,7 @@ def get_selected_example_image(
|
|
214 |
station_tf.read_text(),
|
215 |
)
|
216 |
|
|
|
217 |
def move_uploaded_file(uploaded, table_fmt_filename):
|
218 |
current_directory = Path(uploaded).parent
|
219 |
|
@@ -230,8 +267,9 @@ def move_uploaded_file(uploaded, table_fmt_filename):
|
|
230 |
print(f"Copy created", true_path)
|
231 |
return str(true_path)
|
232 |
|
|
|
233 |
def get_uploaded_image(
|
234 |
-
first_page:int, last_page:int, table_fmt_filename:str, filename: str
|
235 |
) -> tuple[list[NDArray], io.Book, str, str] | None:
|
236 |
|
237 |
name, _ext = filename.split(".")
|
@@ -248,7 +286,8 @@ def get_uploaded_image(
|
|
248 |
filename,
|
249 |
station_tf.read_text(),
|
250 |
)
|
251 |
-
|
|
|
252 |
def overwrite_table_format_file(book: io.Book, book_path, table_fmt: str):
|
253 |
name = book.station_name
|
254 |
table_fmt_dir = Path("table_formats")
|
@@ -256,4 +295,3 @@ def overwrite_table_format_file(book: io.Book, book_path, table_fmt: str):
|
|
256 |
book.table_format = io.read_specific_table_format(table_fmt_dir, Path(book_path))
|
257 |
gr.Info(f"Overwritten table format file for {name}")
|
258 |
return book
|
259 |
-
|
|
|
14 |
import pandas as pd
|
15 |
import pooch
|
16 |
|
17 |
+
from .visualizer_functions import Page, TableCell
|
18 |
|
19 |
# Max number of images a user can upload at once
|
20 |
MAX_IMAGES = int(os.environ.get("MAX_IMAGES", 5))
|
|
|
79 |
|
80 |
with warnings.catch_warnings():
|
81 |
warnings.simplefilter("ignore", FutureWarning)
|
82 |
+
for page_number in range(first_page, last_page):
|
83 |
output_path_page = output_path_book / str(page_number)
|
84 |
gr.Info(f"Digitizing {page_number = }")
|
85 |
|
|
|
100 |
output_text_fmt=False,
|
101 |
debug=False,
|
102 |
)
|
103 |
+
_synctree(output_path_book, final_output_path_book)
|
104 |
+
|
105 |
progress_value = (page_number - first_page) / max(1, last_page - first_page)
|
106 |
|
107 |
+
# if final_output_path_book.exists():
|
108 |
+
# shutil.rmtree(final_output_path_book)
|
109 |
+
|
110 |
+
# shutil.copytree(output_path_book, final_output_path_book)
|
111 |
+
for page_number, im_from_gallery in zip(range(first_page, last_page), gallery):
|
112 |
+
if results := read_page(
|
113 |
+
final_output_path_book,
|
114 |
+
str(page_number),
|
115 |
+
prob_thresh,
|
116 |
+
progress,
|
117 |
+
1.0,
|
118 |
+
table_fmt.preproc.idx_tables_size_verify,
|
119 |
+
): # , im_from_gallery[0])
|
120 |
+
page, im = results
|
121 |
+
collection.append(page)
|
122 |
+
images.append(im)
|
123 |
+
yield collection, gr.skip()
|
124 |
+
else:
|
125 |
+
gr.Info(f"No tables detected in {page_number = }")
|
126 |
+
|
127 |
gr.Info("Pages were succesfully digitized ✨")
|
128 |
|
129 |
# yield collection, images
|
130 |
yield collection, gr.skip()
|
131 |
|
132 |
|
133 |
+
def _synctree(source_dir, dest_dir):
    """Incrementally mirror ``source_dir`` into ``dest_dir``.

    Walks ``source_dir`` recursively, recreates every sub-directory
    (including empty ones) under ``dest_dir`` and copies a file with
    ``shutil.copy2`` only when the destination file is missing or the
    source file has a strictly newer modification time. Files that exist
    only in ``dest_dir`` are never removed.

    Parameters
    ----------
    source_dir : str or Path
        Directory tree to copy from. If it does not exist nothing is
        copied, but ``dest_dir`` is still created.
    dest_dir : str or Path
        Directory tree to copy into; created (with parents) if missing.
    """
    source_dir = Path(source_dir)
    dest_dir = Path(dest_dir)
    # exist_ok avoids the check-then-create race of `exists()` + `mkdir()`.
    dest_dir.mkdir(parents=True, exist_ok=True)

    for root, _, files in os.walk(source_dir):
        root = Path(root)

        # Recreate the matching sub-directory in the destination tree.
        dest_subdir_path = dest_dir / root.relative_to(source_dir)
        dest_subdir_path.mkdir(parents=True, exist_ok=True)

        for file_ in files:
            source_file_path = root / file_
            dest_file_path = dest_subdir_path / file_

            # A single stat() call both detects a missing destination and
            # yields its mtime, instead of exists() followed by stat().
            try:
                dest_mtime = dest_file_path.stat().st_mtime
            except FileNotFoundError:
                dest_mtime = None

            # Copy only if the file does not already exist or is newer.
            if dest_mtime is None or source_file_path.stat().st_mtime > dest_mtime:
                shutil.copy2(source_file_path, dest_file_path)
|
158 |
+
|
159 |
+
|
160 |
def read_page(
|
161 |
output_path_book: Path,
|
162 |
prefix: str,
|
|
|
187 |
|
188 |
values_array = values_df.values.flatten()
|
189 |
prob_array = prob_df.values.flatten()
|
190 |
+
# FIXME: hardcoded to get up to 2 tables. Use idx_tables_size_verify and reconstruct bbox_array
|
191 |
+
try:
|
192 |
+
bbox_array = np.hstack(table_meta["table_positions"][:2]).reshape(-1, 4)
|
193 |
+
except ValueError:
|
194 |
+
bbox_array = np.reshape(table_meta["table_positions"][0], (-1, 4))
|
195 |
+
|
196 |
cells = [
|
197 |
make_cell(value, bbox)
|
198 |
for value, prob, bbox in zip(values_array, prob_array, bbox_array)
|
|
|
233 |
station_tf = Path("table_formats", name).with_suffix(".toml")
|
234 |
|
235 |
if (last_page - first_page) > MAX_IMAGES:
|
236 |
+
error = f"Maximum images you can digitize is set to: {MAX_IMAGES}"
|
237 |
+
gr.Warning(error)
|
238 |
+
raise ValueError(error)
|
239 |
|
240 |
if name in PIPELINES:
|
241 |
book_path = pooch.retrieve(**PIPELINES[name], path=DATA_CACHE)
|
|
|
250 |
station_tf.read_text(),
|
251 |
)
|
252 |
|
253 |
+
|
254 |
def move_uploaded_file(uploaded, table_fmt_filename):
|
255 |
current_directory = Path(uploaded).parent
|
256 |
|
|
|
267 |
print(f"Copy created", true_path)
|
268 |
return str(true_path)
|
269 |
|
270 |
+
|
271 |
def get_uploaded_image(
|
272 |
+
first_page: int, last_page: int, table_fmt_filename: str, filename: str
|
273 |
) -> tuple[list[NDArray], io.Book, str, str] | None:
|
274 |
|
275 |
name, _ext = filename.split(".")
|
|
|
286 |
filename,
|
287 |
station_tf.read_text(),
|
288 |
)
|
289 |
+
|
290 |
+
|
291 |
def overwrite_table_format_file(book: io.Book, book_path, table_fmt: str):
|
292 |
name = book.station_name
|
293 |
table_fmt_dir = Path("table_formats")
|
|
|
295 |
book.table_format = io.read_specific_table_format(table_fmt_dir, Path(book_path))
|
296 |
gr.Info(f"Overwritten table format file for {name}")
|
297 |
return book
|
|
app/tabs/visualizer.py
CHANGED
@@ -1,66 +1,10 @@
|
|
1 |
-
import os
|
2 |
import gradio as gr
|
3 |
from jinja2 import Environment, FileSystemLoader
|
4 |
-
from typing_extensions import TypeAlias
|
5 |
|
6 |
_ENV = Environment(loader=FileSystemLoader("app/assets/jinja-templates"))
|
7 |
_IMAGE_TEMPLATE = _ENV.get_template("image.j2")
|
8 |
|
9 |
-
from
|
10 |
-
from dawsonia.typing import BBoxTuple
|
11 |
-
|
12 |
-
|
13 |
-
class TableCell(NamedTuple):
|
14 |
-
polygon: tuple[tuple[int, int], ...]
|
15 |
-
text_x: int
|
16 |
-
text_y: int
|
17 |
-
text: str
|
18 |
-
|
19 |
-
|
20 |
-
class Page(NamedTuple):
|
21 |
-
width: int
|
22 |
-
height: int
|
23 |
-
cells: list[TableCell]
|
24 |
-
path: str
|
25 |
-
|
26 |
-
|
27 |
-
Collection: TypeAlias = list[Page]
|
28 |
-
|
29 |
-
|
30 |
-
def render_image(collection: Collection, current_page_index: int) -> str:
|
31 |
-
return _IMAGE_TEMPLATE.render(
|
32 |
-
page=collection[current_page_index],
|
33 |
-
)
|
34 |
-
|
35 |
-
|
36 |
-
def toggle_navigation_button(collection: Collection):
|
37 |
-
visible = len(collection) > 1
|
38 |
-
return gr.update(visible=visible)
|
39 |
-
|
40 |
-
|
41 |
-
def activate_left_button(current_page_index):
|
42 |
-
interactive = current_page_index > 0
|
43 |
-
return gr.update(interactive=interactive)
|
44 |
-
|
45 |
-
|
46 |
-
def activate_right_button(collection: Collection, current_page_index):
|
47 |
-
interactive = current_page_index + 1 < len(collection)
|
48 |
-
return gr.update(interactive=interactive)
|
49 |
-
|
50 |
-
|
51 |
-
def right_button_click(collection: Collection, current_page_index):
|
52 |
-
max_index = len(collection) - 1
|
53 |
-
return min(max_index, current_page_index + 1)
|
54 |
-
|
55 |
-
|
56 |
-
def left_button_click(current_page_index):
|
57 |
-
return max(0, current_page_index - 1)
|
58 |
-
|
59 |
-
|
60 |
-
def update_image_caption(collection: Collection, current_page_index):
|
61 |
-
n_pages = len(collection)
|
62 |
-
label = os.path.split(collection[current_page_index].path)[-1]
|
63 |
-
return f"image {current_page_index + 1} of {n_pages}: `{label}`"
|
64 |
|
65 |
|
66 |
with gr.Blocks() as visualizer:
|
|
|
|
|
1 |
import gradio as gr
|
2 |
from jinja2 import Environment, FileSystemLoader
|
|
|
3 |
|
4 |
_ENV = Environment(loader=FileSystemLoader("app/assets/jinja-templates"))
|
5 |
_IMAGE_TEMPLATE = _ENV.get_template("image.j2")
|
6 |
|
7 |
+
from .visualizer_functions import render_image, toggle_navigation_button, activate_left_button, activate_right_button, right_button_click, left_button_click, update_image_caption
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
|
10 |
with gr.Blocks() as visualizer:
|
app/tabs/visualizer_functions.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import gradio as gr
|
3 |
+
from jinja2 import Environment, FileSystemLoader
|
4 |
+
from typing_extensions import TypeAlias
|
5 |
+
|
6 |
+
_ENV = Environment(loader=FileSystemLoader("app/assets/jinja-templates"))
|
7 |
+
_IMAGE_TEMPLATE = _ENV.get_template("image.j2")
|
8 |
+
|
9 |
+
from typing import NamedTuple, TypeAlias
|
10 |
+
|
11 |
+
|
12 |
+
class TableCell(NamedTuple):
    """Immutable record describing a single table cell.

    Fields, in positional order: ``polygon``, ``text_x``, ``text_y``, ``text``.
    """

    # Sequence of (int, int) pairs; presumably the cell outline vertices -- TODO confirm
    polygon: tuple[tuple[int, int], ...]
    # Integer coordinates associated with the text; presumably where it is drawn -- TODO confirm
    text_x: int
    text_y: int
    # String content of the cell
    text: str
|
17 |
+
|
18 |
+
|
19 |
+
class Page(NamedTuple):
    """Immutable record describing one page and the table cells on it.

    Fields, in positional order: ``width``, ``height``, ``cells``, ``path``.
    """

    # Integer page dimensions; presumably pixels of the page image -- TODO confirm
    width: int
    height: int
    # TableCell records belonging to this page
    cells: list[TableCell]
    # Path string; its last component is what update_image_caption displays
    path: str
|
24 |
+
|
25 |
+
|
26 |
+
Collection: TypeAlias = list[Page]
|
27 |
+
|
28 |
+
|
29 |
+
def render_image(collection: Collection, current_page_index: int) -> str:
    """Render the image template for one page of ``collection``.

    The entry ``collection[current_page_index]`` is handed to the
    module-level ``_IMAGE_TEMPLATE`` as the template variable ``page`` and
    the rendered result is returned.
    """
    selected_page = collection[current_page_index]
    return _IMAGE_TEMPLATE.render(page=selected_page)
|
33 |
+
|
34 |
+
|
35 |
+
def toggle_navigation_button(collection: Collection):
    """Return a ``gr.update`` that is visible only when ``collection`` has more than one entry."""
    return gr.update(visible=len(collection) > 1)
|
38 |
+
|
39 |
+
|
40 |
+
def activate_left_button(current_page_index):
    """Return a ``gr.update`` that is interactive only when the index is greater than zero."""
    return gr.update(interactive=current_page_index > 0)
|
43 |
+
|
44 |
+
|
45 |
+
def activate_right_button(collection: Collection, current_page_index):
    """Return a ``gr.update`` that is interactive only while a next entry exists.

    A next entry exists when ``current_page_index + 1`` is still a valid
    position in ``collection``.
    """
    next_index = current_page_index + 1
    has_next = next_index < len(collection)
    return gr.update(interactive=has_next)
|
48 |
+
|
49 |
+
|
50 |
+
def right_button_click(collection: Collection, current_page_index):
    """Return the index of the next page, clamped to the valid range.

    The index advances by one but never goes past the last entry of
    ``collection``. For an empty collection the result is ``0`` rather
    than ``-1``, so the returned index is never negative (matching the
    lower bound used by ``left_button_click``).
    """
    last_index = len(collection) - 1
    # The outer max() guards the empty-collection case, where last_index is -1.
    return max(0, min(last_index, current_page_index + 1))
|
53 |
+
|
54 |
+
|
55 |
+
def left_button_click(current_page_index):
    """Return the index of the previous page, never going below zero."""
    previous_index = current_page_index - 1
    if previous_index > 0:
        return previous_index
    return 0
|
57 |
+
|
58 |
+
|
59 |
+
def update_image_caption(collection: Collection, current_page_index):
    """Build the caption text for the currently shown page.

    The caption reports the 1-based position of the page, the total number
    of entries in ``collection`` and the last path component of the page's
    ``path`` attribute.
    """
    total = len(collection)
    current_path = collection[current_page_index].path
    label = os.path.basename(current_path)
    position = current_page_index + 1
    return f"image {position} of {total}: `{label}`"
|