Ashwin V. Mohanan committed on
Commit
d5ea0f1
·
1 Parent(s): 108c965

Sync on digitize and read from final output directory

Browse files
app/tabs/submit_functions.py CHANGED
@@ -14,7 +14,7 @@ from numpy.typing import NDArray
14
  import pandas as pd
15
  import pooch
16
 
17
- from .visualizer import Page, TableCell
18
 
19
  # Max number of images a user can upload at once
20
  MAX_IMAGES = int(os.environ.get("MAX_IMAGES", 5))
@@ -79,7 +79,7 @@ def run_dawsonia(
79
 
80
  with warnings.catch_warnings():
81
  warnings.simplefilter("ignore", FutureWarning)
82
- for page_number, im_from_gallery in zip(range(first_page, last_page), gallery):
83
  output_path_page = output_path_book / str(page_number)
84
  gr.Info(f"Digitizing {page_number = }")
85
 
@@ -100,32 +100,63 @@ def run_dawsonia(
100
  output_text_fmt=False,
101
  debug=False,
102
  )
 
 
103
  progress_value = (page_number - first_page) / max(1, last_page - first_page)
104
 
105
- if results := read_page(
106
- output_path_book,
107
- str(page_number),
108
- prob_thresh,
109
- progress,
110
- progress_value,
111
- table_fmt.preproc.idx_tables_size_verify,
112
- ): # , im_from_gallery[0])
113
- page, im = results
114
- collection.append(page)
115
- images.append(im)
116
- else:
117
- gr.Info(f"No tables detected in {page_number = }")
118
-
119
- if final_output_path_book.exists():
120
- shutil.rmtree(final_output_path_book)
121
-
122
- shutil.copytree(output_path_book, final_output_path_book)
 
 
123
  gr.Info("Pages were succesfully digitized ✨")
124
 
125
  # yield collection, images
126
  yield collection, gr.skip()
127
 
128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  def read_page(
130
  output_path_book: Path,
131
  prefix: str,
@@ -156,9 +187,12 @@ def read_page(
156
 
157
  values_array = values_df.values.flatten()
158
  prob_array = prob_df.values.flatten()
159
- # FIXME: hardcoded. Use idx_tables_size_verify and reconstruct bbox_array
160
- bbox_array = np.hstack(table_meta["table_positions"][:2]).reshape(-1, 4)\
161
-
 
 
 
162
  cells = [
163
  make_cell(value, bbox)
164
  for value, prob, bbox in zip(values_array, prob_array, bbox_array)
@@ -199,7 +233,9 @@ def get_selected_example_image(
199
  station_tf = Path("table_formats", name).with_suffix(".toml")
200
 
201
  if (last_page - first_page) > MAX_IMAGES:
202
- raise ValueError(f"Maximum images you can digitize is set to: {MAX_IMAGES}")
 
 
203
 
204
  if name in PIPELINES:
205
  book_path = pooch.retrieve(**PIPELINES[name], path=DATA_CACHE)
@@ -214,6 +250,7 @@ def get_selected_example_image(
214
  station_tf.read_text(),
215
  )
216
 
 
217
  def move_uploaded_file(uploaded, table_fmt_filename):
218
  current_directory = Path(uploaded).parent
219
 
@@ -230,8 +267,9 @@ def move_uploaded_file(uploaded, table_fmt_filename):
230
  print(f"Copy created", true_path)
231
  return str(true_path)
232
 
 
233
  def get_uploaded_image(
234
- first_page:int, last_page:int, table_fmt_filename:str, filename: str
235
  ) -> tuple[list[NDArray], io.Book, str, str] | None:
236
 
237
  name, _ext = filename.split(".")
@@ -248,7 +286,8 @@ def get_uploaded_image(
248
  filename,
249
  station_tf.read_text(),
250
  )
251
-
 
252
  def overwrite_table_format_file(book: io.Book, book_path, table_fmt: str):
253
  name = book.station_name
254
  table_fmt_dir = Path("table_formats")
@@ -256,4 +295,3 @@ def overwrite_table_format_file(book: io.Book, book_path, table_fmt: str):
256
  book.table_format = io.read_specific_table_format(table_fmt_dir, Path(book_path))
257
  gr.Info(f"Overwritten table format file for {name}")
258
  return book
259
-
 
14
  import pandas as pd
15
  import pooch
16
 
17
+ from .visualizer_functions import Page, TableCell
18
 
19
  # Max number of images a user can upload at once
20
  MAX_IMAGES = int(os.environ.get("MAX_IMAGES", 5))
 
79
 
80
  with warnings.catch_warnings():
81
  warnings.simplefilter("ignore", FutureWarning)
82
+ for page_number in range(first_page, last_page):
83
  output_path_page = output_path_book / str(page_number)
84
  gr.Info(f"Digitizing {page_number = }")
85
 
 
100
  output_text_fmt=False,
101
  debug=False,
102
  )
103
+ _synctree(output_path_book, final_output_path_book)
104
+
105
  progress_value = (page_number - first_page) / max(1, last_page - first_page)
106
 
107
+ # if final_output_path_book.exists():
108
+ # shutil.rmtree(final_output_path_book)
109
+
110
+ # shutil.copytree(output_path_book, final_output_path_book)
111
+ for page_number, im_from_gallery in zip(range(first_page, last_page), gallery):
112
+ if results := read_page(
113
+ final_output_path_book,
114
+ str(page_number),
115
+ prob_thresh,
116
+ progress,
117
+ 1.0,
118
+ table_fmt.preproc.idx_tables_size_verify,
119
+ ): # , im_from_gallery[0])
120
+ page, im = results
121
+ collection.append(page)
122
+ images.append(im)
123
+ yield collection, gr.skip()
124
+ else:
125
+ gr.Info(f"No tables detected in {page_number = }")
126
+
127
  gr.Info("Pages were succesfully digitized ✨")
128
 
129
  # yield collection, images
130
  yield collection, gr.skip()
131
 
132
 
133
def _synctree(source_dir, dest_dir):
    """Incrementally copy the tree under ``source_dir`` into ``dest_dir``.

    Every directory found while walking ``source_dir`` is recreated under
    ``dest_dir``. A file is copied with ``shutil.copy2`` only when it is
    missing from the destination or when the source file has a strictly
    newer modification time. Files that exist only in the destination are
    left untouched.

    Parameters
    ----------
    source_dir : str or Path
        Root of the tree to copy from.
    dest_dir : str or Path
        Root of the tree to copy into; created (with parents) if missing.
    """
    source_dir = Path(source_dir)
    dest_dir = Path(dest_dir)
    # exist_ok=True instead of an exists() pre-check: no window in which
    # another process can create the directory between the check and mkdir.
    dest_dir.mkdir(parents=True, exist_ok=True)

    for root, _, files in os.walk(source_dir):
        root = Path(root)

        # Mirror the current sub-directory in the destination tree.
        dest_subdir_path = dest_dir / root.relative_to(source_dir)
        dest_subdir_path.mkdir(parents=True, exist_ok=True)

        for file_ in files:
            source_file_path = root / file_
            dest_file_path = dest_subdir_path / file_

            # A single stat() call tells us both whether the destination
            # exists and how old it is.
            try:
                dest_mtime = dest_file_path.stat().st_mtime
            except FileNotFoundError:
                dest_mtime = None

            # Copy only if the file does not already exist or is newer.
            if dest_mtime is None or source_file_path.stat().st_mtime > dest_mtime:
                shutil.copy2(source_file_path, dest_file_path)
158
+
159
+
160
  def read_page(
161
  output_path_book: Path,
162
  prefix: str,
 
187
 
188
  values_array = values_df.values.flatten()
189
  prob_array = prob_df.values.flatten()
190
+ # FIXME: hardcoded to get upto 2 tables. Use idx_tables_size_verify and reconstruct bbox_array
191
+ try:
192
+ bbox_array = np.hstack(table_meta["table_positions"][:2]).reshape(-1, 4)
193
+ except ValueError:
194
+ bbox_array = np.reshape(table_meta["table_positions"][0], (-1, 4))
195
+
196
  cells = [
197
  make_cell(value, bbox)
198
  for value, prob, bbox in zip(values_array, prob_array, bbox_array)
 
233
  station_tf = Path("table_formats", name).with_suffix(".toml")
234
 
235
  if (last_page - first_page) > MAX_IMAGES:
236
+ error = f"Maximum images you can digitize is set to: {MAX_IMAGES}"
237
+ gr.Warning(error)
238
+ raise ValueError(error)
239
 
240
  if name in PIPELINES:
241
  book_path = pooch.retrieve(**PIPELINES[name], path=DATA_CACHE)
 
250
  station_tf.read_text(),
251
  )
252
 
253
+
254
  def move_uploaded_file(uploaded, table_fmt_filename):
255
  current_directory = Path(uploaded).parent
256
 
 
267
  print(f"Copy created", true_path)
268
  return str(true_path)
269
 
270
+
271
  def get_uploaded_image(
272
+ first_page: int, last_page: int, table_fmt_filename: str, filename: str
273
  ) -> tuple[list[NDArray], io.Book, str, str] | None:
274
 
275
  name, _ext = filename.split(".")
 
286
  filename,
287
  station_tf.read_text(),
288
  )
289
+
290
+
291
  def overwrite_table_format_file(book: io.Book, book_path, table_fmt: str):
292
  name = book.station_name
293
  table_fmt_dir = Path("table_formats")
 
295
  book.table_format = io.read_specific_table_format(table_fmt_dir, Path(book_path))
296
  gr.Info(f"Overwritten table format file for {name}")
297
  return book
 
app/tabs/visualizer.py CHANGED
@@ -1,66 +1,10 @@
1
- import os
2
  import gradio as gr
3
  from jinja2 import Environment, FileSystemLoader
4
- from typing_extensions import TypeAlias
5
 
6
  _ENV = Environment(loader=FileSystemLoader("app/assets/jinja-templates"))
7
  _IMAGE_TEMPLATE = _ENV.get_template("image.j2")
8
 
9
- from typing import NamedTuple, TypeAlias
10
- from dawsonia.typing import BBoxTuple
11
-
12
-
13
- class TableCell(NamedTuple):
14
- polygon: tuple[tuple[int, int], ...]
15
- text_x: int
16
- text_y: int
17
- text: str
18
-
19
-
20
- class Page(NamedTuple):
21
- width: int
22
- height: int
23
- cells: list[TableCell]
24
- path: str
25
-
26
-
27
- Collection: TypeAlias = list[Page]
28
-
29
-
30
- def render_image(collection: Collection, current_page_index: int) -> str:
31
- return _IMAGE_TEMPLATE.render(
32
- page=collection[current_page_index],
33
- )
34
-
35
-
36
- def toggle_navigation_button(collection: Collection):
37
- visible = len(collection) > 1
38
- return gr.update(visible=visible)
39
-
40
-
41
- def activate_left_button(current_page_index):
42
- interactive = current_page_index > 0
43
- return gr.update(interactive=interactive)
44
-
45
-
46
- def activate_right_button(collection: Collection, current_page_index):
47
- interactive = current_page_index + 1 < len(collection)
48
- return gr.update(interactive=interactive)
49
-
50
-
51
- def right_button_click(collection: Collection, current_page_index):
52
- max_index = len(collection) - 1
53
- return min(max_index, current_page_index + 1)
54
-
55
-
56
- def left_button_click(current_page_index):
57
- return max(0, current_page_index - 1)
58
-
59
-
60
- def update_image_caption(collection: Collection, current_page_index):
61
- n_pages = len(collection)
62
- label = os.path.split(collection[current_page_index].path)[-1]
63
- return f"image {current_page_index + 1} of {n_pages}: `{label}`"
64
 
65
 
66
  with gr.Blocks() as visualizer:
 
 
1
  import gradio as gr
2
  from jinja2 import Environment, FileSystemLoader
 
3
 
4
  _ENV = Environment(loader=FileSystemLoader("app/assets/jinja-templates"))
5
  _IMAGE_TEMPLATE = _ENV.get_template("image.j2")
6
 
7
+ from .visualizer_functions import render_image, toggle_navigation_button, activate_left_button, activate_right_button, right_button_click, left_button_click, update_image_caption
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
 
10
  with gr.Blocks() as visualizer:
app/tabs/visualizer_functions.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ from jinja2 import Environment, FileSystemLoader
4
+ from typing_extensions import TypeAlias
5
+
6
+ _ENV = Environment(loader=FileSystemLoader("app/assets/jinja-templates"))
7
+ _IMAGE_TEMPLATE = _ENV.get_template("image.j2")
8
+
9
+ from typing import NamedTuple, TypeAlias
10
+
11
+
12
class TableCell(NamedTuple):
    """One table cell: an outline polygon plus a text label and its anchor."""

    # Outline of the cell as a sequence of (x, y) integer pairs.
    polygon: tuple[tuple[int, int], ...]
    # Anchor position of the label (presumably image pixel coordinates --
    # TODO confirm against the template that consumes it).
    text_x: int
    text_y: int
    # Text shown for the cell.
    text: str
17
+
18
+
19
class Page(NamedTuple):
    """One page: its dimensions, its table cells and the path it came from."""

    # Page dimensions.
    width: int
    height: int
    # Table cells belonging to this page.
    cells: list["TableCell"]
    # Path associated with the page; its last component is used as a label.
    path: str
24
+
25
+
26
+ Collection: TypeAlias = list[Page]
27
+
28
+
29
def render_image(collection: "Collection", current_page_index: int) -> str:
    """Render the ``image.j2`` template for the currently selected page.

    The page at ``current_page_index`` in ``collection`` is passed to the
    template under the name ``page``; the rendered string is returned.
    """
    current_page = collection[current_page_index]
    return _IMAGE_TEMPLATE.render(page=current_page)
33
+
34
+
35
def toggle_navigation_button(collection: "Collection"):
    """Return a Gradio update that shows the button only for 2+ pages."""
    return gr.update(visible=len(collection) > 1)
38
+
39
+
40
def activate_left_button(current_page_index):
    """Return a Gradio update enabling the button unless on the first page."""
    return gr.update(interactive=current_page_index > 0)
43
+
44
+
45
def activate_right_button(collection: "Collection", current_page_index):
    """Return a Gradio update enabling the button while a next page exists."""
    next_index = current_page_index + 1
    return gr.update(interactive=next_index < len(collection))
48
+
49
+
50
def right_button_click(collection: "Collection", current_page_index):
    """Return the index of the next page, clamped to the last valid index.

    For an empty ``collection`` the result is clamped to 0 rather than -1,
    so the returned index is never negative (mirroring the lower bound that
    ``left_button_click`` enforces).
    """
    # max(0, ...) guards the empty-collection case, where len - 1 == -1.
    last_index = max(0, len(collection) - 1)
    return min(last_index, current_page_index + 1)
53
+
54
+
55
def left_button_click(current_page_index):
    """Return the index of the previous page, never going below zero."""
    previous_index = current_page_index - 1
    return previous_index if previous_index > 0 else 0
57
+
58
+
59
def update_image_caption(collection: "Collection", current_page_index):
    """Build the caption for the current page.

    The caption has the form ``image <position> of <total>: `<file name>```,
    where the position is 1-based and the file name is the last component of
    the current page's ``path``.
    """
    current_page = collection[current_page_index]
    file_name = os.path.basename(current_page.path)
    total_pages = len(collection)
    return f"image {current_page_index + 1} of {total_pages}: `{file_name}`"