Spaces: ai-for-obs/dawsonia-demo

Running

App Files Community

Ashwin V. Mohanan committed 11 days ago

Commit

d5ea0f1

1 Parent(s): 108c965

Sync on digitize and read from final output directory

Browse files

Files changed (3) hide show

app/tabs/submit_functions.py +65 -27
app/tabs/visualizer.py +1 -57
app/tabs/visualizer_functions.py +62 -0

app/tabs/submit_functions.py CHANGED Viewed

@@ -14,7 +14,7 @@ from numpy.typing import NDArray
 import pandas as pd
 import pooch
-from .visualizer import Page, TableCell
 # Max number of images a user can upload at once
 MAX_IMAGES = int(os.environ.get("MAX_IMAGES", 5))
@@ -79,7 +79,7 @@ def run_dawsonia(
     with warnings.catch_warnings():
         warnings.simplefilter("ignore", FutureWarning)
-        for page_number, im_from_gallery in zip(range(first_page, last_page), gallery):
             output_path_page = output_path_book / str(page_number)
             gr.Info(f"Digitizing {page_number = }")
@@ -100,32 +100,63 @@ def run_dawsonia(
                     output_text_fmt=False,
                     debug=False,
                 )
             progress_value = (page_number - first_page) / max(1, last_page - first_page)
-            if results := read_page(
-                output_path_book,
-                str(page_number),
-                prob_thresh,
-                progress,
-                progress_value,
-                table_fmt.preproc.idx_tables_size_verify,
-            ):  # , im_from_gallery[0])
-                page, im = results
-                collection.append(page)
-                images.append(im)
-            else:
-                gr.Info(f"No tables detected in {page_number = }")
-    if final_output_path_book.exists():
-        shutil.rmtree(final_output_path_book)
-    shutil.copytree(output_path_book, final_output_path_book)
     gr.Info("Pages were succesfully digitized ✨")
     # yield collection, images
     yield collection, gr.skip()
 def read_page(
     output_path_book: Path,
     prefix: str,
@@ -156,9 +187,12 @@ def read_page(
         values_array = values_df.values.flatten()
         prob_array = prob_df.values.flatten()
-        # FIXME: hardcoded. Use idx_tables_size_verify and reconstruct bbox_array
-        bbox_array = np.hstack(table_meta["table_positions"][:2]).reshape(-1, 4)\
         cells = [
             make_cell(value, bbox)
             for value, prob, bbox in zip(values_array, prob_array, bbox_array)
@@ -199,7 +233,9 @@ def get_selected_example_image(
     station_tf = Path("table_formats", name).with_suffix(".toml")
     if (last_page - first_page) > MAX_IMAGES:
-        raise ValueError(f"Maximum images you can digitize is set to: {MAX_IMAGES}")
     if name in PIPELINES:
         book_path = pooch.retrieve(**PIPELINES[name], path=DATA_CACHE)
@@ -214,6 +250,7 @@ def get_selected_example_image(
             station_tf.read_text(),
         )
 def move_uploaded_file(uploaded, table_fmt_filename):
     current_directory = Path(uploaded).parent
@@ -230,8 +267,9 @@ def move_uploaded_file(uploaded, table_fmt_filename):
     print(f"Copy created", true_path)
     return str(true_path)
 def get_uploaded_image(
-    first_page:int, last_page:int, table_fmt_filename:str, filename: str
 ) -> tuple[list[NDArray], io.Book, str, str] | None:
     name, _ext = filename.split(".")
@@ -248,7 +286,8 @@ def get_uploaded_image(
         filename,
         station_tf.read_text(),
     )
 def overwrite_table_format_file(book: io.Book, book_path, table_fmt: str):
     name = book.station_name
     table_fmt_dir = Path("table_formats")
@@ -256,4 +295,3 @@ def overwrite_table_format_file(book: io.Book, book_path, table_fmt: str):
     book.table_format = io.read_specific_table_format(table_fmt_dir, Path(book_path))
     gr.Info(f"Overwritten table format file for {name}")
     return book

 import pandas as pd
 import pooch
+from .visualizer_functions import Page, TableCell
 # Max number of images a user can upload at once
 MAX_IMAGES = int(os.environ.get("MAX_IMAGES", 5))
     with warnings.catch_warnings():
         warnings.simplefilter("ignore", FutureWarning)
+        for page_number in range(first_page, last_page):
             output_path_page = output_path_book / str(page_number)
             gr.Info(f"Digitizing {page_number = }")
                     output_text_fmt=False,
                     debug=False,
                 )
+            _synctree(output_path_book, final_output_path_book)
             progress_value = (page_number - first_page) / max(1, last_page - first_page)
+    # if final_output_path_book.exists():
+    #     shutil.rmtree(final_output_path_book)
+    # shutil.copytree(output_path_book, final_output_path_book)
+    for page_number, im_from_gallery in zip(range(first_page, last_page), gallery):
+        if results := read_page(
+            final_output_path_book,
+            str(page_number),
+            prob_thresh,
+            progress,
+            1.0,
+            table_fmt.preproc.idx_tables_size_verify,
+        ):  # , im_from_gallery[0])
+            page, im = results
+            collection.append(page)
+            images.append(im)
+            yield collection, gr.skip()
+        else:
+            gr.Info(f"No tables detected in {page_number = }")
     gr.Info("Pages were succesfully digitized ✨")
     # yield collection, images
     yield collection, gr.skip()
+def _synctree(source_dir, dest_dir):
+    source_dir = Path(source_dir)
+    dest_dir = Path(dest_dir)
+    if not dest_dir.exists():
+        dest_dir.mkdir(parents=True)
+    for root, _, files in os.walk(source_dir):
+        root = Path(root)
+        relative_root = root.relative_to(source_dir)
+        # Create subdirectories in the destination directory
+        dest_subdir_path = dest_dir / relative_root
+        if not dest_subdir_path.exists():
+            dest_subdir_path.mkdir(parents=True, exist_ok=True)
+        for file_ in files:
+            source_file_path = root / file_
+            dest_file_path = dest_subdir_path / file_
+            # Copy only if the destination file does not exist yet or the source file has a newer modification time
+            if (
+                not dest_file_path.exists()
+                or (source_file_path.stat().st_mtime - dest_file_path.stat().st_mtime) > 0
+            ):
+                shutil.copy2(source_file_path, dest_file_path)
 def read_page(
     output_path_book: Path,
     prefix: str,
         values_array = values_df.values.flatten()
         prob_array = prob_df.values.flatten()
+        # FIXME: hardcoded to handle up to 2 tables. Use idx_tables_size_verify and reconstruct bbox_array
+        try:
+            bbox_array = np.hstack(table_meta["table_positions"][:2]).reshape(-1, 4)
+        except ValueError:
+            bbox_array = np.reshape(table_meta["table_positions"][0], (-1, 4))
         cells = [
             make_cell(value, bbox)
             for value, prob, bbox in zip(values_array, prob_array, bbox_array)
     station_tf = Path("table_formats", name).with_suffix(".toml")
     if (last_page - first_page) > MAX_IMAGES:
+        error = f"Maximum images you can digitize is set to: {MAX_IMAGES}"
+        gr.Warning(error)
+        raise ValueError(error)
     if name in PIPELINES:
         book_path = pooch.retrieve(**PIPELINES[name], path=DATA_CACHE)
             station_tf.read_text(),
         )
 def move_uploaded_file(uploaded, table_fmt_filename):
     current_directory = Path(uploaded).parent
     print(f"Copy created", true_path)
     return str(true_path)
 def get_uploaded_image(
+    first_page: int, last_page: int, table_fmt_filename: str, filename: str
 ) -> tuple[list[NDArray], io.Book, str, str] | None:
     name, _ext = filename.split(".")
         filename,
         station_tf.read_text(),
     )
 def overwrite_table_format_file(book: io.Book, book_path, table_fmt: str):
     name = book.station_name
     table_fmt_dir = Path("table_formats")
     book.table_format = io.read_specific_table_format(table_fmt_dir, Path(book_path))
     gr.Info(f"Overwritten table format file for {name}")
     return book

app/tabs/visualizer.py CHANGED Viewed

@@ -1,66 +1,10 @@
-import os
 import gradio as gr
 from jinja2 import Environment, FileSystemLoader
-from typing_extensions import TypeAlias
 _ENV = Environment(loader=FileSystemLoader("app/assets/jinja-templates"))
 _IMAGE_TEMPLATE = _ENV.get_template("image.j2")
-from typing import NamedTuple, TypeAlias
-from dawsonia.typing import BBoxTuple
-class TableCell(NamedTuple):
-    polygon: tuple[tuple[int, int], ...]
-    text_x: int
-    text_y: int
-    text: str
-class Page(NamedTuple):
-    width: int
-    height: int
-    cells: list[TableCell]
-    path: str
-Collection: TypeAlias = list[Page]
-def render_image(collection: Collection, current_page_index: int) -> str:
-    return _IMAGE_TEMPLATE.render(
-        page=collection[current_page_index],
-    )
-def toggle_navigation_button(collection: Collection):
-    visible = len(collection) > 1
-    return gr.update(visible=visible)
-def activate_left_button(current_page_index):
-    interactive = current_page_index > 0
-    return gr.update(interactive=interactive)
-def activate_right_button(collection: Collection, current_page_index):
-    interactive = current_page_index + 1 < len(collection)
-    return gr.update(interactive=interactive)
-def right_button_click(collection: Collection, current_page_index):
-    max_index = len(collection) - 1
-    return min(max_index, current_page_index + 1)
-def left_button_click(current_page_index):
-    return max(0, current_page_index - 1)
-def update_image_caption(collection: Collection, current_page_index):
-    n_pages = len(collection)
-    label = os.path.split(collection[current_page_index].path)[-1]
-    return f"image {current_page_index + 1} of {n_pages}: `{label}`"
 with gr.Blocks() as visualizer:

 import gradio as gr
 from jinja2 import Environment, FileSystemLoader
 _ENV = Environment(loader=FileSystemLoader("app/assets/jinja-templates"))
 _IMAGE_TEMPLATE = _ENV.get_template("image.j2")
+from .visualizer_functions import render_image, toggle_navigation_button, activate_left_button, activate_right_button, right_button_click, left_button_click, update_image_caption
 with gr.Blocks() as visualizer:

app/tabs/visualizer_functions.py ADDED Viewed

	@@ -0,0 +1,62 @@

+import os
+import gradio as gr
+from jinja2 import Environment, FileSystemLoader
+from typing_extensions import TypeAlias
+_ENV = Environment(loader=FileSystemLoader("app/assets/jinja-templates"))
+_IMAGE_TEMPLATE = _ENV.get_template("image.j2")
+from typing import NamedTuple, TypeAlias
+class TableCell(NamedTuple):
+    polygon: tuple[tuple[int, int], ...]
+    text_x: int
+    text_y: int
+    text: str
+class Page(NamedTuple):
+    width: int
+    height: int
+    cells: list[TableCell]
+    path: str
+Collection: TypeAlias = list[Page]
+def render_image(collection: Collection, current_page_index: int) -> str:
+    return _IMAGE_TEMPLATE.render(
+        page=collection[current_page_index],
+    )
+def toggle_navigation_button(collection: Collection):
+    visible = len(collection) > 1
+    return gr.update(visible=visible)
+def activate_left_button(current_page_index):
+    interactive = current_page_index > 0
+    return gr.update(interactive=interactive)
+def activate_right_button(collection: Collection, current_page_index):
+    interactive = current_page_index + 1 < len(collection)
+    return gr.update(interactive=interactive)
+def right_button_click(collection: Collection, current_page_index):
+    max_index = len(collection) - 1
+    return min(max_index, current_page_index + 1)
+def left_button_click(current_page_index):
+    return max(0, current_page_index - 1)
+def update_image_caption(collection: Collection, current_page_index):
+    n_pages = len(collection)
+    label = os.path.split(collection[current_page_index].path)[-1]
+    return f"image {current_page_index + 1} of {n_pages}: `{label}`"