"""Find duplicate pages or lines in OCR output and turn matches into redactions.

The module combines OCR CSV files, scores page (or line) similarity with
TF-IDF + cosine similarity, groups matches with one of three strategies, and
offers Gradio callbacks plus helpers that convert matches into annotation
boxes for the review file.
"""
import os
import re
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import en_core_web_lg
import gradio as gr
import pandas as pd
from gradio import Progress
from pymupdf import Document
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from tools.helper_functions import OUTPUT_FOLDER
from tools.file_conversion import (
    redact_whole_pymupdf_page,
    convert_annotation_data_to_dataframe,
    fill_missing_box_ids_each_box,
)

nlp = en_core_web_lg.load()

similarity_threshold = 0.95


def combine_ocr_output_text(input_files: List[str], combine_pages: bool = True, output_folder: str = OUTPUT_FOLDER):
    """
    Combine text from one or more OCR CSV files that contain 'page' and 'text' columns.

    Args:
        input_files (list | str): Path(s) to CSV files. Items may also be
            file-like objects exposing the path as ``.name``.
        combine_pages (bool): If True, all text on a page is joined into one
            row per page. If False, every input row is kept and 'page' is
            replaced by a combined integer ID: the page number zero-padded to
            5 digits followed by the line number on that page zero-padded to
            5 digits (see ``_parse_page_line_id``).
        output_folder (str): Folder the combined CSV is written to.

    Returns:
        Tuple[pd.DataFrame, List[str]]: The combined dataframe with columns
        [file, page, text], and a list with the path of the CSV written.

    Raises:
        ValueError: If no input file had the required columns.
    """
    all_data = []
    output_files = []

    file_paths_list = [input_files] if isinstance(input_files, str) else input_files

    for file in file_paths_list:
        file_path = file if isinstance(file, str) else file.name

        df = pd.read_csv(file_path)

        # Ensure required columns exist
        if 'page' not in df.columns or 'text' not in df.columns:
            print(f"Warning: Skipping {file_path} - missing required columns 'page' and 'text'")
            continue

        df['text'] = df['text'].fillna('').astype(str)

        if combine_pages:
            # One row per page, text joined with single spaces
            grouped = df.groupby('page')['text'].apply(' '.join).reset_index()
        else:
            # Line-level matching: number lines in file order within each page
            # and fold page + line into one sortable integer ID.
            df['line_number_by_page'] = df.groupby('page').cumcount() + 1
            df['original_page'] = df['page']
            df['page'] = df['page'].astype(str).str.zfill(5) + df['line_number_by_page'].astype(str).str.zfill(5)
            df['page'] = df['page'].astype(int)
            grouped = df

        grouped['file'] = os.path.basename(file_path)
        all_data.append(grouped)

    if not all_data:
        raise ValueError("No valid CSV files were processed")

    combined_df = pd.concat(all_data, ignore_index=True)
    combined_df = combined_df[['file', 'page', 'text']]

    # os.path.join works whether or not output_folder ends with a separator
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)
    output_combined_file_path = os.path.join(output_folder, "combined_ocr_output_files.csv")
    combined_df.to_csv(output_combined_file_path, index=False)
    output_files.append(output_combined_file_path)

    return combined_df, output_files


def process_data(df: pd.DataFrame, column: str):
    '''
    Clean and lemmatise a text column, writing the result to 'text_clean'.

    HTML-like tags are stripped and whitespace collapsed; the text is then run
    through the spaCy pipeline, keeping the lemmas of alphabetic, non-stopword
    tokens. The dataframe is modified in place and also returned.
    '''

    def _clean_text(raw_text):
        # Remove HTML tags, then collapse runs of whitespace
        without_tags = re.sub(r'<.*?>', '', raw_text)
        return ' '.join(without_tags.split())

    def _lemmatise(doc):
        # Keep only alphabetic tokens and remove stopwords
        return ' '.join(token.lemma_ for token in doc if token.is_alpha and not token.is_stop)

    cleaned_texts = df[column].apply(_clean_text)
    # nlp.pipe processes texts in batches, which is much faster than one nlp() call per row
    df['text_clean'] = [_lemmatise(doc) for doc in nlp.pipe(cleaned_texts.tolist())]

    return df


def map_metadata_single_page(similarity_df: pd.DataFrame, metadata_source_df: pd.DataFrame, preview_length: int = 200):
    """
    Attach file / page / text metadata to single-page match results.

    Args:
        similarity_df: Rows with 'Page1_Index', 'Page2_Index' and
            'Similarity_Score'; the index columns refer to the index of
            ``metadata_source_df``.
        metadata_source_df: Frame with 'file', 'page' and 'text' columns.
        preview_length: Number of characters of page text to keep.

    Returns:
        pd.DataFrame: Matches sorted by file and page, with truncated text previews.
    """
    metadata_df = metadata_source_df[['file', 'page', 'text']]

    results_df = similarity_df.merge(
        metadata_df, left_on='Page1_Index', right_index=True
    ).rename(columns={'file': 'Page1_File', 'page': 'Page1_Page', 'text': 'Page1_Text'})

    results_df = results_df.merge(
        metadata_df, left_on='Page2_Index', right_index=True
    ).rename(columns={'file': 'Page2_File', 'page': 'Page2_Page', 'text': 'Page2_Text'})

    results_df["Similarity_Score"] = results_df["Similarity_Score"].round(3)

    # Explicit copy so the preview truncation below never writes into a view
    final_df = results_df[
        ['Page1_File', 'Page1_Page', 'Page2_File', 'Page2_Page', 'Similarity_Score', 'Page1_Text', 'Page2_Text']
    ].copy()
    final_df = final_df.sort_values(["Page1_File", "Page1_Page", "Page2_File", "Page2_Page"])
    final_df['Page1_Text'] = final_df['Page1_Text'].str[:preview_length]
    final_df['Page2_Text'] = final_df['Page2_Text'].str[:preview_length]

    return final_df


def map_metadata_subdocument(subdocument_df: pd.DataFrame, metadata_source_df: pd.DataFrame, preview_length: int = 200):
    """
    Attach file / page / text metadata to subdocument (multi-page) match results.

    Args:
        subdocument_df: Rows with 'Page1_Start_Index', 'Page1_End_Index',
            'Page2_Start_Index', 'Page2_End_Index', 'Match_Length' and
            optionally 'Avg_Similarity'; index columns refer to the index of
            ``metadata_source_df``.
        metadata_source_df: Frame with 'file', 'page' and 'text' columns.
        preview_length: Number of characters of text to keep. The preview text
            is taken from the first page of each range.

    Returns:
        pd.DataFrame: Matches sorted by file and start page.
    """
    metadata_df = metadata_source_df[['file', 'page', 'text']]

    subdocument_df = subdocument_df.merge(
        metadata_df, left_on='Page1_Start_Index', right_index=True
    ).rename(columns={'file': 'Page1_File', 'page': 'Page1_Start_Page', 'text': 'Page1_Text'})

    subdocument_df = subdocument_df.merge(
        metadata_df[['page']], left_on='Page1_End_Index', right_index=True
    ).rename(columns={'page': 'Page1_End_Page'})

    subdocument_df = subdocument_df.merge(
        metadata_df, left_on='Page2_Start_Index', right_index=True
    ).rename(columns={'file': 'Page2_File', 'page': 'Page2_Start_Page', 'text': 'Page2_Text'})

    subdocument_df = subdocument_df.merge(
        metadata_df[['page']], left_on='Page2_End_Index', right_index=True
    ).rename(columns={'page': 'Page2_End_Page'})

    cols = ['Page1_File', 'Page1_Start_Page', 'Page1_End_Page',
            'Page2_File', 'Page2_Start_Page', 'Page2_End_Page',
            'Match_Length', 'Page1_Text', 'Page2_Text']

    # Avg_Similarity is only produced by the fixed-length strategy, not the greedy one
    if 'Avg_Similarity' in subdocument_df.columns:
        subdocument_df['Avg_Similarity'] = subdocument_df['Avg_Similarity'].round(3)
        cols.insert(7, 'Avg_Similarity')  # directly after Match_Length

    # Explicit copy so the preview truncation below never writes into a view
    final_df = subdocument_df[cols].copy()
    final_df = final_df.sort_values(['Page1_File', 'Page1_Start_Page', 'Page2_File', 'Page2_Start_Page'])
    final_df['Page1_Text'] = final_df['Page1_Text'].str[:preview_length]
    final_df['Page2_Text'] = final_df['Page2_Text'].str[:preview_length]

    return final_df


def save_results_and_redaction_lists(final_df: pd.DataFrame, output_folder: str, combine_pages: bool = True) -> list:
    """
    Save the main results DataFrame and generate per-file redaction lists.

    Args:
        final_df (pd.DataFrame): The DataFrame containing the final match results.
        output_folder (str): The folder to save the output files (created if missing,
            including parent folders).
        combine_pages (bool, optional): Whether page text was combined before
            matching. Per-file page redaction lists are only written when this is
            True; for line-by-line matching only the main results file is saved.

    Returns:
        list: Paths of all generated files. Empty if ``final_df`` is empty.
    """
    output_paths = []
    output_folder_path = Path(output_folder)
    output_folder_path.mkdir(parents=True, exist_ok=True)

    if final_df.empty:
        print("No matches to save.")
        return []

    # 1. Save the main results DataFrame
    similarity_file_output_path = output_folder_path / 'page_similarity_results.csv'
    final_df.to_csv(similarity_file_output_path, index=False)
    output_paths.append(str(similarity_file_output_path))
    print(f"Main results saved to {similarity_file_output_path}")

    if not combine_pages:
        return output_paths

    # 2. Save per-file redaction lists, using 'Page2_File' as the source of duplicate content
    grouping_col = 'Page2_File'
    if grouping_col not in final_df.columns:
        print("Warning: 'Page2_File' column not found. Cannot generate redaction lists.")
        return output_paths

    for redact_file, group in final_df.groupby(grouping_col):
        output_file_name_stem = Path(redact_file).stem
        output_file_path = output_folder_path / f"{output_file_name_stem}_pages_to_redact.csv"

        all_pages_to_redact = set()
        if 'Page2_Start_Page' in group.columns:
            # Subdocument match: every page in each [start, end] range is a duplicate
            for start_page, end_page in zip(group['Page2_Start_Page'], group['Page2_End_Page']):
                all_pages_to_redact.update(range(int(start_page), int(end_page) + 1))
        else:
            all_pages_to_redact.update(group['Page2_Page'].unique())

        if all_pages_to_redact:
            redaction_df = pd.DataFrame(sorted(all_pages_to_redact), columns=['Page_to_Redact'])
            redaction_df.to_csv(output_file_path, header=False, index=False)
            output_paths.append(str(output_file_path))
            print(f"Redaction list for {redact_file} saved to {output_file_path}")

    return output_paths


def _find_greedy_sequences(base_similarity_df: pd.DataFrame) -> pd.DataFrame:
    """Grow each unused matching pair into the longest run of consecutive matching pairs."""
    # Sorted so that results are deterministic and runs are discovered from their first pair
    pairs = sorted(zip(
        base_similarity_df['Page1_Index'].astype(int).tolist(),
        base_similarity_df['Page2_Index'].astype(int).tolist(),
    ))
    valid_pairs = set(pairs)

    # Indices already claimed by an earlier sequence, per side of the match
    consumed_1 = set()
    consumed_2 = set()
    sequences = []

    for start_1, start_2 in pairs:
        if start_1 in consumed_1 or start_2 in consumed_2:
            continue

        # Extend while the next diagonal pair is a valid, unclaimed match
        length = 1
        while (
            (start_1 + length, start_2 + length) in valid_pairs
            and (start_1 + length) not in consumed_1
            and (start_2 + length) not in consumed_2
        ):
            length += 1

        sequences.append({
            'Page1_Start_Index': start_1,
            'Page1_End_Index': start_1 + length - 1,
            'Page2_Start_Index': start_2,
            'Page2_End_Index': start_2 + length - 1,
            'Match_Length': length,
        })
        consumed_1.update(range(start_1, start_1 + length))
        consumed_2.update(range(start_2, start_2 + length))

    return pd.DataFrame(sequences)


def _find_fixed_length_sequences(base_similarity_df: pd.DataFrame, min_consecutive_pages: int) -> pd.DataFrame:
    """Group adjacent matching pairs into blocks and keep blocks of at least the minimum length."""
    similarity_df = base_similarity_df.sort_values(['Page1_Index', 'Page2_Index'])

    # A row continues the previous block when both indices advance by exactly one
    is_consecutive = (similarity_df['Page1_Index'].diff() == 1) & (similarity_df['Page2_Index'].diff() == 1)
    block_id = is_consecutive.eq(False).cumsum()

    agg_results = similarity_df.groupby(block_id).agg(
        Page1_Start_Index=('Page1_Index', 'first'),
        Page2_Start_Index=('Page2_Index', 'first'),
        Page1_End_Index=('Page1_Index', 'last'),
        Page2_End_Index=('Page2_Index', 'last'),
        Match_Length=('Page1_Index', 'size'),
        Avg_Similarity=('Similarity_Score', 'mean'),
    ).reset_index(drop=True)

    return agg_results[agg_results['Match_Length'] >= min_consecutive_pages].copy()


def identify_similar_pages(
    df_combined: pd.DataFrame,
    similarity_threshold: float = 0.9,
    min_word_count: int = 10,
    min_consecutive_pages: int = 1,
    greedy_match: bool = False,
    combine_pages: bool = True,
    output_folder: str = OUTPUT_FOLDER,
    progress=Progress(track_tqdm=True)
) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
    """
    Identify similar pages with three possible strategies:
    1. Single Page: If greedy_match=False and min_consecutive_pages=1.
    2. Fixed-Length Subdocument: If greedy_match=False and min_consecutive_pages > 1.
    3. Greedy Consecutive Match: If greedy_match=True.

    Args:
        df_combined: Frame with 'file', 'page' and 'text' columns. It is
            modified in place: 'text_clean' and 'word_count' columns are added.
        similarity_threshold: Minimum cosine similarity for a pair to count as a match.
        min_word_count: Rows with fewer cleaned words than this are ignored.
        min_consecutive_pages: Minimum run length for strategy 2.
        greedy_match: Use strategy 3 when True.
        combine_pages: Passed through to ``save_results_and_redaction_lists``.
        output_folder: Folder for the output files.
        progress: Gradio progress tracker.

    Returns:
        Tuple of (results dataframe, list of output file paths, df_combined).
        The results dataframe and path list are empty when nothing matched.

    NOTE(review): indices used for "consecutive" matching are row positions in
    the filtered, concatenated frame, so a run could in principle continue
    across a file boundary or skip rows removed by the word-count filter -
    confirm whether that is intended.
    """
    progress(0.1, desc="Processing and filtering text")
    df = process_data(df_combined, 'text')
    df['word_count'] = df['text_clean'].str.split().str.len().fillna(0)

    original_row_count = len(df)
    df_filtered = df[df['word_count'] >= min_word_count].copy()
    df_filtered.reset_index(drop=True, inplace=True)
    print(f"Filtered out {original_row_count - len(df_filtered)} pages with fewer than {min_word_count} words.")

    if len(df_filtered) < 2:
        return pd.DataFrame(), [], df_combined

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df_filtered['text_clean'])

    progress(0.3, desc="Calculating text similarity")
    similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False)
    coo_matrix = similarity_matrix.tocoo()

    # All individual pairs above the threshold; this is the base for all three
    # strategies. row < col keeps each unordered pair once and drops self-matches.
    keep = (coo_matrix.row < coo_matrix.col) & (coo_matrix.data >= similarity_threshold)
    if not keep.any():
        return pd.DataFrame(), [], df_combined

    base_similarity_df = pd.DataFrame({
        'Page1_Index': coo_matrix.row[keep].astype('int64'),
        'Page2_Index': coo_matrix.col[keep].astype('int64'),
        'Similarity_Score': coo_matrix.data[keep],
    })

    progress(0.6, desc="Aggregating results based on matching strategy")

    if greedy_match:
        # --- STRATEGY 3: Greedy Consecutive Matching ---
        print("Finding matches using greedy consecutive strategy.")
        subdocument_df = _find_greedy_sequences(base_similarity_df)
        if subdocument_df.empty:
            return pd.DataFrame(), [], df_combined
        final_df = map_metadata_subdocument(subdocument_df, df_filtered)

    elif min_consecutive_pages > 1:
        # --- STRATEGY 2: Fixed-Length Subdocument Matching ---
        print(f"Finding consecutive page matches (min_consecutive_pages > 1)")
        subdocument_df = _find_fixed_length_sequences(base_similarity_df, min_consecutive_pages)
        if subdocument_df.empty:
            return pd.DataFrame(), [], df_combined
        final_df = map_metadata_subdocument(subdocument_df, df_filtered)

    else:
        # --- STRATEGY 1: Single Page Matching ---
        print(f"Finding single page matches (min_consecutive_pages=1)")
        final_df = map_metadata_single_page(base_similarity_df, df_filtered)

    progress(0.8, desc="Saving output files")
    output_paths = save_results_and_redaction_lists(final_df, output_folder, combine_pages)

    return final_df, output_paths, df_combined


# ==============================================================================
# GRADIO HELPER FUNCTIONS
# ==============================================================================

def handle_selection_and_preview(evt: gr.SelectData, results_df: pd.DataFrame, full_duplicate_data_by_file: dict):
    """
    Handle a user selecting a row in the results table.

    1. Determines the selected row index.
    2. Calls ``show_page_previews`` to get the text data.
    3. Returns (selected index, page 1 preview, page 2 preview) for the UI.
    """
    # If the user deselects, the event might be None: clear state and both preview panes
    if not evt:
        return None, None, None

    selected_index = evt.index[0]
    page1_data, page2_data = show_page_previews(full_duplicate_data_by_file, results_df, evt)

    return selected_index, page1_data, page2_data


def exclude_match(results_df: pd.DataFrame, selected_index: int, output_folder="./output/"):
    """
    Remove a selected row from the results DataFrame, regenerate output files,
    and clear the text preview panes.

    Args:
        results_df: Current results table.
        selected_index: Zero-based row position of the match to remove (the
            same positional index used by ``show_page_previews``).
        output_folder: Folder the regenerated output files are written to.

    Returns:
        Tuple of (updated dataframe, new output paths or ``gr.update()`` when
        nothing changed, None, None) - the two Nones clear the preview panes.
    """
    if selected_index is None:
        gr.Warning("No match selected. Please click on a row in the table first.")
        # Return the original dataframe and leave the file list untouched
        return results_df, gr.update(), None, None

    if results_df is None or results_df.empty:
        gr.Warning("No duplicate page results found, nothing to exclude.")
        return results_df, gr.update(), None, None

    # A stale selection (e.g. after a previous exclusion) must not crash the callback
    if not (0 <= selected_index < len(results_df)):
        gr.Warning("No match selected. Please click on a row in the table first.")
        return results_df, gr.update(), None, None

    # Drop by position so this agrees with the iloc-based lookup in show_page_previews
    updated_df = results_df.drop(index=results_df.index[selected_index]).reset_index(drop=True)

    # Recalculate all output files using the helper function
    new_output_paths = save_results_and_redaction_lists(updated_df, output_folder)

    gr.Info(f"Match at row {selected_index} excluded. Output files have been updated.")

    return updated_df, new_output_paths, None, None


def run_duplicate_analysis(files: List[str], threshold: float, min_words: int, min_consecutive: int, greedy_match: bool, combine_pages: bool = True, preview_length: int = 500, progress=gr.Progress(track_tqdm=True)):
    """
    Gradio wrapper: combine the uploaded OCR files and run the duplicate analysis.

    Args:
        files: Paths (or file-like objects with ``.name``) of OCR CSV files.
        threshold: Similarity threshold passed to ``identify_similar_pages``.
        min_words: Minimum cleaned word count per row.
        min_consecutive: Minimum consecutive pages for subdocument matching.
        greedy_match: Whether to use the greedy consecutive strategy.
        combine_pages: Whether to combine all text on a page before matching.
        preview_length: Number of characters of text kept in the preview data.
        progress: Gradio progress tracker.

    Returns:
        Tuple of (results dataframe, output file paths, dict mapping each file
        name to its rows indexed by page). Three Nones if there was no input.
    """
    if not files:
        gr.Warning("Please upload files to analyze.")
        return None, None, None

    progress(0, desc="Combining input files...")
    df_combined, _ = combine_ocr_output_text(files, combine_pages=combine_pages)

    if df_combined.empty:
        gr.Warning("No data found in the uploaded files.")
        return None, None, None

    results_df, output_paths, full_df = identify_similar_pages(
        df_combined=df_combined,
        similarity_threshold=threshold,
        min_word_count=min_words,
        min_consecutive_pages=int(min_consecutive),
        greedy_match=greedy_match,
        combine_pages=combine_pages,
        progress=progress
    )

    # Clip text to the first `preview_length` characters
    full_df['text'] = full_df['text'].str[:preview_length]

    # Pre-partition the full data by file, indexed by page, for fast preview lookups
    full_data_by_file = {
        file: df.sort_values('page').set_index('page')
        for file, df in full_df.drop(columns=["text_clean"], errors="ignore").groupby('file')
    }

    if results_df.empty:
        gr.Info("No duplicate pages found, no results returned.")

    return results_df, output_paths, full_data_by_file


def show_page_previews(full_data_by_file: dict, results_df: pd.DataFrame, evt: gr.SelectData, preview_length: int = 500):
    """
    Return text previews for both sides of the selected match.

    Uses the pre-partitioned, page-indexed data built by
    ``run_duplicate_analysis``. Triggered when a user selects a row in the
    results DataFrame.

    Returns:
        Tuple of two dataframes with columns ['page', 'text'], or (None, None)
        when there is nothing to show.
    """
    if not full_data_by_file or results_df is None or not evt:
        return None, None

    selected_row = results_df.iloc[evt.index[0], :]

    if 'Page1_Start_Page' in selected_row:
        # Subdocument match: show every page in each inclusive range
        file1, start1, end1 = selected_row['Page1_File'], selected_row['Page1_Start_Page'], selected_row['Page1_End_Page']
        file2, start2, end2 = selected_row['Page2_File'], selected_row['Page2_Start_Page'], selected_row['Page2_End_Page']

        page1_data = full_data_by_file[file1].loc[start1:end1, ['text']].reset_index()
        page2_data = full_data_by_file[file2].loc[start2:end2, ['text']].reset_index()
    else:
        file1, page1 = selected_row['Page1_File'], selected_row['Page1_Page']
        file2, page2 = selected_row['Page2_File'], selected_row['Page2_Page']

        page1_data = full_data_by_file[file1].loc[[page1], ['text']].reset_index()
        page2_data = full_data_by_file[file2].loc[[page2], ['text']].reset_index()

    page1_data['text'] = page1_data['text'].str[:preview_length]
    page2_data['text'] = page2_data['text'].str[:preview_length]

    return page1_data[['page', 'text']], page2_data[['page', 'text']]


def get_page_image_info(page_num: int, page_sizes: List[Dict]) -> Optional[Dict]:
    """
    Return the first entry of ``page_sizes`` whose "page" equals ``page_num``, or None.
    """
    return next((size for size in page_sizes if size["page"] == page_num), None)


def add_new_annotations_to_existing_page_annotations(
    all_annotations: List[Dict],
    image_path: str,
    new_annotation_boxes: List[Dict]
) -> Tuple[List[Dict], Dict]:
    """
    Add a list of new annotation boxes to the annotations for a specific page.

    If the page already has annotations, it extends the list of boxes. If not,
    it creates a new entry for the page. ``all_annotations`` is modified in place.

    Args:
        all_annotations (List[Dict]): The current list of all annotation groups.
        image_path (str): The identifier for the image/page.
        new_annotation_boxes (List[Dict]): A list of new annotation boxes to add.

    Returns:
        Tuple[List[Dict], Dict]: A tuple containing:
            - The updated list of all annotation groups.
            - The annotation group representing the newly added boxes.
    """
    # Find the annotation group for the current page/image
    current_page_group = next(
        (annot_group for annot_group in all_annotations if annot_group["image"] == image_path),
        None
    )

    if current_page_group:
        # Page already has annotations, so extend the list with the new boxes
        current_page_group["boxes"].extend(new_annotation_boxes)
    else:
        # This is the first set of annotations for this page, create a new group
        all_annotations.append({
            "image": image_path,
            "boxes": new_annotation_boxes
        })

    # The box dicts are deliberately shared (not copied) with all_annotations,
    # so any later in-place update of the returned group's boxes is also
    # visible through all_annotations.
    newly_added_annotation_group = {
        "image": image_path,
        "boxes": new_annotation_boxes
    }

    return all_annotations, newly_added_annotation_group


def apply_whole_page_redactions_from_list(duplicate_page_numbers_df: pd.DataFrame, doc_file_name_with_extension_textbox: str, review_file_state: pd.DataFrame, duplicate_output_paths: list[str], pymupdf_doc: object, page_sizes: list[dict], all_existing_annotations: list[dict], combine_pages: bool = True, new_annotations_with_bounding_boxes: Optional[List[dict]] = None):
    '''
    Take a list of suggested whole pages to redact and apply it to review file data.

    Args:
        duplicate_page_numbers_df: Frame whose first column lists page numbers to redact.
        doc_file_name_with_extension_textbox: Name of the document under review;
            used to pick the matching file from ``duplicate_output_paths``.
        review_file_state: Current review dataframe.
        duplicate_output_paths: Paths of previously generated redaction lists,
            used when ``duplicate_page_numbers_df`` is empty.
        pymupdf_doc: The open document under review.
        page_sizes: Per-page metadata dicts ("page", "image_path",
            "cropbox_height", "cropbox_width").
        all_existing_annotations: Current annotation groups.
        combine_pages: True for whole-page redactions; False to add the line-level
            boxes supplied in ``new_annotations_with_bounding_boxes``.
        new_annotations_with_bounding_boxes: Annotation groups to add when
            ``combine_pages`` is False.

    Returns:
        Tuple of (updated review dataframe, updated annotation groups).

    Raises:
        Warning: If no document is loaded, no matching redaction list is found,
            or (line mode) no new annotations were supplied.
    '''
    if new_annotations_with_bounding_boxes is None:
        new_annotations_with_bounding_boxes = []

    # NOTE: shallow copy - the group dicts are still shared with the caller's list
    all_annotations = all_existing_annotations.copy()

    if not pymupdf_doc:
        message = "No document file currently under review."
        print(f"Warning: {message}")
        raise Warning(message)

    list_whole_pages_to_redact = []

    if combine_pages:
        # Get list of pages to redact from either dataframe or file
        if duplicate_page_numbers_df is not None and not duplicate_page_numbers_df.empty:
            list_whole_pages_to_redact = duplicate_page_numbers_df.iloc[:, 0].tolist()
        elif duplicate_output_paths:
            expected_duplicate_pages_to_redact_name = f"{doc_file_name_with_extension_textbox}"
            matching_output_file = None

            for output_file in duplicate_output_paths:
                # Entries may be plain string paths or objects exposing the path as .name
                raw_name = output_file if isinstance(output_file, str) else output_file.name
                file_name_from_path = os.path.basename(str(raw_name))
                if expected_duplicate_pages_to_redact_name in file_name_from_path:
                    matching_output_file = output_file
                    break

            if matching_output_file is None:
                message = "No relevant list of whole pages to redact found."
                print(message)
                raise Warning(message)

            whole_pages_list = pd.read_csv(matching_output_file, header=None)
            if not whole_pages_list.empty:
                list_whole_pages_to_redact = whole_pages_list.iloc[:, 0].tolist()
    else:
        if not new_annotations_with_bounding_boxes:
            message = "Can't find any new annotations to add"
            print(message)
            raise Warning(message)

        for annotation in new_annotations_with_bounding_boxes:
            # Image names end in "_<zero-based page index>.png"
            match = re.search(r'_(\d+)\.png$', annotation["image"])
            if match:
                list_whole_pages_to_redact.append(int(match.group(1)) + 1)
            else:
                print(f"Warning: Could not extract page number from {annotation['image']}")

    # De-duplicate while keeping a deterministic (first-seen) order
    list_whole_pages_to_redact = list(dict.fromkeys(list_whole_pages_to_redact))

    new_annotations = []

    # Process each page for redaction
    for page in list_whole_pages_to_redact:
        # Best effort: a failure on one page is reported and the rest continue
        try:
            page_num = int(page)
            page_index = page_num - 1
            if not (0 <= page_index < len(pymupdf_doc)):
                print(f"Page {page_num} is out of bounds, skipping.")
                continue

            page_info = get_page_image_info(page_num, page_sizes)
            if not page_info:
                print(f"Page {page_num} not found in page_sizes, skipping.")
                continue

            image_path = page_info["image_path"]
            page_annotation_group = next((g for g in all_annotations if g["image"] == image_path), None)
            if page_annotation_group and any(
                box.get("label") == "Whole page" for box in page_annotation_group["boxes"]
            ):
                print(f"Whole page redaction for page {page_num} already exists, skipping.")
                continue

            # --- Create a LIST of boxes to add ---
            boxes_to_add = []
            pymupdf_page = pymupdf_doc[page_index]

            if combine_pages:
                whole_page_box = redact_whole_pymupdf_page(
                    rect_height=page_info["cropbox_height"],
                    rect_width=page_info["cropbox_width"],
                    page=pymupdf_page,
                    border=0.005,
                    redact_pdf=False
                )
                boxes_to_add.append(whole_page_box)
            else:
                # Find the supplied annotation group that matches this page's image path
                relevant_box_group = next(
                    (group for group in new_annotations_with_bounding_boxes if group.get('image') == image_path),
                    None
                )
                if relevant_box_group:
                    boxes_to_add.extend(relevant_box_group['boxes'])
                else:
                    # Unexpected: the page was listed but no boxes were supplied for it
                    print(f"Warning: No new annotation boxes found for page {page_num} ({image_path}).")

            all_annotations, new_annotations_for_page = add_new_annotations_to_existing_page_annotations(
                all_annotations=all_annotations,
                image_path=image_path,
                new_annotation_boxes=boxes_to_add
            )

            new_annotations_for_page = fill_missing_box_ids_each_box(new_annotations_for_page)
            new_annotations.append(new_annotations_for_page)

        except Exception as e:
            print(f"Error processing page {page}: {str(e)}")
            continue

    whole_page_review_file = convert_annotation_data_to_dataframe(new_annotations)

    if whole_page_review_file.empty:
        message = "No new whole page redactions were added."
        print(message)
        gr.Info(message)
        return review_file_state, all_annotations

    # Make sure both frames share the same columns before concatenating
    expected_cols = ['image', 'page', 'label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
    for col in expected_cols:
        if col not in review_file_state.columns:
            review_file_state[col] = pd.NA
        if col not in whole_page_review_file.columns:
            whole_page_review_file[col] = pd.NA

    review_file_out = pd.concat([review_file_state, whole_page_review_file], ignore_index=True)
    review_file_out = review_file_out.sort_values(by=["page", "ymin", "xmin"]).reset_index(drop=True)
    review_file_out = review_file_out.drop_duplicates(subset=['page', 'label', 'text', 'id'], keep='first')

    out_message = "Successfully created whole page redactions."
    print(out_message)
    gr.Info(out_message)

    return review_file_out, all_annotations


def _parse_page_line_id(combined_id: int) -> Tuple[int, int]:
    """
    Parse a combined page and line number ID into a (page, line) tuple.

    The ID is treated as a 10-digit number where the first 5 digits are the
    page and the last 5 are the line number (leading zeros restored).

    Example:
        100027 -> (1, 27)
        200005 -> (2, 5)
    """
    # int() first so float-typed IDs such as 100027.0 (e.g. from a column that
    # contained NaN) do not leak ".0" into the fixed-width slicing below
    s_id = str(int(combined_id)).zfill(10)
    page = int(s_id[:5])
    line = int(s_id[5:])
    return page, line


def create_annotation_objects_from_duplicates(
    duplicates_df: pd.DataFrame,
    ocr_results_df: pd.DataFrame,
    page_sizes: List[Dict],
    combine_pages: bool = False
) -> List[Dict]:
    """
    Create annotation objects from duplicate line ranges, mapping page numbers to image paths.

    Args:
        duplicates_df (pd.DataFrame): Duplicate ranges with 'Page2_Start_Page'
            and 'Page2_End_Page' columns holding combined page/line IDs
            (see ``_parse_page_line_id``).
        ocr_results_df (pd.DataFrame): OCR results with 'page', 'text', 'left',
            'top', 'width' and 'height' columns.
        page_sizes (List[Dict]): Dictionaries mapping page numbers to image paths
            and other metadata. Expected format:
            [{"page": 1, "image_path": "path/to/img.png", ...}]
        combine_pages (bool): Whether all text on a page was combined in the
            earlier steps. This function only produces output when it is False.

    Returns:
        List[Dict]: One dict per page with annotation boxes, in the format:
        [{"image": "path/to/img.png", "boxes": [...]}, ...]
        Empty when ``combine_pages`` is True or nothing matched.
    """
    if combine_pages:
        return []

    page_to_image_map = {item['page']: item['image_path'] for item in page_sizes}

    # Add a per-page line number if the OCR data does not have one yet.
    # NOTE(review): lines are numbered here after sorting by page/top/left,
    # whereas combine_ocr_output_text numbers them in file order - confirm
    # the two orders agree for the OCR files in use.
    if 'line_number_by_page' not in ocr_results_df.columns:
        ocr_results_df = ocr_results_df.sort_values(by=['page', 'top', 'left']).reset_index(drop=True)
        ocr_results_df['line_number_by_page'] = ocr_results_df.groupby('page').cumcount() + 1

    # Union of all duplicate ranges, so a line covered by overlapping ranges
    # is annotated only once.
    selected_lines = pd.Series(False, index=ocr_results_df.index)

    for start_id, end_id in zip(duplicates_df['Page2_Start_Page'], duplicates_df['Page2_End_Page']):
        start_page, start_line = _parse_page_line_id(start_id)
        end_page, end_line = _parse_page_line_id(end_id)

        if start_page == end_page:
            # Range within a single page
            condition = (
                (ocr_results_df['page'] == start_page) &
                (ocr_results_df['line_number_by_page'].between(start_line, end_line))
            )
        else:
            # Range across pages: tail of first page, all middle pages, head of last page
            cond_start = (ocr_results_df['page'] == start_page) & (ocr_results_df['line_number_by_page'] >= start_line)
            cond_middle = ocr_results_df['page'].between(start_page + 1, end_page - 1)
            cond_end = (ocr_results_df['page'] == end_page) & (ocr_results_df['line_number_by_page'] <= end_line)
            condition = cond_start | cond_middle | cond_end

        selected_lines = selected_lines | condition

    # Build annotation boxes grouped by page number
    annotations_by_page = defaultdict(list)
    for _, line_row in ocr_results_df[selected_lines].iterrows():
        annotations_by_page[line_row['page']].append({
            "label": "Duplicate text",
            "color": (0, 0, 0),
            "xmin": line_row['left'],
            "ymin": line_row['top'],
            "xmax": line_row['left'] + line_row['width'],
            "ymax": line_row['top'] + line_row['height'],
            "text": line_row['text'],
            "id": ""  # filled in below
        })

    # Format the final output list, sorted by page number for a predictable order
    final_output = []
    for page_num, boxes in sorted(annotations_by_page.items()):
        image_path = page_to_image_map.get(page_num)

        if image_path:
            page_boxes = {"image": image_path, "boxes": boxes}
            # Fill in missing IDs for the new data entries
            page_boxes = fill_missing_box_ids_each_box(page_boxes)
            final_output.append(page_boxes)
        else:
            print(f"Warning: Page {page_num} found in OCR data but has no corresponding "
                  f"entry in the 'page_sizes' object. This page's annotations will be skipped.")

    return final_output


# --- Example Usage ---
# duplicates_df = pd.DataFrame({
#     'Page1_File': ['doc_a.csv'],
#     'Page1_Start_Page': [100009],
#     'Page1_End_Page': [100021],
#     'Page2_File': ['doc_a.csv'],
#     'Page2_Start_Page': [100027],  # Page 1, Line 27
#     'Page2_End_Page': [200005],    # Page 2, Line 5
# })
# ocr_results_df = pd.DataFrame({
#     'page': [1]*30 + [2]*10,  # 30 lines on page 1, 10 on page 2
#     'text': [f"Text on page {p}, line {l}" for p in [1, 2] for l in range(1, (31 if p == 1 else 11))],
#     'left': [0.1] * 40,
#     'top': [i*0.02 for i in range(30)] + [i*0.02 for i in range(10)],
#     'width': [0.8] * 40,
#     'height': [0.015] * 40,
# })
# page_sizes = [{"page": 1, "image_path": "doc_a_0.png"}, {"page": 2, "image_path": "doc_a_1.png"}]
# generated_annotations = create_annotation_objects_from_duplicates(duplicates_df, ocr_results_df, page_sizes)