import pandas as pd
import os
import re
from tools.helper_functions import OUTPUT_FOLDER
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Tuple, Optional, Dict
from collections import defaultdict
import gradio as gr
from gradio import Progress
from pathlib import Path
from pymupdf import Document
from tools.file_conversion import redact_whole_pymupdf_page, convert_annotation_data_to_dataframe, fill_missing_box_ids_each_box
import en_core_web_lg

nlp = en_core_web_lg.load()

similarity_threshold = 0.95

def combine_ocr_output_text(input_files:List[str], combine_pages:bool=True, output_folder:str=OUTPUT_FOLDER):
    """
    Combines text from multiple CSV files containing page and text columns.
    Groups text by file and page number, concatenating text within these groups.
    
    Args:
        input_files (list): List of paths to CSV files
    
    Returns:
        pd.DataFrame: Combined dataframe with columns [file, page, text]
    """
    all_data = []
    output_files = []

    if isinstance(input_files, str):
        file_paths_list = [input_files]
    else:
        file_paths_list = input_files
    
    for file in file_paths_list:

        if isinstance(file, str):
            file_path = file
        else:
            file_path = file.name

        # Read CSV file
        df = pd.read_csv(file_path)
        
        # Ensure required columns exist
        if 'page' not in df.columns or 'text' not in df.columns:
            print(f"Warning: Skipping {file_path} - missing required columns 'page' and 'text'")
            continue

        df['text'] = df['text'].fillna('').astype(str)
        
        # Group by page and concatenate text
        if combine_pages == True:
            grouped = df.groupby('page')['text'].apply(' '.join).reset_index()
        else:
            df['line_number_by_page'] = df.groupby('page').cumcount() + 1
            df['original_page'] = df['page']
            df['page'] = df['page'].astype(str).str.zfill(5) + df['line_number_by_page'].astype(str).str.zfill(5)
            df['page'] = df['page'].astype(int)

            grouped = df #.drop('line_number_by_page', axis=1)
        
        # Add filename column
        grouped['file'] = os.path.basename(file_path)
        
        all_data.append(grouped)
    
    if not all_data:
        raise ValueError("No valid CSV files were processed")
    
    # Combine all dataframes
    combined_df = pd.concat(all_data, ignore_index=True)
    
    # Reorder columns
    combined_df = combined_df[['file', 'page', 'text']]

    output_combined_file_path = output_folder + "combined_ocr_output_files.csv"
    combined_df.to_csv(output_combined_file_path, index=None)

    output_files.append(output_combined_file_path)
    
    return combined_df, output_files
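
# --- Example usage (illustrative sketch only) ---
# A hedged sketch of calling combine_ocr_output_text on hypothetical OCR CSV exports;
# the file names below are assumptions and not part of this module. Each CSV needs at
# least 'page' and 'text' columns.
#
# combined_df, combined_paths = combine_ocr_output_text(
#     ["output/doc_a_ocr_output.csv", "output/doc_b_ocr_output.csv"],
#     combine_pages=True,
# )
# # combined_df has one row per (file, page) with the page text joined together;
# # with combine_pages=False there is one row per OCR line, and 'page' becomes the
# # zero-padded page+line identifier parsed by _parse_page_line_id further down.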

def process_data(df:pd.DataFrame, column:str):
    '''
    Clean and stem text columns in a data frame
    '''
    
    def _clean_text(raw_text):
        # Remove HTML tags
        clean = re.sub(r'<.*?>', '', raw_text)
        clean = ' '.join(clean.split())
        # Join the cleaned words back into a string
        return clean

    # Function to apply lemmatisation and remove stopwords
    def _apply_lemmatization(text):
        doc = nlp(text)
        # Keep only alphabetic tokens and remove stopwords
        lemmatized_words = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
        return ' '.join(lemmatized_words)
    
    df['text_clean'] = df[column].apply(_clean_text)

    df['text_clean'] = df['text_clean'].apply(_apply_lemmatization)
    
    return df
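
# --- Example (illustrative sketch only) ---
# A hedged illustration of what process_data adds: a 'text_clean' column with HTML tags
# stripped, stopwords removed and tokens lemmatised by the loaded spaCy model. The sample
# sentence is invented purely for demonstration.
#
# sample = pd.DataFrame({"text": ["<p>The contracts were signed on the same day.</p>"]})
# sample = process_data(sample, "text")
# # sample['text_clean'] would contain something like "contract sign day"
# # (the exact tokens depend on the en_core_web_lg model version).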

def map_metadata_single_page(similarity_df:pd.DataFrame, metadata_source_df:pd.DataFrame, preview_length:int=200):
    """Helper to map metadata for single page results."""
    metadata_df = metadata_source_df[['file', 'page', 'text']]
    results_df = similarity_df.merge(metadata_df, left_on='Page1_Index', right_index=True)\
                            .rename(columns={'file': 'Page1_File', 'page': 'Page1_Page', 'text': 'Page1_Text'})
    results_df = results_df.merge(metadata_df, left_on='Page2_Index', right_index=True, suffixes=('_1', '_2'))\
                            .rename(columns={'file': 'Page2_File', 'page': 'Page2_Page', 'text': 'Page2_Text'})
    results_df["Similarity_Score"] = results_df["Similarity_Score"].round(3)
    final_df = results_df[['Page1_File', 'Page1_Page', 'Page2_File', 'Page2_Page', 'Similarity_Score', 'Page1_Text', 'Page2_Text']]
    final_df = final_df.sort_values(["Page1_File", "Page1_Page", "Page2_File", "Page2_Page"])
    final_df['Page1_Text'] = final_df['Page1_Text'].str[:preview_length]
    final_df['Page2_Text'] = final_df['Page2_Text'].str[:preview_length]
    return final_df

def map_metadata_subdocument(subdocument_df:pd.DataFrame, metadata_source_df:pd.DataFrame, preview_length:int=200):
    """Helper to map metadata for subdocument results."""
    metadata_df = metadata_source_df[['file', 'page', 'text']]
    
    subdocument_df = subdocument_df.merge(metadata_df, left_on='Page1_Start_Index', right_index=True)\
                                   .rename(columns={'file': 'Page1_File', 'page': 'Page1_Start_Page', 'text': 'Page1_Text'})
    subdocument_df = subdocument_df.merge(metadata_df[['page']], left_on='Page1_End_Index', right_index=True)\
                                   .rename(columns={'page': 'Page1_End_Page'})
    subdocument_df = subdocument_df.merge(metadata_df, left_on='Page2_Start_Index', right_index=True)\
                                   .rename(columns={'file': 'Page2_File', 'page': 'Page2_Start_Page', 'text': 'Page2_Text'})
    subdocument_df = subdocument_df.merge(metadata_df[['page']], left_on='Page2_End_Index', right_index=True)\
                                   .rename(columns={'page': 'Page2_End_Page'})

    cols = ['Page1_File', 'Page1_Start_Page', 'Page1_End_Page',
            'Page2_File', 'Page2_Start_Page', 'Page2_End_Page',
            'Match_Length', 'Page1_Text', 'Page2_Text']
            
    # Add Avg_Similarity if it exists (it won't for greedy match unless we add it)
    if 'Avg_Similarity' in subdocument_df.columns:
        subdocument_df['Avg_Similarity'] = subdocument_df['Avg_Similarity'].round(3)
        cols.insert(7, 'Avg_Similarity')

    final_df = subdocument_df[cols]
    final_df = final_df.sort_values(['Page1_File', 'Page1_Start_Page', 'Page2_File', 'Page2_Start_Page'])
    final_df['Page1_Text'] = final_df['Page1_Text'].str[:preview_length]
    final_df['Page2_Text'] = final_df['Page2_Text'].str[:preview_length]

    return final_df

def save_results_and_redaction_lists(final_df: pd.DataFrame, output_folder: str, combine_pages:bool = True) -> list:
    """
    Saves the main results DataFrame and generates per-file redaction lists.
    This function is extracted to be reusable.

    Args:
        final_df (pd.DataFrame): The DataFrame containing the final match results.
        output_folder (str): The folder to save the output files.
        combine_pages (bool, optional): Whether page text was combined into a single block per page (True), or the duplicate match was conducted line by line (False).

    Returns:
        list: A list of paths to all generated files.
    """
    output_paths = []
    output_folder_path = Path(output_folder)
    output_folder_path.mkdir(exist_ok=True)

    if final_df.empty:
        print("No matches to save.")
        return []

    # 1. Save the main results DataFrame
    similarity_file_output_path = output_folder_path / 'page_similarity_results.csv'
    final_df.to_csv(similarity_file_output_path, index=False)

    output_paths.append(str(similarity_file_output_path))
    print(f"Main results saved to {similarity_file_output_path}")

    # 2. Save per-file redaction lists
    # Use 'Page2_File' as the source of duplicate content
    if combine_pages == True:
        grouping_col = 'Page2_File'
        if grouping_col not in final_df.columns:
            print("Warning: 'Page2_File' column not found. Cannot generate redaction lists.")
            return output_paths

        for redact_file, group in final_df.groupby(grouping_col):
            output_file_name_stem = Path(redact_file).stem
            output_file_path = output_folder_path / f"{output_file_name_stem}_pages_to_redact.csv"
            
            all_pages_to_redact = set()
            is_subdocument_match = 'Page2_Start_Page' in group.columns

            if is_subdocument_match:
                for _, row in group.iterrows():
                    pages_in_range = range(int(row['Page2_Start_Page']), int(row['Page2_End_Page']) + 1)
                    all_pages_to_redact.update(pages_in_range)
            else:
                pages = group['Page2_Page'].unique()
                all_pages_to_redact.update(pages)
            
            if all_pages_to_redact:
                redaction_df = pd.DataFrame(sorted(list(all_pages_to_redact)), columns=['Page_to_Redact'])
                redaction_df.to_csv(output_file_path, header=False, index=False)

                output_paths.append(str(output_file_path))
                print(f"Redaction list for {redact_file} saved to {output_file_path}")
            
    return output_paths
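
# --- Example usage (illustrative sketch only) ---
# A hedged sketch of saving results for a match DataFrame already produced by
# map_metadata_single_page or map_metadata_subdocument; "./output/" is an assumption.
#
# saved_paths = save_results_and_redaction_lists(final_df, "./output/", combine_pages=True)
# # saved_paths[0] -> "./output/page_similarity_results.csv" (the full match table)
# # remaining entries -> one "<file>_pages_to_redact.csv" per duplicated source file:
# #                      a headerless, single-column list of page numbers, the same
# #                      format read back by apply_whole_page_redactions_from_list.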

def identify_similar_pages(
    df_combined: pd.DataFrame,
    similarity_threshold: float = 0.9,
    min_word_count: int = 10,
    min_consecutive_pages: int = 1,
    greedy_match: bool = False,
    combine_pages:bool=True,
    output_folder: str = OUTPUT_FOLDER,
    progress=Progress(track_tqdm=True)
) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
    """
    Identifies similar pages with three possible strategies:
    1. Single Page: If greedy_match=False and min_consecutive_pages=1.
    2. Fixed-Length Subdocument: If greedy_match=False and min_consecutive_pages > 1.
    3. Greedy Consecutive Match: If greedy_match=True.
    """

    output_paths = []
    progress(0.1, desc="Processing and filtering text")
    df = process_data(df_combined, 'text')
    df['word_count'] = df['text_clean'].str.split().str.len().fillna(0)
    original_row_count = len(df)
    df_filtered = df[df['word_count'] >= min_word_count].copy()
    df_filtered.reset_index(drop=True, inplace=True)
    
    print(f"Filtered out {original_row_count - len(df_filtered)} pages with fewer than {min_word_count} words.")

    if len(df_filtered) < 2:
        return pd.DataFrame(), [], df_combined
        
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df_filtered['text_clean'])

    progress(0.3, desc="Calculating text similarity")
    similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False)
    coo_matrix = similarity_matrix.tocoo()
    
    # Create a DataFrame of all individual page pairs above the threshold.
    # This is the base for all three matching strategies.
    similar_pages = [
        (r, c, v) for r, c, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data)
        if r < c and v >= similarity_threshold
    ]

    if not similar_pages:
        return pd.DataFrame(), [], df_combined
    
    base_similarity_df = pd.DataFrame(similar_pages, columns=['Page1_Index', 'Page2_Index', 'Similarity_Score'])

    progress(0.6, desc="Aggregating results based on matching strategy")
    
    if greedy_match:
        print("Finding matches using greedy consecutive strategy.")
        
        # A set of pairs for fast lookups of (page1_idx, page2_idx)
        valid_pairs_set = set(zip(base_similarity_df['Page1_Index'], base_similarity_df['Page2_Index']))
        
        # Keep track of indices that have been used in a sequence
        consumed_indices_1 = set()
        consumed_indices_2 = set()
        
        all_sequences = []

        # Iterate through all potential starting pairs, sorted for consistent results
        sorted_pairs = base_similarity_df.sort_values(['Page1_Index', 'Page2_Index'])

        for _, row in sorted_pairs.iterrows():
            start_idx1, start_idx2 = int(row['Page1_Index']), int(row['Page2_Index'])
            
            # If this pair has already been consumed by a previous sequence, skip it
            if start_idx1 in consumed_indices_1 or start_idx2 in consumed_indices_2:
                continue

            # This is a new sequence, start expanding it
            current_sequence = [(start_idx1, start_idx2)]
            k = 1
            while True:
                next_idx1 = start_idx1 + k
                next_idx2 = start_idx2 + k
                
                # Check if the next pair in the sequence is a valid match
                if (next_idx1, next_idx2) in valid_pairs_set and \
                   next_idx1 not in consumed_indices_1 and \
                   next_idx2 not in consumed_indices_2:
                    current_sequence.append((next_idx1, next_idx2))
                    k += 1
                else:
                    # The sequence has ended
                    break
            
            # Record the found sequence and mark all its pages as consumed
            sequence_indices_1 = [p[0] for p in current_sequence]
            sequence_indices_2 = [p[1] for p in current_sequence]
            
            all_sequences.append({
                'Page1_Start_Index': sequence_indices_1[0], 'Page1_End_Index': sequence_indices_1[-1],
                'Page2_Start_Index': sequence_indices_2[0], 'Page2_End_Index': sequence_indices_2[-1],
                'Match_Length': len(current_sequence)
            })

            consumed_indices_1.update(sequence_indices_1)
            consumed_indices_2.update(sequence_indices_2)

        if not all_sequences:
            return pd.DataFrame(), [], df_combined

        subdocument_df = pd.DataFrame(all_sequences)
        # We can add back the average similarity if needed, but it requires more lookups.
        # For now, we'll omit it for simplicity in the greedy approach.
        # ... (The rest is metadata mapping, same as the subdocument case)

    elif min_consecutive_pages > 1:
        # --- STRATEGY 2: Fixed-Length Subdocument Matching ---
        print(f"Finding consecutive page matches (min_consecutive_pages > 1)")
        similarity_df = base_similarity_df.copy()
        similarity_df.sort_values(['Page1_Index', 'Page2_Index'], inplace=True)
        is_consecutive = (similarity_df['Page1_Index'].diff() == 1) & (similarity_df['Page2_Index'].diff() == 1)
        block_id = is_consecutive.eq(False).cumsum()
        grouped = similarity_df.groupby(block_id)
        agg_results = grouped.agg(
            Page1_Start_Index=('Page1_Index', 'first'), Page2_Start_Index=('Page2_Index', 'first'),
            Page1_End_Index=('Page1_Index', 'last'), Page2_End_Index=('Page2_Index', 'last'),
            Match_Length=('Page1_Index', 'size'), Avg_Similarity=('Similarity_Score', 'mean')
        ).reset_index(drop=True)
        subdocument_df = agg_results[agg_results['Match_Length'] >= min_consecutive_pages].copy()
        if subdocument_df.empty: return pd.DataFrame(), [], df_combined

    else:
        # --- STRATEGY 1: Single Page Matching ---
        print(f"Finding single page matches (min_consecutive_pages=1)")
        final_df = map_metadata_single_page(base_similarity_df, df_filtered)
        # The rest of the logic (saving files) is handled after this if/else block
        pass # The final_df is already prepared

    # --- Map metadata and format output ---
    # This block now handles the output for both subdocument strategies (2 and 3)
    if greedy_match or min_consecutive_pages > 1:
        final_df = map_metadata_subdocument(subdocument_df, df_filtered)
    
    progress(0.8, desc="Saving output files")
    
    output_paths = save_results_and_redaction_lists(final_df, output_folder, combine_pages)

    return final_df, output_paths, df_combined
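
# --- Example usage (illustrative sketch only) ---
# A hedged sketch of the three matching strategies described in the docstring above,
# assuming df_combined came from combine_ocr_output_text; the parameter values are invented.
#
# # 1. Single-page matches only
# results, paths, full_df = identify_similar_pages(df_combined, similarity_threshold=0.9,
#                                                  min_consecutive_pages=1, greedy_match=False)
# # 2. Fixed-length subdocument matches (runs of at least 3 consecutive similar pages)
# results, paths, full_df = identify_similar_pages(df_combined, similarity_threshold=0.9,
#                                                  min_consecutive_pages=3, greedy_match=False)
# # 3. Greedy consecutive matching (sequences extended for as long as the page pairs match)
# results, paths, full_df = identify_similar_pages(df_combined, greedy_match=True)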

# ==============================================================================
# GRADIO HELPER FUNCTIONS
# ==============================================================================

# full_data:pd.DataFrame, 
def handle_selection_and_preview(evt: gr.SelectData, results_df:pd.DataFrame, full_duplicate_data_by_file: dict):
    """
    This single function handles a user selecting a row. It:
    1. Determines the selected row index.
    2. Calls the show_page_previews function to get the text data.
    3. Returns all the necessary outputs for the UI.
    """
    # If the user deselects, the event might be None.
    if not evt:
        return None, None, None # Clear state and both preview panes

    # 1. Get the selected index
    selected_index = evt.index[0]

    # 2. Get the preview data
    page1_data, page2_data = show_page_previews(full_duplicate_data_by_file, results_df, evt)

    # 3. Return all three outputs in the correct order
    return selected_index, page1_data, page2_data

def exclude_match(results_df:pd.DataFrame, selected_index:int, output_folder="./output/"):
    """
    Removes a selected row from the results DataFrame, regenerates output files,
    and clears the text preview panes.
    """
    if selected_index is None:
        gr.Warning("No match selected. Please click on a row in the table first.")
        # Return the original dataframe and update=False for the files
        return results_df, gr.update(), None, None
    
    if results_df.empty:
        gr.Warning("No duplicate page results found, nothing to exclude.")
        return results_df, gr.update(), None, None

    # Drop the selected row
    updated_df = results_df.drop(selected_index).reset_index(drop=True)
    
    # Recalculate all output files using the helper function
    new_output_paths = save_results_and_redaction_lists(updated_df, output_folder)
        
    gr.Info(f"Match at row {selected_index} excluded. Output files have been updated.")
    
    # Return the updated dataframe, the new file list, and clear the preview panes
    return updated_df, new_output_paths, None, None

def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:int, min_consecutive:int, greedy_match:bool, combine_pages:bool=True, preview_length:int=500, progress=gr.Progress(track_tqdm=True)):
    """
    Wrapper function updated to include the 'greedy_match' boolean.
    """
    if not files:
        gr.Warning("Please upload files to analyze.")
        return None, None, None
        
    progress(0, desc="Combining input files...")
    df_combined, _ = combine_ocr_output_text(files, combine_pages=combine_pages)

    if df_combined.empty:
        gr.Warning("No data found in the uploaded files.")
        return None, None, None

    # Call the main analysis function with the new parameter
    results_df, output_paths, full_df = identify_similar_pages(
        df_combined=df_combined,
        similarity_threshold=threshold,
        min_word_count=min_words,
        min_consecutive_pages=int(min_consecutive),
        greedy_match=greedy_match,
        combine_pages=combine_pages,
        progress=progress
    )

    # Clip text to the first preview_length characters
    full_df['text'] = full_df['text'].str[:preview_length]

    # Preprocess full_data (without preview text) for fast access (run once)
    full_data_by_file = {
        file: df.sort_values('page').set_index('page')
        for file, df in full_df.drop(["text_clean"], axis=1).groupby('file')
    }

    if results_df.empty:
        gr.Info(f"No duplicate pages found, no results returned.")
    
    return results_df, output_paths, full_data_by_file # full_df, 

def show_page_previews(full_data_by_file: dict, results_df: pd.DataFrame, evt: gr.SelectData, preview_length:int=500):
    """
    Optimized version using pre-partitioned and indexed full_data.
    Triggered when a user selects a row in the results DataFrame.
    """
    if not full_data_by_file or results_df is None or not evt:
        return None, None

    selected_row = results_df.iloc[evt.index[0], :]

    is_subdocument_match = 'Page1_Start_Page' in selected_row

    if is_subdocument_match:
        file1, start1, end1 = selected_row['Page1_File'], selected_row['Page1_Start_Page'], selected_row['Page1_End_Page']
        file2, start2, end2 = selected_row['Page2_File'], selected_row['Page2_Start_Page'], selected_row['Page2_End_Page']

        page1_data = full_data_by_file[file1].loc[start1:end1, ['text']].reset_index()
        page2_data = full_data_by_file[file2].loc[start2:end2, ['text']].reset_index()

    else:
        file1, page1 = selected_row['Page1_File'], selected_row['Page1_Page']
        file2, page2 = selected_row['Page2_File'], selected_row['Page2_Page']

        page1_data = full_data_by_file[file1].loc[[page1], ['text']].reset_index()
        page2_data = full_data_by_file[file2].loc[[page2], ['text']].reset_index()

    page1_data['text'] = page1_data['text'].str[:preview_length]
    page2_data['text'] = page2_data['text'].str[:preview_length]

    return page1_data[['page', 'text']], page2_data[['page', 'text']]
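
# --- Gradio wiring sketch (illustrative only) ---
# A hedged, minimal sketch of how the helpers above might be wired inside a gr.Blocks app.
# All component names and labels below are assumptions; the real app layout lives elsewhere.
#
# with gr.Blocks() as demo:
#     in_files = gr.File(file_count="multiple", label="OCR output CSVs")
#     threshold = gr.Slider(0.5, 1.0, value=0.9, label="Similarity threshold")
#     min_words = gr.Number(value=10, label="Minimum word count per page")
#     min_consecutive = gr.Number(value=1, label="Minimum consecutive pages")
#     greedy = gr.Checkbox(value=False, label="Greedy consecutive matching")
#     run_btn = gr.Button("Find duplicate pages")
#
#     results_table = gr.Dataframe(label="Duplicate matches")
#     output_files = gr.File(label="Output files")
#     page1_preview = gr.Dataframe(label="Page 1 text")
#     page2_preview = gr.Dataframe(label="Page 2 text")
#     full_data_state = gr.State()
#     selected_row_state = gr.State()
#
#     run_btn.click(run_duplicate_analysis,
#                   inputs=[in_files, threshold, min_words, min_consecutive, greedy],
#                   outputs=[results_table, output_files, full_data_state])
#     results_table.select(handle_selection_and_preview,
#                          inputs=[results_table, full_data_state],
#                          outputs=[selected_row_state, page1_preview, page2_preview])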

def get_page_image_info(page_num: int, page_sizes: List[Dict]) -> Optional[Dict]:
    """
    Finds and returns the size and path information for a specific page.
    """
    return next((size for size in page_sizes if size["page"] == page_num), None)

def add_new_annotations_to_existing_page_annotations(
    all_annotations: List[Dict],
    image_path: str,
    new_annotation_boxes: List[Dict]
) -> Tuple[List[Dict], Dict]:
    """
    Adds a list of new annotation boxes to the annotations for a specific page.

    If the page already has annotations, it extends the list of boxes. If not,
    it creates a new entry for the page.

    Args:
        all_annotations (List[Dict]): The current list of all annotation groups.
        image_path (str): The identifier for the image/page.
        new_annotation_boxes (List[Dict]): A list of new annotation boxes to add.

    Returns:
        Tuple[List[Dict], Dict]: A tuple containing:
            - The updated list of all annotation groups.
            - The annotation group representing the newly added boxes.
    """
    # Find the annotation group for the current page/image
    current_page_group = next(
        (annot_group for annot_group in all_annotations if annot_group["image"] == image_path),
        None
    )

    if current_page_group:
        # Page already has annotations, so extend the list with the new boxes
        current_page_group["boxes"].extend(new_annotation_boxes)
    else:
        # This is the first set of annotations for this page, create a new group
        new_group = {
            "image": image_path,
            "boxes": new_annotation_boxes
        }
        all_annotations.append(new_group)

    # This object represents all annotations that were just added for this page
    newly_added_annotation_group = {
        "image": image_path,
        "boxes": new_annotation_boxes
    }

    return all_annotations, newly_added_annotation_group
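
# --- Example (illustrative sketch only) ---
# A hedged example of the annotation structure this helper works with. The image path and
# box values are invented; in practice boxes come from redact_whole_pymupdf_page or from
# create_annotation_objects_from_duplicates further down.
#
# all_annotations = [{"image": "output/doc_a_0.png", "boxes": []}]
# new_boxes = [{"label": "Duplicate text", "color": (0, 0, 0),
#               "xmin": 0.1, "ymin": 0.2, "xmax": 0.9, "ymax": 0.25,
#               "text": "example line", "id": ""}]
# all_annotations, added_group = add_new_annotations_to_existing_page_annotations(
#     all_annotations, "output/doc_a_0.png", new_boxes
# )
# # The existing group for that image is extended in place, and the same new boxes are
# # also returned separately as added_group.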

def apply_whole_page_redactions_from_list(duplicate_page_numbers_df: pd.DataFrame, doc_file_name_with_extension_textbox: str, review_file_state: pd.DataFrame, duplicate_output_paths: list[str], pymupdf_doc: object, page_sizes: list[dict], all_existing_annotations: list[dict], combine_pages:bool=True, new_annotations_with_bounding_boxes:List[dict]=[]):
    '''
    Take a list of suggested whole pages to redact and apply it to review file data.
    '''
    all_annotations = all_existing_annotations.copy()

    if not pymupdf_doc:
        message = "No document file currently under review."
        print(f"Warning: {message}")
        raise Warning(message)

    list_whole_pages_to_redact = []    

    if combine_pages == True:
        # Get the list of pages to redact from either the dataframe or a matching output file
        whole_pages_list = pd.DataFrame()  # Initialize empty DataFrame so the check below is always safe

        if not duplicate_page_numbers_df.empty:
            list_whole_pages_to_redact = duplicate_page_numbers_df.iloc[:, 0].tolist()
        elif duplicate_output_paths:
            expected_duplicate_pages_to_redact_name = f"{doc_file_name_with_extension_textbox}"

            for output_file in duplicate_output_paths:
                # output_file may be a plain string path or an object with a .name attribute
                file_name_from_path = output_file.split('/')[-1] if isinstance(output_file, str) else output_file.name
                if expected_duplicate_pages_to_redact_name in file_name_from_path:
                    whole_pages_list = pd.read_csv(output_file, header=None)
                    break
        else:
            message = "No relevant list of whole pages to redact found."
            print(message)
            raise Warning(message)

        if not whole_pages_list.empty:
            list_whole_pages_to_redact = whole_pages_list.iloc[:, 0].tolist()

        list_whole_pages_to_redact = list(set(list_whole_pages_to_redact))

    else:
        if not new_annotations_with_bounding_boxes:
            message = "Can't find any new annotations to add"
            print(message)
            raise Warning(message)
        
        list_whole_pages_to_redact = []
        for annotation in new_annotations_with_bounding_boxes:
            match = re.search(r'_(\d+)\.png$', annotation["image"])
            if match:
                page = int(match.group(1)) + 1
                list_whole_pages_to_redact.append(page)
            else:
                print(f"Warning: Could not extract page number from {annotation['image']}")

        list_whole_pages_to_redact = list(set(list_whole_pages_to_redact))

    
    new_annotations = []
    # Process each page for redaction
    for page in list_whole_pages_to_redact:
        try:
            page_num = int(page)
            page_index = page_num - 1
            if not (0 <= page_index < len(pymupdf_doc)):
                print(f"Page {page_num} is out of bounds, skipping.")
                continue

            page_info = get_page_image_info(page_num, page_sizes)
            if not page_info:
                print(f"Page {page_num} not found in page_sizes, skipping.")
                continue

            image_path = page_info["image_path"]
            page_annotation_group = next((g for g in all_annotations if g["image"] == image_path), None)
            if page_annotation_group and any(box["label"] == "Whole page" for box in page_annotation_group["boxes"]):
                print(f"Whole page redaction for page {page_num} already exists, skipping.")
                continue
                
            # --- Create a LIST of boxes to add.---
            boxes_to_add = []
            
            pymupdf_page = pymupdf_doc[page_index]

            if combine_pages == True:
                whole_page_box = redact_whole_pymupdf_page(
                    rect_height=page_info["cropbox_height"],
                    rect_width=page_info["cropbox_width"],
                    page=pymupdf_page, border=0.005, redact_pdf=False
                )
                boxes_to_add.append(whole_page_box)
            else:
                # Find the specific annotation group that matches the current page's image path
                relevant_box_group = next(
                    (group for group in new_annotations_with_bounding_boxes if group.get('image') == image_path), 
                    None  # Default to None if no match is found
                )
                
                # Check if we found a matching group of boxes for this page
                if relevant_box_group:
                    boxes_to_add.extend(relevant_box_group['boxes'])
                else:
                    # This case would be unexpected, but it's good to handle.
                    # It means a page was in list_whole_pages_to_redact but had no
                    # corresponding boxes generated in new_annotations_with_bounding_boxes.
                    print(f"Warning: No new annotation boxes found for page {page_num} ({image_path}).")
            
            # === Use the modified helper function to add a LIST of boxes ===
            all_annotations, new_annotations_for_page = add_new_annotations_to_existing_page_annotations(
                all_annotations=all_annotations,
                image_path=image_path,
                new_annotation_boxes=boxes_to_add  # Pass the list here
            )

            new_annotations_for_page = fill_missing_box_ids_each_box(new_annotations_for_page)
            new_annotations.append(new_annotations_for_page)

        except Exception as e:
            print(f"Error processing page {page}: {str(e)}")
            continue

    whole_page_review_file = convert_annotation_data_to_dataframe(new_annotations)

    if whole_page_review_file.empty:
        message = "No new whole page redactions were added."
        print(message)
        gr.Info(message)
        return review_file_state, all_annotations

    expected_cols = ['image', 'page', 'label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
    for col in expected_cols:
        if col not in review_file_state.columns: review_file_state[col] = pd.NA
        if col not in whole_page_review_file.columns: whole_page_review_file[col] = pd.NA

    review_file_out = pd.concat([review_file_state, whole_page_review_file], ignore_index=True)
    review_file_out = review_file_out.sort_values(by=["page", "ymin", "xmin"]).reset_index(drop=True)
    review_file_out = review_file_out.drop_duplicates(subset=['page', 'label', 'text', 'id'], keep='first')
    
    out_message = "Successfully created whole page redactions."
    print(out_message)
    gr.Info(out_message)

    return review_file_out, all_annotations


# --- 1. Helper Function to Parse the Combined Page/Line ID ---
def _parse_page_line_id(combined_id: int) -> Tuple[int, int]:
    """
    Parses a combined page and line number ID into a (page, line) tuple.
    Assumes the ID is a 10-digit number where the first 5 are the page
    and the last 5 are the line number.
    
    Example: 100027 -> (1, 27)
             200005 -> (2, 5)
    """
    # zfill ensures the string is padded with leading zeros to 10 characters
    s_id = str(combined_id).zfill(10)
    page = int(s_id[:5])
    line = int(s_id[5:])
    return page, line
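
# --- Example (illustrative sketch only) ---
# The combined ID is the page number and line number each zero-padded to 5 digits, as
# built in combine_ocr_output_text when combine_pages=False, so parsing is the inverse:
#
# assert _parse_page_line_id(100027) == (1, 27)
# assert _parse_page_line_id(200005) == (2, 5)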

# def create_annotations_from_ocr_outputs(ocr_results_df_lines_to_annotate:pd.DataFrame):
#     '''
#     Create a set of annotation boxes based on selected ocr_results_df lines. 
#     '''
#     annotations_by_page = []

#     # --- Build Annotation Boxes for each selected line ---
#     for _, line_row in ocr_results_df_lines_to_annotate.iterrows():
#         # The coordinates are relative, so xmax = left + width and ymax = top + height
#         box = {
#             "label": "Similar Text", # Or any other label you prefer
#             "xmin": line_row['left'],
#             "ymin": line_row['top'] + line_row['height'],
#             "xmax": line_row['left'] + line_row['width'],
#             "ymax": line_row['top'] ,
#             "text": line_row['text']
#         }
#         # --- 6. Group the box by its page number ---
#         page_number = line_row['page']
#         annotations_by_page[page_number].append(box)

#     return annotations_by_page

# def create_annotation_objects_from_duplicates(
#     duplicates_df: pd.DataFrame, 
#     ocr_results_df: pd.DataFrame,
#     combine_pages:bool=False
# ) -> List[Dict]:
#     """
#     Creates structured annotation objects from selected ocr outputs.

#     Args:
#         duplicates_df (pd.DataFrame): DataFrame containing duplicate ranges with
#                                       columns like 'Page2_Start_Page' and 'Page2_End_Page'.
#         ocr_results_df (pd.DataFrame): DataFrame with OCR results, including columns
#                                        'page', 'text', 'left', 'top', 'width', 'height'.

#     Returns:
#         List[Dict]: A list of dictionaries, where each dict represents a page and its
#                     list of annotation boxes, in the format:
#                     [{"page": 1, "boxes": [...]}, {"page": 2, "boxes": [...]}]
#     """
#     annotations_by_page = []

#     if combine_pages == False:

#         # --- 2. Prepare OCR Data: Add a line number column if it doesn't exist ---
#         if 'line_number_by_page' not in ocr_results_df.columns:
#             print("Generating 'line_number_by_page' for ocr_results_df...")
#             # Sort by page and original position to ensure correct line numbering
#             ocr_results_df = ocr_results_df.sort_values(by=['page', 'top', 'left']).reset_index(drop=True)
#             ocr_results_df['line_number_by_page'] = ocr_results_df.groupby('page').cumcount() + 1
            
#         # Use defaultdict to easily append to lists for each page
#         annotations_by_page = defaultdict(list)

#         # --- 3. Iterate through each duplicate range ---
#         for _, row in duplicates_df.iterrows():
#             # Parse the start and end page/line numbers from the duplicate row
#             start_page, start_line = _parse_page_line_id(row['Page2_Start_Page'])
#             end_page, end_line = _parse_page_line_id(row['Page2_End_Page'])
            
#             # --- 4. Select OCR Lines based on the range ---
#             # This logic correctly handles ranges within a single page and across multiple pages
#             if start_page == end_page:
#                 # Simple case: the range is on a single page
#                 condition = (
#                     (ocr_results_df['page'] == start_page) &
#                     (ocr_results_df['line_number_by_page'].between(start_line, end_line))
#                 )
#             else:
#                 # Complex case: the range spans multiple pages
#                 # Condition for the first page in the range
#                 cond_start = (ocr_results_df['page'] == start_page) & (ocr_results_df['line_number_by_page'] >= start_line)
#                 # Condition for all pages between the start and end
#                 cond_middle = ocr_results_df['page'].between(start_page + 1, end_page - 1)
#                 # Condition for the last page in the range
#                 cond_end = (ocr_results_df['page'] == end_page) & (ocr_results_df['line_number_by_page'] <= end_line)
                
#                 condition = cond_start | cond_middle | cond_end

#             lines_to_annotate = ocr_results_df[condition]        
            
#             annotations_by_page = create_annotations_from_ocr_outputs(lines_to_annotate)
                
#         # --- Format the final output list ---
#         final_output = []
#         # Sort by page number for a predictable order
#         for page, boxes in sorted(annotations_by_page.items()):
#             final_output.append({
#                 "page": page,
#                 "boxes": boxes
#             })
        
#     return final_output

def create_annotation_objects_from_duplicates(
    duplicates_df: pd.DataFrame, 
    ocr_results_df: pd.DataFrame,
    page_sizes: List[Dict],
    combine_pages:bool=False
) -> List[Dict]:
    """
    Creates structured annotation objects from duplicate line ranges, mapping
    page numbers to image paths.

    Args:
        duplicates_df (pd.DataFrame): DataFrame with duplicate ranges.
        ocr_results_df (pd.DataFrame): DataFrame with OCR results.
        page_sizes (List[Dict]): A list of dictionaries mapping page numbers to image paths and other metadata. Expected format: [{"page": 1, "image_path": "path/to/img.png", ...}]
        combine_pages (bool): A boolean that determines whether in previous functions, all text from a page was combined (True). This function will only run if this is False.

    Returns:
        List[Dict]: A list of dictionaries, where each dict represents a page and its list of annotation boxes, in the format: [{"image": "path/to/img.png", "boxes": [...]}, ...]
    """
    final_output = []

    if combine_pages == False:
        # --- NEW: Create an efficient lookup map from page number to image path ---
        page_to_image_map = {item['page']: item['image_path'] for item in page_sizes}

        # Prepare OCR Data: Add a line number column if it doesn't exist
        if 'line_number_by_page' not in ocr_results_df.columns:
            ocr_results_df = ocr_results_df.sort_values(by=['page', 'top', 'left']).reset_index(drop=True)
            ocr_results_df['line_number_by_page'] = ocr_results_df.groupby('page').cumcount() + 1
            
        annotations_by_page = defaultdict(list)

        # Iterate through each duplicate range (this logic is unchanged)
        for _, row in duplicates_df.iterrows():
            start_page, start_line = _parse_page_line_id(row['Page2_Start_Page'])
            end_page, end_line = _parse_page_line_id(row['Page2_End_Page'])
            
            # Select OCR Lines based on the range (this logic is unchanged)
            if start_page == end_page:
                condition = (
                    (ocr_results_df['page'] == start_page) &
                    (ocr_results_df['line_number_by_page'].between(start_line, end_line))
                )
            else:
                cond_start = (ocr_results_df['page'] == start_page) & (ocr_results_df['line_number_by_page'] >= start_line)
                cond_middle = ocr_results_df['page'].between(start_page + 1, end_page - 1)
                cond_end = (ocr_results_df['page'] == end_page) & (ocr_results_df['line_number_by_page'] <= end_line)
                condition = cond_start | cond_middle | cond_end

            lines_to_annotate = ocr_results_df[condition]

            # Build and group annotation boxes by page number (this logic is unchanged)
            for _, line_row in lines_to_annotate.iterrows():
                box = {
                    "label": "Duplicate text",
                    "color": (0,0,0),
                    "xmin": line_row['left'],
                    "ymin": line_row['top'],
                    "xmax": line_row['left'] + line_row['width'],
                    "ymax": line_row['top'] + line_row['height'],
                    "text": line_row['text'],
                    "id": "" # to be filled in after
                }
                page_number = line_row['page']

                
                annotations_by_page[page_number].append(box)

        print("annotations_by_page:", annotations_by_page) 
                
        # --- Format the final output list using the page-to-image map ---
        final_output = []
        # Sort by page number for a predictable order
        for page_num, boxes in sorted(annotations_by_page.items()):
            # Look up the image path using the page number
            image_path = page_to_image_map.get(page_num)
            
            if image_path:
                page_boxes = {
                    "image": image_path,
                    "boxes": boxes
                }

                # Fill in missing IDs for the new data entries
                page_boxes = fill_missing_box_ids_each_box(page_boxes)

                # Add the annotation group using 'image' as the key
                final_output.append(page_boxes)
            else:
                # Handle cases where a page might not have a corresponding image path
                print(f"Warning: Page {page_num} found in OCR data but has no corresponding "
                    f"entry in the 'page_sizes' object. This page's annotations will be skipped.")
                
    print("final_output:", final_output)
        
    return final_output

# --- Example Usage ---

# 1. Create your example DataFrames
# duplicates_data = {
#     'Page1_File': ['doc_a.csv'],
#     'Page1_Start_Page': [100009],
#     'Page1_End_Page': [100021],
#     'Page2_File': ['doc_a.csv'],
#     'Page2_Start_Page': [100027], # Page 1, Line 27
#     'Page2_End_Page': [200005],   # Page 2, Line 5
# }
# duplicates_df = pd.DataFrame(duplicates_data)

# ocr_data = {
#     'page': [1]*30 + [2]*10, # 30 lines on page 1, 10 on page 2
#     'text': [f"Text on page {p}, line {l}" for p in [1, 2] for l in range(1, (31 if p==1 else 11))],
#     # Example coordinates (using small, consistent values for demonstration)
#     'left': [0.1] * 40,
#     'top': [i*0.02 for i in range(30)] + [i*0.02 for i in range(10)],
#     'width': [0.8] * 40,
#     'height': [0.015] * 40,
# }
# ocr_results_df = pd.DataFrame(ocr_data)


# # 2. Run the function
# generated_annotations = create_annotation_objects_from_duplicates(duplicates_df, ocr_results_df)

# # 3. Print the result
# import json
# print(json.dumps(generated_annotations, indent=2))