File size: 8,387 Bytes
961b876
 
 
 
 
0918bea
eb20090
961b876
0918bea
961b876
 
0918bea
961b876
 
0918bea
 
961b876
 
 
 
 
0918bea
961b876
 
0918bea
961b876
 
0918bea
961b876
 
 
0918bea
961b876
 
 
 
 
2c6cadb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eb20090
2c6cadb
 
 
 
eb20090
 
961b876
2c6cadb
0918bea
eb20090
2c6cadb
eb20090
 
 
0918bea
2c6cadb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eb20090
2c6cadb
eb20090
2c6cadb
eb20090
 
 
 
2c6cadb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0918bea
2c6cadb
0918bea
 
 
2c6cadb
eb20090
0918bea
 
 
 
 
 
 
 
 
2c6cadb
0918bea
961b876
0918bea
961b876
 
0918bea
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# pdf_processing.py
import fitz  # PyMuPDF
import pymupdf4llm
import os
import traceback
from typing import Any, Dict, List # Use standard List, Dict
from collections import Counter

def convert_rect_to_dict(rect: fitz.Rect) -> Dict[str, float] | None:
    """Return the coordinates and size of a ``fitz.Rect`` as a plain dict.

    The resulting dict has the keys ``x0``, ``y0``, ``x1``, ``y1``,
    ``width`` and ``height``, each read from the attribute of the same name.

    Returns ``None`` when ``rect`` is falsy or is not a ``fitz.Rect``
    instance.
    """
    is_usable = bool(rect) and isinstance(rect, fitz.Rect)
    if not is_usable:
        return None

    attribute_names = ("x0", "y0", "x1", "y1", "width", "height")
    return {name: getattr(rect, name) for name in attribute_names}

def try_map_issues_to_page_rects(
    issues_to_map_for_context: List[Dict[str, Any]],
    pdf_rects: List[fitz.Rect],
    page_number_for_mapping: int
) -> int:
    """Pair issues with rectangles by position and record the match on each issue.

    The i-th issue is paired with the i-th rectangle; pairing stops at the
    end of the shorter list.  An issue whose ``'is_mapped_to_pdf'`` value is
    already truthy is skipped (its rectangle is not reused for another
    issue).  For every other pair whose rectangle converts to a coordinate
    dict, the issue is updated in place with:

    * ``'pdf_coordinates_list'`` - a one-element list holding that dict,
    * ``'is_mapped_to_pdf'`` - ``True``,
    * ``'mapped_page_number'`` - ``page_number_for_mapping``.

    Returns the number of issues updated by this call.
    """
    newly_mapped = 0
    for issue, rect in zip(issues_to_map_for_context, pdf_rects):
        if issue['is_mapped_to_pdf']:
            continue

        coordinates = convert_rect_to_dict(rect)
        if not coordinates:
            # Rectangle could not be converted; leave the issue unmapped.
            continue

        issue['pdf_coordinates_list'] = [coordinates]
        issue['is_mapped_to_pdf'] = True
        issue['mapped_page_number'] = page_number_for_mapping
        newly_mapped += 1

    return newly_mapped


import fitz  # PyMuPDF
import os
import traceback
from typing import Any, Dict, List
from collections import Counter


# Helper functions such as convert_rect_to_dict are defined earlier in this module.

import fitz  # PyMuPDF
import os
import traceback
from typing import Any, Dict, List  # Use standard List, Dict
from collections import Counter


# The helpers defined above (convert_rect_to_dict, try_map_issues_to_page_rects) stay in module scope for any other code that needs them.

def extract_majority_font_text_directly(pdf_path: str) -> str:
    """
    Extracts text from PDF, identifies the majority font and size,
    and then directly assembles a plain text string containing only the text
    that matches this majority font, attempting to preserve basic structure.
    This method does NOT create an intermediate PDF document.
    """
    original_doc = None
    try:
        # 1. Open PDF and Perform Font Analysis (similar to before)
        original_doc = fitz.open(pdf_path)
        if not original_doc.page_count:
            print("FontFilter (Direct): PDF has no pages.")
            return ""

        font_char_counts: Counter = Counter()
        pdf_basename = os.path.basename(pdf_path)
        print(f"FontFilter (Direct): Analyzing fonts in '{pdf_basename}' ({original_doc.page_count} pages)...")

        # First pass: Analyze fonts to find the majority
        for page_num_analysis in range(original_doc.page_count):
            page_analysis = original_doc[page_num_analysis]
            # Using TEXTFLAGS_TEXT for potentially cleaner text from spans
            text_dict_analysis = page_analysis.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)
            for block_analysis in text_dict_analysis.get("blocks", []):
                if block_analysis.get("type") == 0:  # type 0 is a text block
                    for line_analysis in block_analysis.get("lines", []):
                        for span_analysis in line_analysis.get("spans", []):
                            font_name = span_analysis["font"]
                            font_size = span_analysis.get("size")
                            if font_size is None: continue  # Skip if size is not available

                            font_size_rounded = int(round(font_size))
                            text = span_analysis["text"]
                            if not text.strip(): continue  # Skip purely whitespace spans

                            font_char_counts[(font_name, font_size_rounded)] += len(text)

        if not font_char_counts:
            print("FontFilter (Direct): No text with font information found in PDF.")
            return ""

        majority_font_tuple_info = font_char_counts.most_common(1)[0]
        (majority_font_name, majority_font_size_rounded) = majority_font_tuple_info[0]
        char_count_for_majority = majority_font_tuple_info[1]
        print(
            f"FontFilter (Direct): Majority font identified: Name='{majority_font_name}', RoundedSize={majority_font_size_rounded}pt ({char_count_for_majority} chars).")

        # 2. Second Pass: Extract and Assemble Text Based on Majority Font
        print(
            f"FontFilter (Direct): Extracting text matching majority font (Name='{majority_font_name}', RoundedSize={majority_font_size_rounded}pt)...")
        all_pages_collected_text = []  # List to hold text from each page (as a list of block texts)

        for page_num_extraction in range(original_doc.page_count):
            page = original_doc[page_num_extraction]
            # Using flags for potentially better whitespace and ligature handling in extracted text
            text_page_dict = page.get_text("dict",
                                           flags=fitz.TEXTFLAGS_TEXT | fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE)

            page_blocks_text_parts = []  # Collect text from blocks on this page

            for block in text_page_dict.get("blocks", []):
                if block.get("type") == 0:  # Text block
                    current_block_lines_text_parts = []
                    for line in block.get("lines", []):
                        current_line_spans_text_parts = []
                        for span in line.get("spans", []):
                            # Check if this span matches the majority font
                            current_span_font_name = span["font"]
                            current_span_font_size = span.get("size")

                            if current_span_font_size is not None and \
                                    current_span_font_name == majority_font_name and \
                                    int(round(current_span_font_size)) == majority_font_size_rounded:
                                current_line_spans_text_parts.append(span["text"])

                        if current_line_spans_text_parts:
                            # Join text from selected spans within a line with a single space
                            line_text = " ".join(current_line_spans_text_parts)
                            current_block_lines_text_parts.append(line_text)

                    if current_block_lines_text_parts:
                        # Join lines within a block with a single newline
                        block_text = "\n".join(current_block_lines_text_parts)
                        page_blocks_text_parts.append(block_text)

            if page_blocks_text_parts:
                # Join blocks on a page with a double newline (simulating paragraph breaks)
                all_pages_collected_text.append("\n\n".join(page_blocks_text_parts))

        if not all_pages_collected_text:
            print("FontFilter (Direct): No text matching the majority font was found to extract.")
            return ""

        # Join text from all pages.
        # A page break is already handled by the \n\n between blocks of different pages.
        # If more distinct page separation is needed, a custom separator could be added here.
        final_text = "\n\n".join(all_pages_collected_text)
        print(f"FontFilter (Direct): Successfully extracted text. Total length: {len(final_text)} characters.")
        return final_text

    except Exception as e:
        print(f"Error in extract_majority_font_text_directly for '{pdf_path}': {e}\n{traceback.format_exc()}")
        return ""
    finally:
        if original_doc: original_doc.close()


def extract_plain_text_from_original_pdf(pdf_path: str) -> str:
    """
    Extract raw plain text from the PDF at pdf_path without any filtering.

    Each page's ``get_text("text")`` output is concatenated in page order
    with no extra separator.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page text, or an empty string if any exception
        occurs (the error and its traceback are printed rather than raised).
    """
    doc_orig_text = None
    try:
        doc_orig_text = fitz.open(pdf_path)
        # The former debug print of the per-page list was dropped: it dumped
        # the entire document text to stdout on every call.
        return "".join(page.get_text("text") for page in doc_orig_text)
    except Exception as e:
        print(f"Error extracting plain text from original PDF '{pdf_path}': {e}\n{traceback.format_exc()}")
        return ""
    finally:
        # Compare against None explicitly: the truthiness of a Document goes
        # through len(doc) (its page count), so `if doc_orig_text:` would
        # skip close() for a zero-page document.
        if doc_orig_text is not None:
            doc_orig_text.close()