File size: 6,361 Bytes
961b876
 
 
 
 
0918bea
eb20090
961b876
0918bea
961b876
 
0918bea
961b876
 
0918bea
 
961b876
 
 
 
 
0918bea
961b876
 
0918bea
961b876
 
0918bea
961b876
 
 
0918bea
961b876
 
 
 
 
0918bea
eb20090
0918bea
 
 
eb20090
 
0918bea
961b876
0918bea
eb20090
0918bea
eb20090
 
0918bea
eb20090
 
0918bea
 
eb20090
 
 
 
0918bea
eb20090
 
 
 
 
 
0918bea
eb20090
 
0918bea
eb20090
 
 
0918bea
eb20090
0918bea
eb20090
 
 
 
 
0918bea
eb20090
 
0918bea
eb20090
 
 
 
 
 
 
 
 
 
 
 
 
 
0918bea
 
 
 
961b876
0918bea
 
eb20090
0918bea
eb20090
 
0918bea
961b876
0918bea
961b876
0918bea
 
 
 
 
 
eb20090
0918bea
 
 
 
 
 
 
 
 
 
 
961b876
0918bea
961b876
 
0918bea
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# pdf_processing.py
import fitz  # PyMuPDF
import pymupdf4llm
import os
import traceback
from typing import Any, Dict, List # Use standard List, Dict
from collections import Counter

def convert_rect_to_dict(rect: fitz.Rect) -> Dict[str, float] | None:
    """Return the geometry of a ``fitz.Rect`` as a plain dictionary.

    The result holds the corner coordinates (``x0``, ``y0``, ``x1``, ``y1``)
    followed by ``width`` and ``height``. ``None`` is returned when ``rect``
    is falsy or is not a ``fitz.Rect`` instance.
    """
    # Truthiness is checked before the type, so any falsy value is rejected
    # first regardless of what it is.
    if not (rect and isinstance(rect, fitz.Rect)):
        return None

    geometry_fields = ("x0", "y0", "x1", "y1", "width", "height")
    return {field: getattr(rect, field) for field in geometry_fields}

def try_map_issues_to_page_rects(
    issues_to_map_for_context: List[Dict[str, Any]],
    pdf_rects: List[fitz.Rect],
    page_number_for_mapping: int
) -> int:
    """Pair issues with page rectangles by position and record the coordinates.

    The issue at index ``i`` is matched with the rectangle at index ``i``;
    pairing stops at the end of the shorter list. An issue whose
    ``'is_mapped_to_pdf'`` flag is already truthy is left untouched, and the
    rectangle at that position is not reused for another issue. Each newly
    mapped issue is updated in place with ``'pdf_coordinates_list'`` (a
    one-element list), ``'is_mapped_to_pdf' = True`` and
    ``'mapped_page_number'``.

    Returns:
        The number of issues that were mapped by this call.
    """
    newly_mapped = 0
    for issue, rect in zip(issues_to_map_for_context, pdf_rects):
        if issue['is_mapped_to_pdf']:
            continue

        coords = convert_rect_to_dict(rect)
        if not coords:
            # The rectangle could not be converted; the issue stays unmapped.
            continue

        issue['pdf_coordinates_list'] = [coords]
        issue['is_mapped_to_pdf'] = True
        issue['mapped_page_number'] = page_number_for_mapping
        newly_mapped += 1

    return newly_mapped

def extract_font_filtered_markdown(pdf_path: str) -> str:
    """
    Extract the majority-font text of a PDF and return it as Markdown.

    The (font name, rounded font size) pair that accounts for the most
    characters across the whole document is treated as the majority font.
    Only spans in that font are re-inserted, at their original bounding
    boxes, into a new in-memory PDF with the same page dimensions; that PDF
    is then converted to Markdown using PyMuPDF4LLM.

    Args:
        pdf_path: Path to the PDF file to process.

    Returns:
        The Markdown text, or "" when the PDF has no pages, contains no text
        spans, or an error occurs (the error and traceback are printed).
    """
    original_doc = None
    new_doc = None
    try:
        original_doc = fitz.open(pdf_path)
        page_count = original_doc.page_count
        if not page_count:
            print("FontFilter: PDF has no pages.")
            return ""

        # One bucket of spans per page, so the rebuild pass below only visits
        # the spans of the page it is writing instead of rescanning them all.
        spans_by_page: List[List[Dict[str, Any]]] = [[] for _ in range(page_count)]
        font_char_counts: Counter = Counter()

        pdf_basename = os.path.basename(pdf_path)
        print(f"FontFilter: Analyzing fonts in '{pdf_basename}' ({page_count} pages)...")
        for page_num in range(page_count):
            text_dict = original_doc[page_num].get_text("dict")
            for block in text_dict.get("blocks", []):
                if block.get("type") != 0:
                    continue
                for line in block.get("lines", []):
                    for span in line.get("spans", []):
                        text = span["text"]
                        # Sizes are rounded only for grouping; the exact size
                        # is kept for rendering.
                        font_key = (span["font"], int(round(span["size"])))
                        spans_by_page[page_num].append({
                            "text": text,
                            "font_key": font_key,
                            "original_font_size": span["size"],
                            "bbox": span["bbox"],
                        })
                        font_char_counts[font_key] += len(text)

        if not font_char_counts:
            print("FontFilter: No text with font information found in PDF.")
            return ""

        majority_font_key, char_count = font_char_counts.most_common(1)[0]
        majority_font_name, majority_font_size_rounded = majority_font_key
        print(f"FontFilter: Majority font: Name='{majority_font_name}', RoundedSize={majority_font_size_rounded}pt ({char_count} chars).")

        new_doc = fitz.Document()
        for page_num, page_spans in enumerate(spans_by_page):
            source_rect = original_doc[page_num].rect
            new_pdf_page = new_doc.new_page(width=source_rect.width,
                                            height=source_rect.height)
            for span_data in page_spans:
                if span_data["font_key"] != majority_font_key:
                    continue
                # NOTE(review): the return value of insert_textbox is ignored;
                # per the PyMuPDF docs it signals text that did not fit the
                # box. Text is re-rendered in "helv", so fit may differ from
                # the original font - confirm that dropped spans are acceptable.
                new_pdf_page.insert_textbox(
                    fitz.Rect(span_data["bbox"]),
                    span_data["text"],
                    fontsize=span_data["original_font_size"],
                    fontname="helv",
                    align=0,
                )

        # new_doc holds one page per source page and page_count >= 1 here,
        # so it is never empty at this point.
        return pymupdf4llm.to_markdown(new_doc)
    except Exception as e:
        print(f"Error in extract_font_filtered_markdown for '{pdf_path}': {e}\n{traceback.format_exc()}")
        return ""
    finally:
        # Compare against None rather than relying on truthiness: a document's
        # length is its page count, so an open zero-page document is falsy and
        # would otherwise never be closed.
        if original_doc is not None:
            original_doc.close()
        if new_doc is not None:
            new_doc.close()

def extract_plain_text_from_original_pdf(pdf_path: str) -> str:
    """
    Extract raw plain text from the PDF at pdf_path without any filtering.

    The text of every page is concatenated in page order with no separator
    added between pages.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        The concatenated page text, or "" if an error occurs (the error and
        traceback are printed).
    """
    doc_orig_text = None
    try:
        doc_orig_text = fitz.open(pdf_path)
        return "".join(page.get_text("text") for page in doc_orig_text)
    except Exception as e:
        print(f"Error extracting plain text from original PDF '{pdf_path}': {e}\n{traceback.format_exc()}")
        return ""
    finally:
        # Compare against None rather than relying on truthiness: a document's
        # length is its page count, so an open zero-page document is falsy and
        # would otherwise never be closed.
        if doc_orig_text is not None:
            doc_orig_text.close()