File size: 6,361 Bytes
961b876 0918bea eb20090 961b876 0918bea 961b876 0918bea 961b876 0918bea 961b876 0918bea 961b876 0918bea 961b876 0918bea 961b876 0918bea 961b876 0918bea eb20090 0918bea eb20090 0918bea 961b876 0918bea eb20090 0918bea eb20090 0918bea eb20090 0918bea eb20090 0918bea eb20090 0918bea eb20090 0918bea eb20090 0918bea eb20090 0918bea eb20090 0918bea eb20090 0918bea eb20090 0918bea 961b876 0918bea eb20090 0918bea eb20090 0918bea 961b876 0918bea 961b876 0918bea eb20090 0918bea 961b876 0918bea 961b876 0918bea |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
# pdf_processing.py
import fitz # PyMuPDF
import pymupdf4llm
import os
import traceback
from typing import Any, Dict, List # Use standard List, Dict
from collections import Counter
def convert_rect_to_dict(rect: fitz.Rect) -> Dict[str, float] | None:
    """Return the geometry of a fitz.Rect as a plain dictionary.

    Args:
        rect: Rectangle to convert.

    Returns:
        A dict holding the rectangle's "x0", "y0", "x1", "y1", "width" and
        "height" attributes, or None when ``rect`` is falsy or is not a
        fitz.Rect instance.
    """
    # Truthiness is tested before the type check, so falsy inputs such as
    # None are rejected without ever reaching isinstance().
    if rect and isinstance(rect, fitz.Rect):
        field_names = ("x0", "y0", "x1", "y1", "width", "height")
        return {name: getattr(rect, name) for name in field_names}
    return None
def try_map_issues_to_page_rects(
    issues_to_map_for_context: List[Dict[str, Any]],
    pdf_rects: List[fitz.Rect],
    page_number_for_mapping: int
) -> int:
    """Attach PDF rectangles to issues by position and count the new mappings.

    The i-th issue is paired with the i-th rectangle; pairing stops at the
    end of the shorter list. Issues whose 'is_mapped_to_pdf' flag is already
    truthy are skipped (their paired rectangle is not reused for another
    issue). Issue dictionaries are updated in place.

    Args:
        issues_to_map_for_context: Issue dicts; each must already contain
            the 'is_mapped_to_pdf' key.
        pdf_rects: Rectangles to assign, in the same order as the issues.
        page_number_for_mapping: Value stored under 'mapped_page_number'
            on every issue mapped by this call.

    Returns:
        The number of issues that were mapped by this call.
    """
    newly_mapped = 0
    for issue, rect in zip(issues_to_map_for_context, pdf_rects):
        if issue['is_mapped_to_pdf']:
            continue
        coords = convert_rect_to_dict(rect)
        if not coords:
            # Rectangle could not be converted; leave the issue unmapped.
            continue
        issue['pdf_coordinates_list'] = [coords]
        issue['is_mapped_to_pdf'] = True
        issue['mapped_page_number'] = page_number_for_mapping
        newly_mapped += 1
    return newly_mapped
def extract_font_filtered_markdown(pdf_path: str) -> str:
    """
    Extracts text from PDF at pdf_path, filters by majority font,
    builds a new PDF in memory, and converts it to Markdown using PyMuPDF4LLM.
    Expects pdf_path to be a valid path to a PDF file.

    The "majority font" is the (font name, rounded font size) pair that
    accounts for the most characters over all text spans of the document.
    Only spans in that font are copied, at their original bounding boxes,
    onto blank pages of the same size in a new in-memory document, rendered
    with the built-in "helv" font.

    Args:
        pdf_path: Path of the PDF file to analyze.

    Returns:
        The Markdown produced by pymupdf4llm.to_markdown for the filtered
        document, or "" when the PDF has no pages, no text spans are found,
        or any exception is raised (the error and traceback are printed).
    """
    original_doc = None
    new_doc = None
    try:
        original_doc = fitz.open(pdf_path)
        page_count = original_doc.page_count
        if not page_count:
            print("FontFilter: PDF has no pages.")
            return ""

        # One bucket of span records per page, so rebuilding a page does not
        # require rescanning every span of the whole document.
        spans_by_page: List[List[Dict[str, Any]]] = [[] for _ in range(page_count)]
        font_char_counts: Counter = Counter()
        pdf_basename = os.path.basename(pdf_path)
        print(f"FontFilter: Analyzing fonts in '{pdf_basename}' ({original_doc.page_count} pages)...")

        for page_num in range(page_count):
            page = original_doc[page_num]
            text_dict = page.get_text("dict")
            for block in text_dict.get("blocks", []):
                if block.get("type") != 0:
                    # Only type-0 blocks are processed; other block types
                    # are ignored.
                    continue
                for line in block.get("lines", []):
                    for span in line.get("spans", []):
                        text = span["text"]
                        font_key = (span["font"], int(round(span["size"])))
                        spans_by_page[page_num].append({
                            "text": text,
                            "font_key": font_key,
                            "original_font_size": span["size"],
                            "bbox": span["bbox"],
                        })
                        font_char_counts[font_key] += len(text)

        if not font_char_counts:
            print("FontFilter: No text with font information found in PDF.")
            return ""

        majority_font_key, char_count = font_char_counts.most_common(1)[0]
        majority_font_name, majority_font_size_rounded = majority_font_key
        print(f"FontFilter: Majority font: Name='{majority_font_name}', RoundedSize={majority_font_size_rounded}pt ({char_count} chars).")

        new_doc = fitz.Document()
        unfit_span_count = 0
        for p_num, page_spans in enumerate(spans_by_page):
            original_page_for_dim = original_doc[p_num]
            new_pdf_page = new_doc.new_page(width=original_page_for_dim.rect.width,
                                            height=original_page_for_dim.rect.height)
            for span_data in page_spans:
                if span_data["font_key"] != majority_font_key:
                    continue
                insertion_result = new_pdf_page.insert_textbox(
                    fitz.Rect(span_data["bbox"]), span_data["text"],
                    fontsize=span_data["original_font_size"],
                    fontname="helv", align=0
                )
                # A negative result means the text did not fit the box and
                # nothing was written; count it instead of dropping silently.
                if insertion_result < 0:
                    unfit_span_count += 1

        if unfit_span_count:
            print(f"FontFilter: {unfit_span_count} majority-font span(s) did not fit their original box and were not written.")

        markdown_text = ""
        if new_doc.page_count > 0:
            markdown_text = pymupdf4llm.to_markdown(new_doc)
        else:
            print("FontFilter: The new PDF (filtered) is empty. No markdown generated.")
        return markdown_text
    except Exception as e:
        print(f"Error in extract_font_filtered_markdown for '{pdf_path}': {e}\n{traceback.format_exc()}")
        return ""
    finally:
        # Compare against None instead of relying on truthiness: a document's
        # truth value follows its length (page count), so a zero-page
        # document would be skipped here and never closed.
        if original_doc is not None:
            original_doc.close()
        if new_doc is not None:
            new_doc.close()
def extract_plain_text_from_original_pdf(pdf_path: str) -> str:
    """
    Extracts raw plain text from the PDF at pdf_path without any filtering.
    Expects pdf_path to be a valid path to a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The "text" extraction of every page, concatenated in page order with
        no separator added, or "" if any exception is raised (the error and
        traceback are printed).
    """
    doc_orig_text = None
    try:
        doc_orig_text = fitz.open(pdf_path)
        full_text_parts = [page.get_text("text") for page in doc_orig_text]
        return "".join(full_text_parts)
    except Exception as e:
        print(f"Error extracting plain text from original PDF '{pdf_path}': {e}\n{traceback.format_exc()}")
        return ""
    finally:
        # Compare against None instead of relying on truthiness: a document's
        # truth value follows its length (page count), so a zero-page
        # document would be skipped here and never closed.
        if doc_orig_text is not None:
            doc_orig_text.close()