samyak152002 committed on
Commit
961b876
·
verified ·
1 Parent(s): d368b7f

Create pdf_processing.py

Browse files
Files changed (1) hide show
  1. pdf_processing.py +82 -0
pdf_processing.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # pdf_processing.py
2
+ import fitz # PyMuPDF
3
+ import pymupdf4llm
4
+ import os
5
+ import tempfile
6
+ import traceback
7
+ from typing import Tuple, Optional, List, Dict, Any
8
+
9
def convert_rect_to_dict(rect: fitz.Rect) -> Optional[Dict[str, float]]:
    """Return the coordinates and size of a ``fitz.Rect`` as a plain dict.

    Args:
        rect: The rectangle to convert.

    Returns:
        A dict with the keys ``x0``, ``y0``, ``x1``, ``y1``, ``width`` and
        ``height`` read from ``rect``, or ``None`` (after printing a warning)
        when ``rect`` is falsy (e.g. ``None``) or is not a ``fitz.Rect``.
    """
    # Truthiness is tested before the type, so falsy inputs never reach
    # the isinstance() call.
    is_usable = bool(rect) and isinstance(rect, fitz.Rect)
    if not is_usable:
        print(f"Warning: Invalid rect object received: {rect}")
        return None

    # Attribute names double as the output keys; order is preserved.
    exported_fields = ("x0", "y0", "x1", "y1", "width", "height")
    return {field: getattr(rect, field) for field in exported_fields}
22
+
23
def try_map_issues_to_page_rects(
    issues_to_map_for_context: List[Dict[str, Any]],
    pdf_rects: List[fitz.Rect],
    page_number_for_mapping: int  # 1-based page number
) -> int:
    """Pair issues with PDF rectangles by position and record the coordinates.

    Issue ``i`` is paired with rectangle ``i``; pairing stops at the end of
    the shorter list. Issues whose ``'is_mapped_to_pdf'`` flag is already
    truthy are skipped (their positional rectangle is not reused for another
    issue). Each issue that gets mapped is mutated in place: it receives
    ``'pdf_coordinates_list'`` (a one-element list holding the coordinate
    dict), ``'is_mapped_to_pdf' = True`` and ``'mapped_page_number'``.

    Args:
        issues_to_map_for_context: Issue dicts; each must already contain the
            ``'is_mapped_to_pdf'`` key, and ``'context_text'`` is read when a
            warning has to be printed.
        pdf_rects: Candidate rectangles, in the same order as the issues.
        page_number_for_mapping: Page number stored on every mapped issue.

    Returns:
        The number of issues newly mapped by this call.
    """
    newly_mapped = 0

    # zip() ends with the shorter sequence, so surplus issues or rectangles
    # are simply left untouched.
    for issue, rect in zip(issues_to_map_for_context, pdf_rects):
        if issue['is_mapped_to_pdf']:
            continue

        coords = convert_rect_to_dict(rect)
        if not coords:
            print(f" Warning: Could not convert rect for context '{issue['context_text'][:30]}...' on page {page_number_for_mapping}")
            continue

        issue['pdf_coordinates_list'] = [coords]
        issue['is_mapped_to_pdf'] = True
        issue['mapped_page_number'] = page_number_for_mapping
        newly_mapped += 1

    return newly_mapped
48
+
49
def extract_pdf_text(file_input: Any) -> str:
    """Extract the full text of a PDF as Markdown using pymupdf4llm.

    Args:
        file_input: Either a filesystem path (``str`` or ``os.PathLike``) or
            a file-like object exposing ``read()`` and ``seek()`` that yields
            the PDF bytes. File-like input is copied to a temporary ``.pdf``
            file, which is removed again before this function returns.

    Returns:
        The Markdown produced by ``pymupdf4llm.to_markdown``. On any failure
        (unsupported input type, unreadable PDF, I/O error, ...) the error
        and its traceback are printed and an empty string is returned; no
        exception propagates to the caller.
    """
    temp_file_path_for_pymupdf4llm = None
    try:
        if isinstance(file_input, (str, os.PathLike)):
            actual_path_to_process = os.fspath(file_input)
        elif callable(getattr(file_input, "read", None)):
            # delete=False because the file must outlive this handle so it
            # can be reopened by path; the `with` guarantees the handle is
            # closed even when seek/read/write raises.
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file_obj:
                # Record the name first so `finally` can clean up on failure.
                temp_file_path_for_pymupdf4llm = temp_file_obj.name
                file_input.seek(0)
                temp_file_obj.write(file_input.read())
            actual_path_to_process = temp_file_path_for_pymupdf4llm
        else:
            raise ValueError("Input 'file_input' must be a file path (str) or a file-like object.")

        # Opened only to report the page count; the context manager closes
        # the document even if counting fails.
        with fitz.open(actual_path_to_process) as doc_for_page_count:
            page_count = len(doc_for_page_count)
        print(f"PDF has {page_count} pages. Extracting Markdown using pymupdf4llm.")

        markdown_text = pymupdf4llm.to_markdown(actual_path_to_process)

        print(f"Total extracted Markdown text length: {len(markdown_text)} characters.")
        return markdown_text

    except Exception as e:
        # Deliberate best-effort boundary: callers get "" instead of an error.
        print(f"Error extracting text from PDF: {str(e)}")
        traceback.print_exc()
        return ""
    finally:
        if temp_file_path_for_pymupdf4llm:
            try:
                os.remove(temp_file_path_for_pymupdf4llm)
            except FileNotFoundError:
                # Already gone; nothing to clean up.
                pass
            except OSError as cleanup_error:
                # A failed cleanup must not replace the function's result.
                print(f"Warning: Could not remove temporary file {temp_file_path_for_pymupdf4llm}: {cleanup_error}")