Samyak-Meesho committed on
Commit
2c6cadb
·
1 Parent(s): b690306

changed code

Browse files
Files changed (3) hide show
  1. language_checker.py +38 -6
  2. main_analyzer.py +3 -22
  3. pdf_processing.py +107 -65
language_checker.py CHANGED
@@ -62,25 +62,57 @@ def perform_language_checks(markdown_text_from_filtered_pdf: str) -> Dict[str, A
62
 
63
  lt_issues_in_range = 0
64
  for idx, match in enumerate(raw_lt_matches):
65
- if match.ruleId == "EN_SPLIT_WORDS_HYPHEN": continue # Common rule to ignore
66
-
67
  if not (content_start_index <= match.offset < content_end_index):
68
  continue
69
- lt_issues_in_range +=1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
- context_str = text_for_lt_analysis[match.offset : match.offset + match.errorLength]
72
  processed_lt_issues.append({
73
  '_internal_id': f"lt_{idx}",
74
  'ruleId': match.ruleId,
75
  'message': match.message,
76
- 'context_text': context_str,
77
  'offset_in_text': match.offset,
78
  'error_length': match.errorLength,
79
  'replacements_suggestion': match.replacements[:3] if match.replacements else [],
80
  'category_name': match.category,
81
  'source_check_type': 'LanguageTool',
82
  'is_mapped_to_pdf': False,
83
- 'pdf_coordinates_list': [],
84
  'mapped_page_number': -1
85
  })
86
  print(f"LT_Checker: LanguageTool found {len(raw_lt_matches)} raw issues, {lt_issues_in_range} issues within defined content range of its text.")
 
62
 
63
  lt_issues_in_range = 0
64
  for idx, match in enumerate(raw_lt_matches):
65
+ if match.ruleId == "EN_SPLIT_WORDS_HYPHEN": continue # Common rule to ignore
66
+
67
  if not (content_start_index <= match.offset < content_end_index):
68
  continue
69
+ lt_issues_in_range += 1
70
+
71
+ # Current context extraction:
72
+ # context_str = text_for_lt_analysis[match.offset : match.offset + match.errorLength]
73
+
74
+ # New context extraction for ~10 words:
75
+ words_around = 1 # Number of words to try and get on each side
76
+
77
+ # Text before the error
78
+ pre_error_text = text_for_lt_analysis[:match.offset]
79
+ words_before = pre_error_text.split()[-words_around:]
80
+
81
+ # Text of the error itself
82
+ error_text = text_for_lt_analysis[match.offset: match.offset + match.errorLength]
83
+
84
+ # Text after the error
85
+ post_error_text = text_for_lt_analysis[match.offset + match.errorLength:]
86
+ words_after = post_error_text.split()[:words_around]
87
+
88
+ # Combine to form the new wider context
89
+ context_parts = []
90
+ if words_before:
91
+ context_parts.append(" ".join(words_before))
92
+ context_parts.append(error_text) # The actual error phrase
93
+ if words_after:
94
+ context_parts.append(" ".join(words_after))
95
+
96
+ wider_context_str = " ".join(context_parts)
97
+ # Ensure there's a small buffer around the error to make it ~10 words total if error is short
98
+ # This can be refined further based on average word length or by counting words more precisely.
99
+ # A simpler approach using character offsets could also be used, e.g.:
100
+ # context_start_char = max(0, match.offset - 50) # Approx 50 chars before
101
+ # context_end_char = min(len(text_for_lt_analysis), match.offset + match.errorLength + 50) # Approx 50 chars after
102
+ # wider_context_str = text_for_lt_analysis[context_start_char:context_end_char]
103
 
 
104
  processed_lt_issues.append({
105
  '_internal_id': f"lt_{idx}",
106
  'ruleId': match.ruleId,
107
  'message': match.message,
108
+ 'context_text': wider_context_str, # Use the new wider context
109
  'offset_in_text': match.offset,
110
  'error_length': match.errorLength,
111
  'replacements_suggestion': match.replacements[:3] if match.replacements else [],
112
  'category_name': match.category,
113
  'source_check_type': 'LanguageTool',
114
  'is_mapped_to_pdf': False,
115
+ 'pdf_coordinates_list': [],
116
  'mapped_page_number': -1
117
  })
118
  print(f"LT_Checker: LanguageTool found {len(raw_lt_matches)} raw issues, {lt_issues_in_range} issues within defined content range of its text.")
main_analyzer.py CHANGED
@@ -8,7 +8,7 @@ from typing import Tuple, Dict, Any, List
8
  from collections import defaultdict
9
 
10
  from pdf_processing import (
11
- extract_font_filtered_markdown,
12
  extract_plain_text_from_original_pdf,
13
  try_map_issues_to_page_rects
14
  )
@@ -29,30 +29,10 @@ def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
29
  if isinstance(filepath_or_stream, str):
30
  original_pdf_access_path = filepath_or_stream
31
  print(f"Analyzer: Input is a string path: {original_pdf_access_path}")
32
- # Check for objects like Gradio's NamedString or TemporaryFileWrapper's .name attribute
33
- elif hasattr(filepath_or_stream, 'name') and isinstance(getattr(filepath_or_stream, 'name'), str) and \
34
- os.path.exists(getattr(filepath_or_stream, 'name')): # Ensure the .name path is valid
35
- original_pdf_access_path = filepath_or_stream.name
36
- print(f"Analyzer: Input is an object with .name attribute, using path: {original_pdf_access_path}")
37
- # If this object also has a .read method, it might be a TemporaryFileWrapper.
38
- # The next elif would handle it if we prefer processing it as a stream,
39
- # but using its .name path is usually fine and simpler.
40
- elif hasattr(filepath_or_stream, 'read') and callable(filepath_or_stream.read):
41
- with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file_obj:
42
- temp_file_for_stream_path = temp_file_obj.name
43
- if hasattr(filepath_or_stream, 'seek') and callable(filepath_or_stream.seek):
44
- filepath_or_stream.seek(0)
45
- temp_file_obj.write(filepath_or_stream.read())
46
- original_pdf_access_path = temp_file_for_stream_path
47
- print(f"Analyzer: Input stream saved to temp file: {original_pdf_access_path}")
48
- else:
49
- return {"error": f"Invalid PDF input type: {type(filepath_or_stream)}. Must be path string, an object with a .name attribute as path, or file-like stream object."}, None
50
 
51
  if not original_pdf_access_path or not os.path.exists(original_pdf_access_path):
52
  return {"error": f"PDF path '{original_pdf_access_path}' (derived from input) does not exist or is invalid."}, None
53
 
54
- # --- The rest of the function remains the same as the previous complete listing ---
55
- # 1. Unfiltered Plain Text (for general and regex checks)
56
  print(f"Analyzer: Extracting plain text from original PDF: {original_pdf_access_path}")
57
  raw_unfiltered_plain_text = extract_plain_text_from_original_pdf(original_pdf_access_path)
58
 
@@ -64,7 +44,8 @@ def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
64
 
65
  # 2. Font-Filtered Markdown (for LanguageTool checks)
66
  print(f"Analyzer: Extracting font-filtered markdown from: {original_pdf_access_path}")
67
- markdown_text_from_filtered_pdf = extract_font_filtered_markdown(original_pdf_access_path)
 
68
  if not markdown_text_from_filtered_pdf and pdf_size > 0 :
69
  print("Analyzer: Warning: Font-filtered Markdown extraction yielded empty result.")
70
 
 
8
  from collections import defaultdict
9
 
10
  from pdf_processing import (
11
+ extract_majority_font_text_directly,
12
  extract_plain_text_from_original_pdf,
13
  try_map_issues_to_page_rects
14
  )
 
29
  if isinstance(filepath_or_stream, str):
30
  original_pdf_access_path = filepath_or_stream
31
  print(f"Analyzer: Input is a string path: {original_pdf_access_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
  if not original_pdf_access_path or not os.path.exists(original_pdf_access_path):
34
  return {"error": f"PDF path '{original_pdf_access_path}' (derived from input) does not exist or is invalid."}, None
35
 
 
 
36
  print(f"Analyzer: Extracting plain text from original PDF: {original_pdf_access_path}")
37
  raw_unfiltered_plain_text = extract_plain_text_from_original_pdf(original_pdf_access_path)
38
 
 
44
 
45
  # 2. Font-Filtered Markdown (for LanguageTool checks)
46
  print(f"Analyzer: Extracting font-filtered markdown from: {original_pdf_access_path}")
47
+ markdown_text_from_filtered_pdf = extract_majority_font_text_directly(original_pdf_access_path)
48
+ print("markdown font print kar raha hun", markdown_text_from_filtered_pdf)
49
  if not markdown_text_from_filtered_pdf and pdf_size > 0 :
50
  print("Analyzer: Warning: Font-filtered Markdown extraction yielded empty result.")
51
 
pdf_processing.py CHANGED
@@ -35,90 +35,132 @@ def try_map_issues_to_page_rects(
35
  mapped_count += 1
36
  return mapped_count
37
 
38
- def extract_font_filtered_markdown(pdf_path: str) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  """
40
- Extracts text from PDF at pdf_path, filters by majority font,
41
- builds a new PDF in memory, and converts it to Markdown using PyMuPDF4LLM.
42
- Expects pdf_path to be a valid path to a PDF file.
 
43
  """
44
  original_doc = None
45
- new_doc = None
46
  try:
 
47
  original_doc = fitz.open(pdf_path)
48
  if not original_doc.page_count:
49
- print("FontFilter: PDF has no pages.")
50
  return ""
51
 
52
- all_spans_details: List[Dict[str, Any]] = []
53
  font_char_counts: Counter = Counter()
54
-
55
  pdf_basename = os.path.basename(pdf_path)
56
- print(f"FontFilter: Analyzing fonts in '{pdf_basename}' ({original_doc.page_count} pages)...")
57
- for page_num in range(original_doc.page_count):
58
- page = original_doc[page_num]
59
- text_dict = page.get_text("dict")
60
- for block in text_dict.get("blocks", []):
61
- if block.get("type") == 0:
62
- for line in block.get("lines", []):
63
- for span in line.get("spans", []):
64
- font_name = span["font"]
65
- font_size_rounded = int(round(span["size"]))
66
- text = span["text"]
67
- span_detail = {
68
- "text": text, "font_name": font_name,
69
- "font_size_rounded": font_size_rounded,
70
- "original_font_size": span["size"],
71
- "bbox": span["bbox"], "page_num": page_num
72
- }
73
- all_spans_details.append(span_detail)
 
74
  font_char_counts[(font_name, font_size_rounded)] += len(text)
75
-
76
  if not font_char_counts:
77
- print("FontFilter: No text with font information found in PDF.")
78
  return ""
79
 
80
  majority_font_tuple_info = font_char_counts.most_common(1)[0]
81
  (majority_font_name, majority_font_size_rounded) = majority_font_tuple_info[0]
82
- char_count = majority_font_tuple_info[1]
83
- print(f"FontFilter: Majority font: Name='{majority_font_name}', RoundedSize={majority_font_size_rounded}pt ({char_count} chars).")
84
-
85
- new_doc = fitz.Document()
86
- # print("FontFilter: Constructing new PDF with majority font text...") # Can be verbose
87
- for p_num in range(original_doc.page_count):
88
- original_page_for_dim = original_doc[p_num]
89
- new_pdf_page = new_doc.new_page(width=original_page_for_dim.rect.width,
90
- height=original_page_for_dim.rect.height)
91
- spans_to_write = [
92
- s_detail for s_detail in all_spans_details
93
- if s_detail["page_num"] == p_num and \
94
- s_detail["font_name"] == majority_font_name and \
95
- s_detail["font_size_rounded"] == majority_font_size_rounded
96
- ]
97
- for span_data in spans_to_write:
98
- text_to_insert = span_data["text"]
99
- original_bbox = fitz.Rect(span_data["bbox"])
100
- font_size_for_render = span_data["original_font_size"]
101
- new_pdf_page.insert_textbox(
102
- original_bbox, text_to_insert, fontsize=font_size_for_render,
103
- fontname="helv", align=0
104
- ) # Ignoring insertion_result for brevity here
105
-
106
- # print(f"FontFilter: New PDF constructed with {new_doc.page_count} pages.")
107
- markdown_text = ""
108
- if new_doc.page_count > 0:
109
- # print(f"FontFilter: Converting filtered PDF Document object to Markdown...") # Verbose
110
- markdown_text = pymupdf4llm.to_markdown(new_doc)
111
- else:
112
- print("FontFilter: The new PDF (filtered) is empty. No markdown generated.")
113
-
114
- # print(f"FontFilter: Markdown from filtered PDF length: {len(markdown_text)} chars.")
115
- return markdown_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  except Exception as e:
117
- print(f"Error in extract_font_filtered_markdown for '{pdf_path}': {e}\n{traceback.format_exc()}")
118
  return ""
119
  finally:
120
  if original_doc: original_doc.close()
121
- if new_doc: new_doc.close()
122
 
123
  def extract_plain_text_from_original_pdf(pdf_path: str) -> str:
124
  """
@@ -129,7 +171,7 @@ def extract_plain_text_from_original_pdf(pdf_path: str) -> str:
129
  try:
130
  doc_orig_text = fitz.open(pdf_path)
131
  full_text_parts = [page.get_text("text") for page in doc_orig_text]
132
- # print(f"OriginalTextExtract: Extracted {len(doc_orig_text.page_count)} pages of plain text from '{os.path.basename(pdf_path)}'.")
133
  return "".join(full_text_parts)
134
  except Exception as e:
135
  print(f"Error extracting plain text from original PDF '{pdf_path}': {e}\n{traceback.format_exc()}")
 
35
  mapped_count += 1
36
  return mapped_count
37
 
38
+
39
+ import fitz # PyMuPDF
40
+ import os
41
+ import traceback
42
+ from typing import Any, Dict, List
43
+ from collections import Counter
44
+
45
+
46
+ # Assuming your helper functions (convert_rect_to_dict, etc.) are present if needed elsewhere.
47
+
48
+ import fitz # PyMuPDF
49
+ import os
50
+ import traceback
51
+ from typing import Any, Dict, List # Use standard List, Dict
52
+ from collections import Counter
53
+
54
+
55
+ # Assuming your other helper functions (convert_rect_to_dict, etc.) are in the same scope if needed by other parts of your code.
56
+
57
def _iter_text_blocks(page, flags):
    """Yield the text blocks (block type 0) from ``page.get_text("dict", flags=flags)``."""
    for block in page.get_text("dict", flags=flags).get("blocks", []):
        if block.get("type") == 0:  # type 0 is a text block
            yield block


def _span_font_key(span):
    """Return ``(font name, size rounded to int)`` for *span*, or None if it has no size."""
    size = span.get("size")
    if size is None:
        return None
    return (span["font"], int(round(size)))


def extract_majority_font_text_directly(pdf_path: str) -> str:
    """
    Extract only the text set in the PDF's majority font as a plain string.

    The document is scanned twice. The first pass counts characters per
    (font name, rounded font size) pair, ignoring whitespace-only spans, and
    picks the pair with the highest count. The second pass keeps the spans
    that match that pair and assembles them: spans of a line are joined with
    a single space, lines of a block with a newline, and blocks as well as
    pages with a blank line. No intermediate PDF document is created.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The assembled text, or "" when the PDF has no pages, no span carries
        font information, nothing matches the majority font, or any exception
        is raised while processing (the error is printed, not propagated).
    """
    original_doc = None
    try:
        # 1. Open PDF and perform the font analysis.
        original_doc = fitz.open(pdf_path)
        if not original_doc.page_count:
            print("FontFilter (Direct): PDF has no pages.")
            return ""

        pdf_basename = os.path.basename(pdf_path)
        print(f"FontFilter (Direct): Analyzing fonts in '{pdf_basename}' ({original_doc.page_count} pages)...")

        # First pass: count characters per (font, rounded size).
        font_char_counts: Counter = Counter()
        for page_analysis in original_doc:
            for block_analysis in _iter_text_blocks(page_analysis, fitz.TEXTFLAGS_TEXT):
                for line_analysis in block_analysis.get("lines", []):
                    for span_analysis in line_analysis.get("spans", []):
                        font_key = _span_font_key(span_analysis)
                        text = span_analysis["text"]
                        # Spans without a size or with only whitespace do not vote.
                        if font_key is None or not text.strip():
                            continue
                        font_char_counts[font_key] += len(text)

        if not font_char_counts:
            print("FontFilter (Direct): No text with font information found in PDF.")
            return ""

        majority_font_key, char_count_for_majority = font_char_counts.most_common(1)[0]
        (majority_font_name, majority_font_size_rounded) = majority_font_key
        print(
            f"FontFilter (Direct): Majority font identified: Name='{majority_font_name}', RoundedSize={majority_font_size_rounded}pt ({char_count_for_majority} chars).")

        # 2. Second pass: extract and assemble text in the majority font.
        print(
            f"FontFilter (Direct): Extracting text matching majority font (Name='{majority_font_name}', RoundedSize={majority_font_size_rounded}pt)...")
        # Same flag combination the extraction pass has always used.
        extraction_flags = (
            fitz.TEXTFLAGS_TEXT | fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE
        )
        all_pages_collected_text = []  # One assembled string per non-empty page

        for page in original_doc:
            page_blocks_text_parts = []
            for block in _iter_text_blocks(page, extraction_flags):
                current_block_lines_text_parts = []
                for line in block.get("lines", []):
                    # Whitespace-only spans are skipped here exactly as in the
                    # counting pass; keeping them and then joining with " "
                    # produced runs of spaces and blank-looking lines.
                    current_line_spans_text_parts = [
                        span["text"]
                        for span in line.get("spans", [])
                        if _span_font_key(span) == majority_font_key and span["text"].strip()
                    ]
                    if current_line_spans_text_parts:
                        current_block_lines_text_parts.append(" ".join(current_line_spans_text_parts))

                if current_block_lines_text_parts:
                    page_blocks_text_parts.append("\n".join(current_block_lines_text_parts))

            if page_blocks_text_parts:
                # Blocks are separated by a blank line (simulating paragraph breaks).
                all_pages_collected_text.append("\n\n".join(page_blocks_text_parts))

        if not all_pages_collected_text:
            print("FontFilter (Direct): No text matching the majority font was found to extract.")
            return ""

        # Pages use the same blank-line separator as blocks.
        final_text = "\n\n".join(all_pages_collected_text)
        print(f"FontFilter (Direct): Successfully extracted text. Total length: {len(final_text)} characters.")
        return final_text

    except Exception as e:
        # Best-effort boundary: report the failure and let the caller continue
        # with an empty result.
        print(f"Error in extract_majority_font_text_directly for '{pdf_path}': {e}\n{traceback.format_exc()}")
        return ""
    finally:
        # Compare against None explicitly so closing never depends on the
        # document object's truthiness.
        if original_doc is not None:
            original_doc.close()
163
+
164
 
165
  def extract_plain_text_from_original_pdf(pdf_path: str) -> str:
166
  """
 
171
  try:
172
  doc_orig_text = fitz.open(pdf_path)
173
  full_text_parts = [page.get_text("text") for page in doc_orig_text]
174
+ print(full_text_parts)
175
  return "".join(full_text_parts)
176
  except Exception as e:
177
  print(f"Error extracting plain text from original PDF '{pdf_path}': {e}\n{traceback.format_exc()}")