Samyak-Meesho committed
Commit 2c6cadb · Parent(s): b690306
changed code

Files changed:
- language_checker.py (+38 −6)
- main_analyzer.py (+3 −22)
- pdf_processing.py (+107 −65)
language_checker.py CHANGED

@@ -62,25 +62,57 @@ def perform_language_checks(markdown_text_from_filtered_pdf: str) -> Dict[str, Any]:
 
     lt_issues_in_range = 0
     for idx, match in enumerate(raw_lt_matches):
-        if match.ruleId == "EN_SPLIT_WORDS_HYPHEN": continue
+        if match.ruleId == "EN_SPLIT_WORDS_HYPHEN": continue  # Common rule to ignore
 
         if not (content_start_index <= match.offset < content_end_index):
             continue
-        lt_issues_in_range +=1
-        context_str = text_for_lt_analysis[match.offset : match.offset + match.errorLength]
+        lt_issues_in_range += 1
+
+        # Current context extraction:
+        # context_str = text_for_lt_analysis[match.offset : match.offset + match.errorLength]
+
+        # New context extraction for ~10 words:
+        words_around = 1  # Number of words to try and get on each side
+
+        # Text before the error
+        pre_error_text = text_for_lt_analysis[:match.offset]
+        words_before = pre_error_text.split()[-words_around:]
+
+        # Text of the error itself
+        error_text = text_for_lt_analysis[match.offset: match.offset + match.errorLength]
+
+        # Text after the error
+        post_error_text = text_for_lt_analysis[match.offset + match.errorLength:]
+        words_after = post_error_text.split()[:words_around]
+
+        # Combine to form the new wider context
+        context_parts = []
+        if words_before:
+            context_parts.append(" ".join(words_before))
+        context_parts.append(error_text)  # The actual error phrase
+        if words_after:
+            context_parts.append(" ".join(words_after))
+
+        wider_context_str = " ".join(context_parts)
+        # Ensure there's a small buffer around the error to make it ~10 words total if error is short.
+        # This can be refined further based on average word length or by counting words more precisely.
+        # A simpler approach using character offsets could also be used, e.g.:
+        # context_start_char = max(0, match.offset - 50)  # Approx 50 chars before
+        # context_end_char = min(len(text_for_lt_analysis), match.offset + match.errorLength + 50)  # Approx 50 chars after
+        # wider_context_str = text_for_lt_analysis[context_start_char:context_end_char]
 
         processed_lt_issues.append({
             '_internal_id': f"lt_{idx}",
             'ruleId': match.ruleId,
             'message': match.message,
-            'context_text': context_str,
+            'context_text': wider_context_str,  # Use the new wider context
             'offset_in_text': match.offset,
             'error_length': match.errorLength,
             'replacements_suggestion': match.replacements[:3] if match.replacements else [],
             'category_name': match.category,
             'source_check_type': 'LanguageTool',
             'is_mapped_to_pdf': False,
             'pdf_coordinates_list': [],
             'mapped_page_number': -1
         })
     print(f"LT_Checker: LanguageTool found {len(raw_lt_matches)} raw issues, {lt_issues_in_range} issues within defined content range of its text.")
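The change above replaces the exact-span context with a word window around each match. A minimal standalone sketch of the same word-window logic (the helper name and sample values are hypothetical, not part of the commit):

def wider_context(text: str, offset: int, length: int, words_around: int = 1) -> str:
    """Return the flagged span plus up to `words_around` words on each side."""
    words_before = text[:offset].split()[-words_around:]          # words to the left of the error
    error_text = text[offset:offset + length]                     # the flagged span itself
    words_after = text[offset + length:].split()[:words_around]   # words to the right of the error
    return " ".join([*words_before, error_text, *words_after])

sample = "This is a sentnce with a typo in it."
print(wider_context(sample, 10, 7))  # -> "a sentnce with"

Note that with words_around = 1 the window is three words wide at most; the "~10 words" in the commit's comments would require a larger words_around value.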
main_analyzer.py CHANGED

@@ -8,7 +8,7 @@ from typing import Tuple, Dict, Any, List
 from collections import defaultdict
 
 from pdf_processing import (
-
+    extract_majority_font_text_directly,
     extract_plain_text_from_original_pdf,
     try_map_issues_to_page_rects
 )

@@ -29,30 +29,10 @@ def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
     if isinstance(filepath_or_stream, str):
         original_pdf_access_path = filepath_or_stream
         print(f"Analyzer: Input is a string path: {original_pdf_access_path}")
-    # Check for objects like Gradio's NamedString or TemporaryFileWrapper's .name attribute
-    elif hasattr(filepath_or_stream, 'name') and isinstance(getattr(filepath_or_stream, 'name'), str) and \
-            os.path.exists(getattr(filepath_or_stream, 'name')):  # Ensure the .name path is valid
-        original_pdf_access_path = filepath_or_stream.name
-        print(f"Analyzer: Input is an object with .name attribute, using path: {original_pdf_access_path}")
-        # If this object also has a .read method, it might be a TemporaryFileWrapper.
-        # The next elif would handle it if we prefer processing it as a stream,
-        # but using its .name path is usually fine and simpler.
-    elif hasattr(filepath_or_stream, 'read') and callable(filepath_or_stream.read):
-        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file_obj:
-            temp_file_for_stream_path = temp_file_obj.name
-            if hasattr(filepath_or_stream, 'seek') and callable(filepath_or_stream.seek):
-                filepath_or_stream.seek(0)
-            temp_file_obj.write(filepath_or_stream.read())
-        original_pdf_access_path = temp_file_for_stream_path
-        print(f"Analyzer: Input stream saved to temp file: {original_pdf_access_path}")
-    else:
-        return {"error": f"Invalid PDF input type: {type(filepath_or_stream)}. Must be path string, an object with a .name attribute as path, or file-like stream object."}, None
 
     if not original_pdf_access_path or not os.path.exists(original_pdf_access_path):
         return {"error": f"PDF path '{original_pdf_access_path}' (derived from input) does not exist or is invalid."}, None
 
-    # --- The rest of the function remains the same as the previous complete listing ---
-    # 1. Unfiltered Plain Text (for general and regex checks)
     print(f"Analyzer: Extracting plain text from original PDF: {original_pdf_access_path}")
     raw_unfiltered_plain_text = extract_plain_text_from_original_pdf(original_pdf_access_path)

@@ -64,7 +44,8 @@ def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
 
     # 2. Font-Filtered Markdown (for LanguageTool checks)
     print(f"Analyzer: Extracting font-filtered markdown from: {original_pdf_access_path}")
-    markdown_text_from_filtered_pdf =
+    markdown_text_from_filtered_pdf = extract_majority_font_text_directly(original_pdf_access_path)
+    print("markdown font print kar raha hun", markdown_text_from_filtered_pdf)
    if not markdown_text_from_filtered_pdf and pdf_size > 0 :
         print("Analyzer: Warning: Font-filtered Markdown extraction yielded empty result.")
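With the .name and stream branches removed, analyze_pdf now only recognizes a filesystem path string; any other input presumably leaves original_pdf_access_path unset and falls through to the path-existence check, which returns the error dict. A usage sketch under that assumption (the file name is made up):

from main_analyzer import analyze_pdf

# analyze_pdf returns (results_dict, None); a path string is now the only
# input the isinstance check accepts after this commit.
results, _ = analyze_pdf("example.pdf")  # hypothetical path, not from the repo
if "error" in results:
    print(results["error"])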
pdf_processing.py CHANGED

@@ -35,90 +35,132 @@ def try_map_issues_to_page_rects(
         mapped_count += 1
     return mapped_count
 
-
-… (old function signature and surrounding removed lines were not preserved in this view)
+
+import fitz  # PyMuPDF
+import os
+import traceback
+from typing import Any, Dict, List
+from collections import Counter
+
+
+# Assuming your helper functions (convert_rect_to_dict, etc.) are present if needed elsewhere.
+
+import fitz  # PyMuPDF
+import os
+import traceback
+from typing import Any, Dict, List  # Use standard List, Dict
+from collections import Counter
+
+
+# Assuming your other helper functions (convert_rect_to_dict, etc.) are in the same scope if needed by other parts of your code.
+
+def extract_majority_font_text_directly(pdf_path: str) -> str:
     """
-    Extracts text from PDF
-
-
+    Extracts text from PDF, identifies the majority font and size,
+    and then directly assembles a plain text string containing only the text
+    that matches this majority font, attempting to preserve basic structure.
+    This method does NOT create an intermediate PDF document.
     """
     original_doc = None
-    new_doc = None
     try:
+        # 1. Open PDF and Perform Font Analysis (similar to before)
         original_doc = fitz.open(pdf_path)
         if not original_doc.page_count:
-            print("FontFilter: PDF has no pages.")
+            print("FontFilter (Direct): PDF has no pages.")
             return ""
 
-        all_spans_details: List[Dict[str, Any]] = []
         font_char_counts: Counter = Counter()
-
         pdf_basename = os.path.basename(pdf_path)
-        print(f"FontFilter: Analyzing fonts in '{pdf_basename}' ({original_doc.page_count} pages)...")
-        … (old font-analysis loop: removed lines not preserved in this view)
+        print(f"FontFilter (Direct): Analyzing fonts in '{pdf_basename}' ({original_doc.page_count} pages)...")
+
+        # First pass: Analyze fonts to find the majority
+        for page_num_analysis in range(original_doc.page_count):
+            page_analysis = original_doc[page_num_analysis]
+            # Using TEXTFLAGS_TEXT for potentially cleaner text from spans
+            text_dict_analysis = page_analysis.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)
+            for block_analysis in text_dict_analysis.get("blocks", []):
+                if block_analysis.get("type") == 0:  # type 0 is a text block
+                    for line_analysis in block_analysis.get("lines", []):
+                        for span_analysis in line_analysis.get("spans", []):
+                            font_name = span_analysis["font"]
+                            font_size = span_analysis.get("size")
+                            if font_size is None: continue  # Skip if size is not available
+
+                            font_size_rounded = int(round(font_size))
+                            text = span_analysis["text"]
+                            if not text.strip(): continue  # Skip purely whitespace spans
+
                             font_char_counts[(font_name, font_size_rounded)] += len(text)
 
         if not font_char_counts:
-            print("FontFilter: No text with font information found in PDF.")
+            print("FontFilter (Direct): No text with font information found in PDF.")
             return ""
 
         majority_font_tuple_info = font_char_counts.most_common(1)[0]
         (majority_font_name, majority_font_size_rounded) = majority_font_tuple_info[0]
-
-        print(
-        #
-        … (old filtered-document assembly: removed lines not preserved in this view)
+        char_count_for_majority = majority_font_tuple_info[1]
+        print(
+            f"FontFilter (Direct): Majority font identified: Name='{majority_font_name}', RoundedSize={majority_font_size_rounded}pt ({char_count_for_majority} chars).")
+
+        # 2. Second Pass: Extract and Assemble Text Based on Majority Font
+        print(
+            f"FontFilter (Direct): Extracting text matching majority font (Name='{majority_font_name}', RoundedSize={majority_font_size_rounded}pt)...")
+        all_pages_collected_text = []  # List to hold text from each page (as a list of block texts)
+
+        for page_num_extraction in range(original_doc.page_count):
+            page = original_doc[page_num_extraction]
+            # Using flags for potentially better whitespace and ligature handling in extracted text
+            text_page_dict = page.get_text("dict",
+                                           flags=fitz.TEXTFLAGS_TEXT | fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE)
+
+            page_blocks_text_parts = []  # Collect text from blocks on this page
+
+            for block in text_page_dict.get("blocks", []):
+                if block.get("type") == 0:  # Text block
+                    current_block_lines_text_parts = []
+                    for line in block.get("lines", []):
+                        current_line_spans_text_parts = []
+                        for span in line.get("spans", []):
+                            # Check if this span matches the majority font
+                            current_span_font_name = span["font"]
+                            current_span_font_size = span.get("size")
+
+                            if current_span_font_size is not None and \
+                               current_span_font_name == majority_font_name and \
+                               int(round(current_span_font_size)) == majority_font_size_rounded:
+                                current_line_spans_text_parts.append(span["text"])
+
+                        if current_line_spans_text_parts:
+                            # Join text from selected spans within a line with a single space
+                            line_text = " ".join(current_line_spans_text_parts)
+                            current_block_lines_text_parts.append(line_text)
+
+                    if current_block_lines_text_parts:
+                        # Join lines within a block with a single newline
+                        block_text = "\n".join(current_block_lines_text_parts)
+                        page_blocks_text_parts.append(block_text)
+
+            if page_blocks_text_parts:
+                # Join blocks on a page with a double newline (simulating paragraph breaks)
+                all_pages_collected_text.append("\n\n".join(page_blocks_text_parts))
+
+        if not all_pages_collected_text:
+            print("FontFilter (Direct): No text matching the majority font was found to extract.")
+            return ""
+
+        # Join text from all pages.
+        # A page break is already handled by the \n\n between blocks of different pages.
+        # If more distinct page separation is needed, a custom separator could be added here.
+        final_text = "\n\n".join(all_pages_collected_text)
+        print(f"FontFilter (Direct): Successfully extracted text. Total length: {len(final_text)} characters.")
+        return final_text
+
     except Exception as e:
-        print(f"Error in
+        print(f"Error in extract_majority_font_text_directly for '{pdf_path}': {e}\n{traceback.format_exc()}")
         return ""
     finally:
         if original_doc: original_doc.close()
 

@@ -129,7 +171,7 @@ def extract_plain_text_from_original_pdf(pdf_path: str) -> str:
     try:
         doc_orig_text = fitz.open(pdf_path)
         full_text_parts = [page.get_text("text") for page in doc_orig_text]
-
+        print(full_text_parts)
         return "".join(full_text_parts)
     except Exception as e:
         print(f"Error extracting plain text from original PDF '{pdf_path}': {e}\n{traceback.format_exc()}")