Update main_analyzer.py
Browse files- main_analyzer.py +17 -10
main_analyzer.py
CHANGED
@@ -19,8 +19,6 @@ from content_analysis import (
|
|
19 |
)
|
20 |
from language_checker import perform_language_checks
|
21 |
from regex_checker import perform_regex_checks
|
22 |
-
# text_utils.convert_markdown_to_plain_text is used by language_checker
|
23 |
-
# config.py is imported in app.py
|
24 |
|
25 |
def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
|
26 |
original_pdf_access_path = None
|
@@ -28,21 +26,32 @@ def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
|
|
28 |
doc_for_mapping = None
|
29 |
|
30 |
try:
|
31 |
-
if isinstance(filepath_or_stream, str):
|
32 |
original_pdf_access_path = filepath_or_stream
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
elif hasattr(filepath_or_stream, 'read') and callable(filepath_or_stream.read):
|
34 |
with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file_obj:
|
35 |
temp_file_for_stream_path = temp_file_obj.name
|
36 |
-
filepath_or_stream.seek
|
|
|
37 |
temp_file_obj.write(filepath_or_stream.read())
|
38 |
original_pdf_access_path = temp_file_for_stream_path
|
39 |
-
print(f"Analyzer:
|
40 |
else:
|
41 |
-
return {"error": "Invalid PDF input type. Must be path or file-like object."}, None
|
42 |
|
43 |
if not original_pdf_access_path or not os.path.exists(original_pdf_access_path):
|
44 |
-
return {"error": f"PDF path '{original_pdf_access_path}' does not exist or is invalid."}, None
|
45 |
|
|
|
46 |
# 1. Unfiltered Plain Text (for general and regex checks)
|
47 |
print(f"Analyzer: Extracting plain text from original PDF: {original_pdf_access_path}")
|
48 |
raw_unfiltered_plain_text = extract_plain_text_from_original_pdf(original_pdf_access_path)
|
@@ -124,10 +133,8 @@ def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
|
|
124 |
for issue_data in detailed_issues_for_mapping:
|
125 |
coords = issue_data.get('pdf_coordinates_list', [{}])[0] if issue_data.get('is_mapped_to_pdf') else {}
|
126 |
coords_for_json = [coords.get("x0"), coords.get("y0"), coords.get("x1"), coords.get("y1")] if coords else []
|
127 |
-
# Filter out None coordinates that might arise from empty coords dict
|
128 |
coords_for_json = [c for c in coords_for_json if c is not None]
|
129 |
|
130 |
-
|
131 |
final_formatted_issues_list.append({
|
132 |
"message": issue_data.get('message', 'N/A'),
|
133 |
"context": issue_data.get('context_text', 'N/A'),
|
@@ -136,7 +143,7 @@ def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
|
|
136 |
"rule_id": issue_data.get('ruleId', 'N/A'),
|
137 |
"offset": issue_data.get('offset_in_text', -1),
|
138 |
"length": issue_data.get('error_length', 0),
|
139 |
-
"coordinates": coords_for_json if len(coords_for_json) == 4 else [],
|
140 |
"page": issue_data.get('mapped_page_number', 0) if issue_data.get('is_mapped_to_pdf') else 0,
|
141 |
"source_check_type": issue_data.get('source_check_type', 'N/A')
|
142 |
})
|
|
|
19 |
)
|
20 |
from language_checker import perform_language_checks
|
21 |
from regex_checker import perform_regex_checks
|
|
|
|
|
22 |
|
23 |
def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
|
24 |
original_pdf_access_path = None
|
|
|
26 |
doc_for_mapping = None
|
27 |
|
28 |
try:
|
29 |
+
if isinstance(filepath_or_stream, str):
|
30 |
original_pdf_access_path = filepath_or_stream
|
31 |
+
print(f"Analyzer: Input is a string path: {original_pdf_access_path}")
|
32 |
+
# Handle objects that expose a file path through a string .name attribute, such as Gradio's NamedString or a TemporaryFileWrapper.
|
33 |
+
elif hasattr(filepath_or_stream, 'name') and isinstance(getattr(filepath_or_stream, 'name'), str) and \
|
34 |
+
os.path.exists(getattr(filepath_or_stream, 'name')): # Ensure the .name path is valid
|
35 |
+
original_pdf_access_path = filepath_or_stream.name
|
36 |
+
print(f"Analyzer: Input is an object with .name attribute, using path: {original_pdf_access_path}")
|
37 |
+
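# If this object also has a .read method (e.g. a TemporaryFileWrapper), this branch still takes precedence.
|
38 |
+
# The stream-handling elif below is only reached when .name is missing, not a string, or not an existing path;
|
39 |
+
# using the .name path directly is usually fine and simpler than copying the stream to a temp file.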
|
40 |
elif hasattr(filepath_or_stream, 'read') and callable(filepath_or_stream.read):
|
41 |
with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file_obj:
|
42 |
temp_file_for_stream_path = temp_file_obj.name
|
43 |
+
if hasattr(filepath_or_stream, 'seek') and callable(filepath_or_stream.seek):
|
44 |
+
filepath_or_stream.seek(0)
|
45 |
temp_file_obj.write(filepath_or_stream.read())
|
46 |
original_pdf_access_path = temp_file_for_stream_path
|
47 |
+
print(f"Analyzer: Input stream saved to temp file: {original_pdf_access_path}")
|
48 |
else:
|
49 |
+
return {"error": f"Invalid PDF input type: {type(filepath_or_stream)}. Must be path string, an object with a .name attribute as path, or file-like stream object."}, None
|
50 |
|
51 |
if not original_pdf_access_path or not os.path.exists(original_pdf_access_path):
|
52 |
+
return {"error": f"PDF path '{original_pdf_access_path}' (derived from input) does not exist or is invalid."}, None
|
53 |
|
54 |
+
# --- The rest of the function remains the same as the previous complete listing ---
|
55 |
# 1. Unfiltered Plain Text (for general and regex checks)
|
56 |
print(f"Analyzer: Extracting plain text from original PDF: {original_pdf_access_path}")
|
57 |
raw_unfiltered_plain_text = extract_plain_text_from_original_pdf(original_pdf_access_path)
|
|
|
133 |
for issue_data in detailed_issues_for_mapping:
|
134 |
coords = issue_data.get('pdf_coordinates_list', [{}])[0] if issue_data.get('is_mapped_to_pdf') else {}
|
135 |
coords_for_json = [coords.get("x0"), coords.get("y0"), coords.get("x1"), coords.get("y1")] if coords else []
|
|
|
136 |
coords_for_json = [c for c in coords_for_json if c is not None]
|
137 |
|
|
|
138 |
final_formatted_issues_list.append({
|
139 |
"message": issue_data.get('message', 'N/A'),
|
140 |
"context": issue_data.get('context_text', 'N/A'),
|
|
|
143 |
"rule_id": issue_data.get('ruleId', 'N/A'),
|
144 |
"offset": issue_data.get('offset_in_text', -1),
|
145 |
"length": issue_data.get('error_length', 0),
|
146 |
+
"coordinates": coords_for_json if len(coords_for_json) == 4 else [],
|
147 |
"page": issue_data.get('mapped_page_number', 0) if issue_data.get('is_mapped_to_pdf') else 0,
|
148 |
"source_check_type": issue_data.get('source_check_type', 'N/A')
|
149 |
})
|