File size: 7,110 Bytes
d07ab72 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
# main_analyzer.py
import fitz # PyMuPDF
import os
import tempfile
import re
import traceback
from typing import Tuple, Dict, Any, List, Optional
from collections import defaultdict
# Import functions from our refactored modules
from pdf_processing import extract_pdf_text, try_map_issues_to_page_rects # convert_rect_to_dict is used by try_map_issues
from text_utils import convert_markdown_to_plain_text
from content_analysis import (
check_metadata, check_disclosures, check_figures_and_tables,
check_references_summary, check_structure, check_language_issues_and_regex,
check_figure_order, check_reference_order
)
def _release_mapping_resources(doc: Any, temp_path: Optional[str]) -> None:
    """Close the mapping document and delete the temporary PDF, never raising.

    Safe to call more than once for the same resources: an already-closed
    document or an already-removed file is skipped, and any failure is
    reported as a warning instead of propagating to the caller.
    """
    # Compare against None instead of relying on truthiness: a PyMuPDF
    # Document defines __len__ (page count), so a zero-page document is falsy
    # and a closed one may raise when its length is queried.
    if doc is not None and not getattr(doc, "is_closed", False):
        try:
            doc.close()
        except Exception as close_exc:
            print(f"Warning: Could not close PDF document: {close_exc}")
    if temp_path and os.path.exists(temp_path):
        try:
            os.unlink(temp_path)
        except OSError as unlink_exc:
            print(f"Warning: Could not remove temporary file '{temp_path}': {unlink_exc}")


def _map_issues_to_pdf_coordinates(doc: Any, issues: List[Dict[str, Any]]) -> None:
    """Search every page of ``doc`` for the context text of still-unmapped issues.

    Issues are grouped by their ``context_text`` so each distinct context is
    searched once per page; hits are handed to ``try_map_issues_to_page_rects``
    together with the 1-based page number. Stops early once no issue remains
    unmapped.
    """
    print(f"\n--- Mapping {len(issues)} Issues (filtered) to PDF Coordinates ---")
    search_flags = fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE

    for page_idx in range(doc.page_count):
        current_page_num_1_based = page_idx + 1

        unmapped_by_context = defaultdict(list)
        for issue_dict in issues:
            if not issue_dict['is_mapped_to_pdf']:
                unmapped_by_context[issue_dict['context_text']].append(issue_dict)

        # An empty grouping means every issue is already mapped, so the
        # remaining pages do not need to be loaded or searched.
        if not unmapped_by_context:
            break

        page = doc[page_idx]
        for ctx_str, issues_for_ctx in unmapped_by_context.items():
            if not ctx_str.strip():
                continue
            try:
                pdf_rects = page.search_for(ctx_str, flags=search_flags)
                if pdf_rects:
                    try_map_issues_to_page_rects(issues_for_ctx, pdf_rects, current_page_num_1_based)
            except Exception as search_exc:
                # A failed search for one context must not abort the others.
                print(f"Warning: Error searching for context '{ctx_str[:30]}' on page {current_page_num_1_based}: {search_exc}")

    total_mapped = sum(1 for iss in issues if iss['is_mapped_to_pdf'])
    print(f"Finished coordinate mapping. Mapped issues: {total_mapped}/{len(issues)}.")


def _format_issue_for_output(issue_data: Dict[str, Any]) -> Dict[str, Any]:
    """Convert one internal issue record into the dict returned to callers.

    Unmapped issues get ``page`` 0 and an empty ``coordinates`` list; mapped
    issues use the first entry of ``pdf_coordinates_list`` as
    ``[x0, y0, x1, y1]``.
    """
    page_num_for_json = 0
    coords_for_json: List[Any] = []
    if issue_data['is_mapped_to_pdf'] and issue_data['pdf_coordinates_list']:
        coord_dict = issue_data['pdf_coordinates_list'][0]
        coords_for_json = [coord_dict['x0'], coord_dict['y0'], coord_dict['x1'], coord_dict['y1']]
        page_num_for_json = issue_data['mapped_page_number']
    return {
        "message": issue_data['message'], "context": issue_data['context_text'],
        "suggestions": issue_data['replacements_suggestion'], "category": issue_data['category_name'],
        "rule_id": issue_data['ruleId'], "offset": issue_data['offset_in_text'],
        "length": issue_data['error_length'], "coordinates": coords_for_json,
        "page": page_num_for_json
    }


def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
    """Run the language/regex checks and document-level checks on a PDF.

    Args:
        filepath_or_stream: Either a ``str`` path to the PDF or an object with
            a callable ``read`` (it must also support ``seek``). A stream is
            copied to a temporary ``.pdf`` file for coordinate mapping and the
            file is removed afterwards.

    Returns:
        A ``(results, None)`` tuple. On success ``results`` has the keys
        ``"issues"`` (list of formatted issue dicts) and ``"document_checks"``.
        On failure ``results`` is ``{"error": <message>}``. Exceptions raised
        during the analysis are caught, logged to stdout, and reported through
        the ``"error"`` key rather than propagated.
    """
    doc_for_mapping = None
    temp_fitz_file_path = None
    try:
        markdown_text = extract_pdf_text(filepath_or_stream)
        if not markdown_text:
            return {"error": "Failed to extract text (Markdown) from PDF."}, None

        plain_text_for_general_checks = convert_markdown_to_plain_text(markdown_text)
        # \s+ already matches newlines, so one substitution collapses all
        # whitespace runs (including line breaks) to single spaces.
        cleaned_plain_text_for_regex = re.sub(r'\s+', ' ', plain_text_for_general_checks).strip()

        language_and_regex_issue_report = check_language_issues_and_regex(markdown_text)
        if "error" in language_and_regex_issue_report:
            return {"error": f"Language/Regex check error: {language_and_regex_issue_report['error']}"}, None

        detailed_issues_for_mapping = language_and_regex_issue_report.get("issues_list", [])

        if detailed_issues_for_mapping:
            if isinstance(filepath_or_stream, str):
                pdf_path_for_fitz = filepath_or_stream
            elif hasattr(filepath_or_stream, 'read') and callable(filepath_or_stream.read):
                filepath_or_stream.seek(0)
                # The context manager closes the handle even if the copy
                # fails; the path is recorded first so the file can still be
                # deleted by the error handler below.
                with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_fitz_file:
                    temp_fitz_file_path = temp_fitz_file.name
                    temp_fitz_file.write(filepath_or_stream.read())
                pdf_path_for_fitz = temp_fitz_file_path
            else:
                return {"error": "Invalid PDF input for coordinate mapping."}, None

            try:
                doc_for_mapping = fitz.open(pdf_path_for_fitz)
                if doc_for_mapping.page_count > 0:
                    _map_issues_to_pdf_coordinates(doc_for_mapping, detailed_issues_for_mapping)
            except Exception as e_map:
                # Mapping is best-effort: issues are still reported, just
                # without page/coordinate information.
                print(f"Error during PDF coordinate mapping: {e_map}")
                traceback.print_exc()
            finally:
                _release_mapping_resources(doc_for_mapping, temp_fitz_file_path)
                # Forget the released resources so the outer handler does not
                # try to release them a second time.
                doc_for_mapping = None
                temp_fitz_file_path = None
        else:
            print("No language/regex issues found within the defined content boundaries to map.")

        final_formatted_issues_list = [
            _format_issue_for_output(issue_data) for issue_data in detailed_issues_for_mapping
        ]

        results = {
            "issues": final_formatted_issues_list,
            "document_checks": {
                "metadata": check_metadata(cleaned_plain_text_for_regex),
                "disclosures": check_disclosures(cleaned_plain_text_for_regex),
                "figures_and_tables": check_figures_and_tables(cleaned_plain_text_for_regex),
                "references_summary": check_references_summary(cleaned_plain_text_for_regex),
                "structure": check_structure(cleaned_plain_text_for_regex),
                "figure_order_analysis": check_figure_order(cleaned_plain_text_for_regex),
                "reference_order_analysis": check_reference_order(cleaned_plain_text_for_regex),
                "plain_language_summary_present": bool(re.search(r'plain language summary', cleaned_plain_text_for_regex, re.IGNORECASE)),
                "readability_issues_detected": False,
            }
        }
        return results, None
    except Exception as e:
        print(f"Overall analysis error in analyze_pdf: {e}")
        traceback.print_exc()
        # Only reached with live resources when the failure happened before
        # the mapping step's own cleanup ran (e.g. while copying the stream).
        _release_mapping_resources(doc_for_mapping, temp_fitz_file_path)
        return {"error": str(e)}, None