File size: 8,288 Bytes
d07ab72 070b77e d07ab72 070b77e 2c6cadb 070b77e d07ab72 070b77e d07ab72 070b77e d07ab72 070b77e d07ab72 b690306 070b77e b690306 070b77e b690306 d07ab72 070b77e d07ab72 070b77e d07ab72 070b77e 2c6cadb 070b77e d07ab72 070b77e d07ab72 070b77e d07ab72 070b77e d07ab72 070b77e d07ab72 070b77e d07ab72 070b77e d07ab72 070b77e d07ab72 070b77e d07ab72 070b77e d07ab72 070b77e b690306 070b77e d07ab72 070b77e d07ab72 070b77e d07ab72 070b77e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
# main_analyzer.py
import fitz # PyMuPDF
import os
import tempfile
import re
import traceback
from typing import Tuple, Dict, Any, List
from collections import defaultdict
from pdf_processing import (
extract_majority_font_text_directly,
extract_plain_text_from_original_pdf,
try_map_issues_to_page_rects
)
from content_analysis import (
check_metadata, check_disclosures, check_figures_and_tables,
check_references_summary, check_structure,
check_figure_order, check_reference_order
)
from language_checker import perform_language_checks
from regex_checker import perform_regex_checks
def _map_issues_to_pdf_coordinates(pdf_path: str,
                                   detailed_issues_for_mapping: List[Dict[str, Any]]) -> None:
    """Best-effort mapping of issue context snippets to PDF rectangles, in place.

    For each page, the still-unmapped issues are grouped by their context text
    so each distinct string is searched only once per page via
    ``page.search_for``; hits are written back onto the issue dicts by
    ``try_map_issues_to_page_rects``. All failures are logged and swallowed —
    mapping is a best-effort enrichment, never a reason to abort the analysis.
    """
    doc_for_mapping = None
    try:
        doc_for_mapping = fitz.open(pdf_path)
        if doc_for_mapping.page_count > 0:
            print(f"Analyzer: Mapping {len(detailed_issues_for_mapping)} issues to PDF coordinates...")
            for page_idx in range(doc_for_mapping.page_count):
                page = doc_for_mapping[page_idx]
                current_page_num_1_based = page_idx + 1
                # Re-group the issues that still lack coordinates, keyed by
                # context string, so duplicates share one text search.
                unmapped_issues_by_context = defaultdict(list)
                for issue_dict in detailed_issues_for_mapping:
                    if not issue_dict['is_mapped_to_pdf']:
                        unmapped_issues_by_context[issue_dict['context_text']].append(issue_dict)
                if not unmapped_issues_by_context:
                    if all(iss['is_mapped_to_pdf'] for iss in detailed_issues_for_mapping):
                        break  # everything mapped — no point scanning later pages
                    continue
                for ctx_str, issues_for_ctx in unmapped_issues_by_context.items():
                    if not ctx_str or not ctx_str.strip():
                        continue  # empty context cannot be searched for
                    try:
                        pdf_rects = page.search_for(
                            ctx_str,
                            flags=fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE)
                        if pdf_rects:
                            try_map_issues_to_page_rects(issues_for_ctx, pdf_rects, current_page_num_1_based)
                    except Exception as search_exc:
                        print(f"Analyzer: Warning: Error searching for context '{ctx_str[:30].replace(chr(10),' ')}' on page {current_page_num_1_based}: {search_exc}")
            total_mapped = sum(1 for iss in detailed_issues_for_mapping if iss['is_mapped_to_pdf'])
            print(f"Analyzer: Finished coordinate mapping. Mapped issues: {total_mapped}/{len(detailed_issues_for_mapping)}.")
    except Exception as e_map:
        print(f"Analyzer: Error during PDF coordinate mapping: {e_map}\n{traceback.format_exc()}")
    finally:
        if doc_for_mapping:
            doc_for_mapping.close()


def _format_issue(issue_data: Dict[str, Any]) -> Dict[str, Any]:
    """Convert one internal issue dict into the public, JSON-friendly shape."""
    coords = issue_data.get('pdf_coordinates_list', [{}])[0] if issue_data.get('is_mapped_to_pdf') else {}
    coords_for_json = [coords.get("x0"), coords.get("y0"), coords.get("x1"), coords.get("y1")] if coords else []
    coords_for_json = [c for c in coords_for_json if c is not None]
    return {
        "message": issue_data.get('message', 'N/A'),
        "context": issue_data.get('context_text', 'N/A'),
        "suggestions": issue_data.get('replacements_suggestion', []),
        "category": issue_data.get('category_name', 'Unknown'),
        "rule_id": issue_data.get('ruleId', 'N/A'),
        "offset": issue_data.get('offset_in_text', -1),
        "length": issue_data.get('error_length', 0),
        # Only emit coordinates when a complete [x0, y0, x1, y1] quad exists.
        "coordinates": coords_for_json if len(coords_for_json) == 4 else [],
        "page": issue_data.get('mapped_page_number', 0) if issue_data.get('is_mapped_to_pdf') else 0,
        "source_check_type": issue_data.get('source_check_type', 'N/A')
    }


def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
    """Run the full document-quality analysis pipeline on a PDF.

    Accepts a filesystem path (str), raw PDF ``bytes``, or a binary file-like
    object with a ``.read()`` method. Extracts plain and font-filtered text,
    runs the content/regex/language checks, maps detailed issues back to PDF
    page coordinates, and returns ``(results_dict, None)``; on failure the
    first element is ``{"error": ...}`` instead.
    """
    original_pdf_access_path = None
    temp_file_for_stream_path = None
    try:
        if isinstance(filepath_or_stream, str):
            original_pdf_access_path = filepath_or_stream
            print(f"Analyzer: Input is a string path: {original_pdf_access_path}")
        else:
            # The temp-file scaffolding below previously existed but was never
            # wired up: non-path input fell straight through to the "does not
            # exist" error. Spool bytes / file-like input to a temporary file
            # so the path-based extraction helpers can open it.
            pdf_bytes = filepath_or_stream.read() if hasattr(filepath_or_stream, 'read') else filepath_or_stream
            if isinstance(pdf_bytes, (bytes, bytearray)):
                fd, temp_file_for_stream_path = tempfile.mkstemp(suffix=".pdf")
                with os.fdopen(fd, 'wb') as tmp_f:
                    tmp_f.write(pdf_bytes)
                original_pdf_access_path = temp_file_for_stream_path
                print(f"Analyzer: Input stream saved to temporary PDF file: {original_pdf_access_path}")
        if not original_pdf_access_path or not os.path.exists(original_pdf_access_path):
            return {"error": f"PDF path '{original_pdf_access_path}' (derived from input) does not exist or is invalid."}, None

        # 1. Plain text from the original PDF (for the content/regex checks).
        print(f"Analyzer: Extracting plain text from original PDF: {original_pdf_access_path}")
        raw_unfiltered_plain_text = extract_plain_text_from_original_pdf(original_pdf_access_path)
        pdf_size = os.path.getsize(original_pdf_access_path)
        if not raw_unfiltered_plain_text and pdf_size > 0:
            print("Analyzer: Warning: Raw unfiltered plain text extraction yielded empty result. PDF might be image-based or have extraction issues.")
        # Collapse all whitespace runs to single spaces for the text checks.
        cleaned_unfiltered_plain_text = re.sub(r'\s+', ' ', raw_unfiltered_plain_text.replace('\n', ' ')).strip()

        # 2. Font-filtered markdown (for LanguageTool checks).
        print(f"Analyzer: Extracting font-filtered markdown from: {original_pdf_access_path}")
        markdown_text_from_filtered_pdf = extract_majority_font_text_directly(original_pdf_access_path)
        if not markdown_text_from_filtered_pdf and pdf_size > 0:
            print("Analyzer: Warning: Font-filtered Markdown extraction yielded empty result.")

        # 3. Document-level checks against the cleaned plain text.
        document_check_results = {
            "metadata": check_metadata(cleaned_unfiltered_plain_text),
            "disclosures": check_disclosures(cleaned_unfiltered_plain_text),
            "figures_and_tables": check_figures_and_tables(cleaned_unfiltered_plain_text),
            "references_summary": check_references_summary(cleaned_unfiltered_plain_text),
            "structure": check_structure(cleaned_unfiltered_plain_text),
            "figure_order_analysis": check_figure_order(cleaned_unfiltered_plain_text),
            "reference_order_analysis": check_reference_order(cleaned_unfiltered_plain_text),
            "plain_language_summary_present": bool(re.search(r'plain language summary', cleaned_unfiltered_plain_text, re.IGNORECASE)),
            "readability_issues_detected": False,
        }

        print("Analyzer: Performing regex checks...")
        regex_report = perform_regex_checks(cleaned_unfiltered_plain_text)
        if "error" in regex_report:
            print(f"Analyzer: Error in regex checks: {regex_report['error']}")
        regex_issues = regex_report.get("issues_list", [])

        print("Analyzer: Performing language checks...")
        lt_report = perform_language_checks(markdown_text_from_filtered_pdf)
        if "error" in lt_report:
            print(f"Analyzer: Error in LanguageTool checks: {lt_report['error']}")
        lt_issues = lt_report.get("issues_list", [])

        detailed_issues_for_mapping = regex_issues + lt_issues

        # 4. Coordinate mapping against the original PDF (best-effort).
        if detailed_issues_for_mapping:
            _map_issues_to_pdf_coordinates(original_pdf_access_path, detailed_issues_for_mapping)
        else:
            print("Analyzer: No detailed issues from regex or language checks to map.")

        # 5. Format the final, JSON-friendly list of issues.
        final_formatted_issues_list = [_format_issue(issue_data) for issue_data in detailed_issues_for_mapping]

        results = {
            "issues": final_formatted_issues_list,
            "document_checks": document_check_results
        }
        return results, None
    except Exception as e:
        print(f"Overall analysis error in analyze_pdf: {e}\n{traceback.format_exc()}")
        return {"error": f"Overall analysis error: {str(e)}"}, None
    finally:
        # Remove the temp file we created for stream input, if any.
        if temp_file_for_stream_path and os.path.exists(temp_file_for_stream_path):
            try:
                os.remove(temp_file_for_stream_path)
                print(f"Analyzer: Cleaned up main temporary PDF file: {temp_file_for_stream_path}")
            except Exception as e_clean:
                print(f"Analyzer: Error cleaning up main temporary PDF file {temp_file_for_stream_path}: {e_clean}")