File size: 7,110 Bytes
d07ab72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# main_analyzer.py
import fitz  # PyMuPDF
import os
import tempfile
import re
import traceback
from typing import Tuple, Dict, Any, List, Optional
from collections import defaultdict

# Import functions from our refactored modules
from pdf_processing import extract_pdf_text, try_map_issues_to_page_rects # convert_rect_to_dict is used by try_map_issues
from text_utils import convert_markdown_to_plain_text
from content_analysis import (
    check_metadata, check_disclosures, check_figures_and_tables,
    check_references_summary, check_structure, check_language_issues_and_regex,
    check_figure_order, check_reference_order
)


def _group_unmapped_issues_by_context(
    issues: List[Dict[str, Any]],
) -> Dict[str, List[Dict[str, Any]]]:
    """Group the issues not yet mapped to the PDF by their ``context_text``."""
    grouped: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
    for issue in issues:
        if not issue['is_mapped_to_pdf']:
            grouped[issue['context_text']].append(issue)
    return grouped


def _map_issues_to_pdf_coordinates(doc: Any, issues: List[Dict[str, Any]]) -> None:
    """Search each page of *doc* for the issues' context text and map the hits.

    Args:
        doc: An open ``fitz`` document with at least one page.
        issues: Issue dicts carrying ``is_mapped_to_pdf`` and ``context_text``.
            They are handed to ``try_map_issues_to_page_rects``, which is
            presumably what flips ``is_mapped_to_pdf`` -- verify against
            ``pdf_processing``.
    """
    print(f"\n--- Mapping {len(issues)} Issues (filtered) to PDF Coordinates ---")
    search_flags = fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE

    for page_idx in range(doc.page_count):
        # Regroup on every page so issues mapped on earlier pages are skipped.
        pending_by_context = _group_unmapped_issues_by_context(issues)
        if not pending_by_context:
            # Nothing left unmapped: no need to scan the remaining pages.
            break

        page = doc[page_idx]
        page_number = page_idx + 1  # pages are reported 1-based

        for context_text, context_issues in pending_by_context.items():
            if not context_text.strip():
                continue
            try:
                rects = page.search_for(context_text, flags=search_flags)
                if rects:
                    try_map_issues_to_page_rects(context_issues, rects, page_number)
            except Exception as search_exc:
                # Best effort: one failed search must not abort the whole mapping.
                print(f"Warning: Error searching for context '{context_text[:30]}' on page {page_number}: {search_exc}")

    total_mapped = sum(1 for issue in issues if issue['is_mapped_to_pdf'])
    print(f"Finished coordinate mapping. Mapped issues: {total_mapped}/{len(issues)}.")


def _release_mapping_resources(doc: Any, temp_pdf_path: Optional[str]) -> None:
    """Close *doc* (if one was opened) and delete the temporary PDF copy (if any)."""
    try:
        # Compare against None: a document's truthiness follows its page count,
        # so a plain ``if doc:`` would skip closing an empty document.
        if doc is not None:
            doc.close()
    finally:
        if temp_pdf_path and os.path.exists(temp_pdf_path):
            os.unlink(temp_pdf_path)


def _format_issue_for_output(issue: Dict[str, Any]) -> Dict[str, Any]:
    """Convert one internal issue dict into the flat structure returned to callers.

    Unmapped issues get ``page`` 0 and an empty ``coordinates`` list; mapped
    issues report only the first rectangle of ``pdf_coordinates_list``.
    """
    page_number = 0
    coordinates: List[Any] = []
    if issue['is_mapped_to_pdf'] and issue['pdf_coordinates_list']:
        first_rect = issue['pdf_coordinates_list'][0]
        coordinates = [first_rect['x0'], first_rect['y0'], first_rect['x1'], first_rect['y1']]
        page_number = issue['mapped_page_number']

    return {
        "message": issue['message'], "context": issue['context_text'],
        "suggestions": issue['replacements_suggestion'], "category": issue['category_name'],
        "rule_id": issue['ruleId'], "offset": issue['offset_in_text'],
        "length": issue['error_length'], "coordinates": coordinates,
        "page": page_number,
    }


def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
    """Run language/regex checks and document-level checks on a PDF.

    Text is extracted as Markdown, checked for language/regex issues, and the
    issues found are mapped (best effort) to page coordinates by searching each
    page for the issue's context text. Document-level checks run on a
    whitespace-normalised plain-text version of the same content.

    Args:
        filepath_or_stream: Either a path string or an object with callable
            ``read`` (and ``seek``). A stream is rewound and copied to a
            temporary ``.pdf`` file for coordinate mapping; the copy is removed
            before returning.

    Returns:
        A 2-tuple whose second element is always ``None``. The first element is
        either ``{"issues": [...], "document_checks": {...}}`` on success or
        ``{"error": <message>}`` on failure. Exceptions are caught, logged to
        stdout and reported through the ``"error"`` key.
    """
    try:
        markdown_text = extract_pdf_text(filepath_or_stream)
        if not markdown_text:
            return {"error": "Failed to extract text (Markdown) from PDF."}, None

        plain_text = convert_markdown_to_plain_text(markdown_text)
        # ``\s+`` already matches newlines, so one substitution flattens everything.
        cleaned_plain_text = re.sub(r'\s+', ' ', plain_text).strip()

        issue_report = check_language_issues_and_regex(markdown_text)
        if "error" in issue_report:
            return {"error": f"Language/Regex check error: {issue_report['error']}"}, None

        detailed_issues = issue_report.get("issues_list", [])

        if detailed_issues:
            doc_for_mapping = None
            temp_pdf_path: Optional[str] = None
            try:
                if isinstance(filepath_or_stream, str):
                    pdf_path = filepath_or_stream
                elif hasattr(filepath_or_stream, 'read') and callable(filepath_or_stream.read):
                    filepath_or_stream.seek(0)
                    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
                        # Record the path first so cleanup runs even if the write fails.
                        temp_pdf_path = temp_pdf.name
                        temp_pdf.write(filepath_or_stream.read())
                    pdf_path = temp_pdf_path
                else:
                    return {"error": "Invalid PDF input for coordinate mapping."}, None

                try:
                    doc_for_mapping = fitz.open(pdf_path)
                    if doc_for_mapping.page_count > 0:
                        _map_issues_to_pdf_coordinates(doc_for_mapping, detailed_issues)
                except Exception as e_map:
                    # Mapping is best effort: issues are still reported, just unmapped.
                    print(f"Error during PDF coordinate mapping: {e_map}")
                    traceback.print_exc()
            finally:
                # Single cleanup point: the document is closed exactly once, so a
                # later failure cannot trigger a second close() in the error path.
                _release_mapping_resources(doc_for_mapping, temp_pdf_path)

        results = {
            "issues": [_format_issue_for_output(issue) for issue in detailed_issues],
            "document_checks": {
                "metadata": check_metadata(cleaned_plain_text),
                "disclosures": check_disclosures(cleaned_plain_text),
                "figures_and_tables": check_figures_and_tables(cleaned_plain_text),
                "references_summary": check_references_summary(cleaned_plain_text),
                "structure": check_structure(cleaned_plain_text),
                "figure_order_analysis": check_figure_order(cleaned_plain_text),
                "reference_order_analysis": check_reference_order(cleaned_plain_text),
                "plain_language_summary_present": bool(re.search(r'plain language summary', cleaned_plain_text, re.IGNORECASE)),
                "readability_issues_detected": False,
            }
        }

        return results, None

    except Exception as e:
        print(f"Overall analysis error in analyze_pdf: {e}")
        traceback.print_exc()
        return {"error": str(e)}, None