File size: 8,288 Bytes
d07ab72
 
 
 
 
 
070b77e
d07ab72
 
070b77e
2c6cadb
070b77e
 
 
d07ab72
 
070b77e
d07ab72
 
070b77e
 
d07ab72
 
070b77e
 
d07ab72
 
 
b690306
070b77e
b690306
070b77e
 
b690306
d07ab72
070b77e
 
 
 
 
 
d07ab72
070b77e
d07ab72
070b77e
 
2c6cadb
 
070b77e
 
d07ab72
070b77e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d07ab72
070b77e
 
 
 
d07ab72
070b77e
d07ab72
070b77e
 
 
 
 
 
 
 
d07ab72
070b77e
 
 
d07ab72
070b77e
 
 
 
 
 
 
 
 
 
d07ab72
070b77e
d07ab72
 
070b77e
 
 
 
d07ab72
 
070b77e
 
 
 
d07ab72
070b77e
 
 
 
 
 
 
b690306
070b77e
 
d07ab72
070b77e
d07ab72
 
070b77e
d07ab72
 
 
070b77e
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# main_analyzer.py
import fitz  # PyMuPDF
import os
import tempfile
import re
import traceback
from typing import Tuple, Dict, Any, List
from collections import defaultdict

from pdf_processing import (
    extract_majority_font_text_directly,
    extract_plain_text_from_original_pdf,
    try_map_issues_to_page_rects
)
from content_analysis import (
    check_metadata, check_disclosures, check_figures_and_tables,
    check_references_summary, check_structure, 
    check_figure_order, check_reference_order
)
from language_checker import perform_language_checks
from regex_checker import perform_regex_checks

def _group_unmapped_issues_by_context(issues: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
    """Group issues that are not yet mapped to the PDF by their context string.

    Issues whose context is missing, not a string, or blank are skipped: there
    is nothing to search for, so they can never be mapped.
    """
    grouped = defaultdict(list)
    for issue in issues:
        if issue.get('is_mapped_to_pdf'):
            continue
        context = issue.get('context_text')
        if not isinstance(context, str) or not context.strip():
            continue
        grouped[context].append(issue)
    return grouped


def _map_issues_to_pdf_coordinates(pdf_path: str, issues: List[Dict[str, Any]]) -> None:
    """Search every page of the PDF for issue contexts and map hits in place.

    Mapping is best effort: any failure is logged and swallowed so that the
    caller can still report the issues without coordinates.
    """
    doc_for_mapping = None
    try:
        doc_for_mapping = fitz.open(pdf_path)
        if doc_for_mapping.page_count > 0:
            print(f"Analyzer: Mapping {len(issues)} issues to PDF coordinates...")
            search_flags = fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE
            for page_idx in range(doc_for_mapping.page_count):
                # Regroup on every page: the mapping helper is handed the issue
                # dicts themselves, and the 'is_mapped_to_pdf' flag is re-read here.
                pending_by_context = _group_unmapped_issues_by_context(issues)
                if not pending_by_context:
                    break  # Nothing searchable is left, so later pages cannot help.

                page = doc_for_mapping[page_idx]
                current_page_num_1_based = page_idx + 1
                for ctx_str, issues_for_ctx in pending_by_context.items():
                    try:
                        pdf_rects = page.search_for(ctx_str, flags=search_flags)
                        if pdf_rects:
                            try_map_issues_to_page_rects(issues_for_ctx, pdf_rects, current_page_num_1_based)
                    except Exception as search_exc:
                        print(f"Analyzer: Warning: Error searching for context '{ctx_str[:30].replace(chr(10),' ')}' on page {current_page_num_1_based}: {search_exc}")
            total_mapped = sum(1 for iss in issues if iss.get('is_mapped_to_pdf'))
            print(f"Analyzer: Finished coordinate mapping. Mapped issues: {total_mapped}/{len(issues)}.")
    except Exception as e_map:
        print(f"Analyzer: Error during PDF coordinate mapping: {e_map}\n{traceback.format_exc()}")
    finally:
        # Compare against None rather than relying on truthiness, which for a
        # sized object can be False when it is empty (e.g. a zero-page document).
        if doc_for_mapping is not None:
            doc_for_mapping.close()


def _format_issue_for_output(issue_data: Dict[str, Any]) -> Dict[str, Any]:
    """Convert one internal issue dict into the flat structure returned to callers."""
    is_mapped = bool(issue_data.get('is_mapped_to_pdf'))

    first_rect = {}
    if is_mapped:
        # Tolerate a missing, None or empty coordinate list instead of raising.
        rect_list = issue_data.get('pdf_coordinates_list') or [{}]
        first_rect = rect_list[0] or {}

    corner_values = [first_rect.get(key) for key in ("x0", "y0", "x1", "y1")]
    # Only report coordinates when all four corners are present.
    coordinates = corner_values if all(value is not None for value in corner_values) else []

    return {
        "message": issue_data.get('message', 'N/A'),
        "context": issue_data.get('context_text', 'N/A'),
        "suggestions": issue_data.get('replacements_suggestion', []),
        "category": issue_data.get('category_name', 'Unknown'),
        "rule_id": issue_data.get('ruleId', 'N/A'),
        "offset": issue_data.get('offset_in_text', -1),
        "length": issue_data.get('error_length', 0),
        "coordinates": coordinates,
        "page": issue_data.get('mapped_page_number', 0) if is_mapped else 0,
        "source_check_type": issue_data.get('source_check_type', 'N/A')
    }


def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
    """Run all document, regex and language checks on a PDF.

    Args:
        filepath_or_stream: Either a filesystem path (``str`` or
            ``os.PathLike``) to the PDF, or a binary file-like object with a
            ``read()`` method. A stream is written to a temporary ``.pdf``
            file for the duration of the analysis and removed afterwards.

    Returns:
        A ``(results, None)`` tuple. On success ``results`` contains
        ``"issues"`` (a list of formatted issue dicts) and
        ``"document_checks"`` (the document-level check results). On failure
        ``results`` is ``{"error": <message>}``. The second element is always
        ``None``.

    This function does not raise: every exception is logged with its
    traceback and reported through the ``"error"`` key.
    """
    original_pdf_access_path = None
    temp_file_for_stream_path = None

    try:
        # 1. Resolve the input to a path on disk.
        if isinstance(filepath_or_stream, (str, os.PathLike)):
            original_pdf_access_path = os.fsdecode(filepath_or_stream)
            print(f"Analyzer: Input is a string path: {original_pdf_access_path}")
        elif hasattr(filepath_or_stream, "read"):
            # Record the temp path before writing so the outer ``finally``
            # removes the file even if reading or writing the stream fails.
            temp_fd, temp_file_for_stream_path = tempfile.mkstemp(suffix=".pdf")
            with os.fdopen(temp_fd, "wb") as temp_pdf:
                temp_pdf.write(filepath_or_stream.read())
            original_pdf_access_path = temp_file_for_stream_path
            print(f"Analyzer: Input is a stream; copied to temporary PDF file: {temp_file_for_stream_path}")

        # isfile (not exists) so that a directory is rejected here with a clear message.
        if not original_pdf_access_path or not os.path.isfile(original_pdf_access_path):
            return {"error": f"PDF path '{original_pdf_access_path}' (derived from input) does not exist or is invalid."}, None

        print(f"Analyzer: Extracting plain text from original PDF: {original_pdf_access_path}")
        # Treat a None result like an empty extraction instead of crashing below.
        raw_unfiltered_plain_text = extract_plain_text_from_original_pdf(original_pdf_access_path) or ""

        pdf_size = os.path.getsize(original_pdf_access_path)
        if not raw_unfiltered_plain_text and pdf_size > 0:
            print("Analyzer: Warning: Raw unfiltered plain text extraction yielded empty result. PDF might be image-based or have extraction issues.")

        # Collapse every whitespace run (newlines included) into a single space.
        cleaned_unfiltered_plain_text = re.sub(r'\s+', ' ', raw_unfiltered_plain_text.replace('\n', ' ')).strip()

        # 2. Font-filtered markdown (for LanguageTool checks)
        print(f"Analyzer: Extracting font-filtered markdown from: {original_pdf_access_path}")
        markdown_text_from_filtered_pdf = extract_majority_font_text_directly(original_pdf_access_path)
        # Log only the size: dumping the whole document floods the log.
        print(f"Analyzer: Font-filtered markdown extracted ({len(markdown_text_from_filtered_pdf or '')} characters).")
        if not markdown_text_from_filtered_pdf and pdf_size > 0:
            print("Analyzer: Warning: Font-filtered Markdown extraction yielded empty result.")

        # 3. Perform all checks
        document_check_results = {
            "metadata": check_metadata(cleaned_unfiltered_plain_text),
            "disclosures": check_disclosures(cleaned_unfiltered_plain_text),
            "figures_and_tables": check_figures_and_tables(cleaned_unfiltered_plain_text),
            "references_summary": check_references_summary(cleaned_unfiltered_plain_text),
            "structure": check_structure(cleaned_unfiltered_plain_text),
            "figure_order_analysis": check_figure_order(cleaned_unfiltered_plain_text),
            "reference_order_analysis": check_reference_order(cleaned_unfiltered_plain_text),
            "plain_language_summary_present": bool(re.search(r'plain language summary', cleaned_unfiltered_plain_text, re.IGNORECASE)),
            "readability_issues_detected": False,
        }

        print("Analyzer: Performing regex checks...")
        regex_report = perform_regex_checks(cleaned_unfiltered_plain_text)
        if "error" in regex_report:
            print(f"Analyzer: Error in regex checks: {regex_report['error']}")
        regex_issues = regex_report.get("issues_list", [])

        print("Analyzer: Performing language checks...")
        lt_report = perform_language_checks(markdown_text_from_filtered_pdf)
        if "error" in lt_report:
            print(f"Analyzer: Error in LanguageTool checks: {lt_report['error']}")
        lt_issues = lt_report.get("issues_list", [])

        detailed_issues_for_mapping = regex_issues + lt_issues

        # 4. Coordinate mapping (against the original PDF)
        if detailed_issues_for_mapping:
            _map_issues_to_pdf_coordinates(original_pdf_access_path, detailed_issues_for_mapping)
        else:
            print("Analyzer: No detailed issues from regex or language checks to map.")

        # 5. Format final list of issues
        final_formatted_issues_list = [
            _format_issue_for_output(issue_data) for issue_data in detailed_issues_for_mapping
        ]

        results = {
            "issues": final_formatted_issues_list,
            "document_checks": document_check_results
        }
        return results, None
    except Exception as e:
        print(f"Overall analysis error in analyze_pdf: {e}\n{traceback.format_exc()}")
        return {"error": f"Overall analysis error: {str(e)}"}, None
    finally:
        if temp_file_for_stream_path and os.path.exists(temp_file_for_stream_path):
            try:
                os.remove(temp_file_for_stream_path)
                print(f"Analyzer: Cleaned up main temporary PDF file: {temp_file_for_stream_path}")
            except Exception as e_clean:
                print(f"Analyzer: Error cleaning up main temporary PDF file {temp_file_for_stream_path}: {e_clean}")