# NOTE: the three lines below this header in the original paste were
# git-blame/HTML extraction residue (file size, commit hashes, and a row of
# display line numbers), not source code; they have been commented out so the
# module parses.
# main_analyzer.py
import fitz # PyMuPDF
import os
import tempfile # Not strictly needed by analyze_pdf for input if app.py handles it
import re
import traceback
from typing import Tuple, Dict, Any, List
from collections import defaultdict
from pdf_processing import (
extract_majority_font_text_directly,
extract_plain_text_from_original_pdf,
try_map_issues_to_page_rects
)
from content_analysis import (
check_metadata, check_disclosures, check_figures_and_tables,
check_references_summary, check_structure,
check_figure_order, check_reference_order
)
from language_checker import perform_language_checks
from regex_checker import perform_regex_checks
def _map_issues_to_pdf_coordinates(doc: "fitz.Document",
                                   issues: List[Dict[str, Any]]) -> None:
    """Attach PDF page numbers/rects to each issue dict, in place.

    Walks *doc* page by page.  On every page the still-unmapped issues are
    grouped by their 'context_text' so each distinct context string is
    searched only once per page; matches are handed to
    try_map_issues_to_page_rects, which flips the issue's 'is_mapped_to_pdf'
    flag and records coordinates.  Stops early once every issue is mapped.

    Args:
        doc: An open fitz (PyMuPDF) document.
        issues: Issue dicts produced by the regex/LanguageTool checkers.
    """
    for page_idx in range(doc.page_count):
        page = doc[page_idx]
        page_num_1_based = page_idx + 1

        # Group unmapped issues by context so each context is searched once.
        unmapped_by_context: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
        for issue in issues:
            if not issue.get('is_mapped_to_pdf'):
                unmapped_by_context[issue.get('context_text', '')].append(issue)

        # The grouping above collects *every* unmapped issue, so an empty
        # dict means everything is already mapped -- no later page can help.
        if not unmapped_by_context:
            break

        for ctx_str, issues_for_ctx in unmapped_by_context.items():
            if not ctx_str or not ctx_str.strip():
                continue  # empty context cannot be searched for
            try:
                rects = page.search_for(
                    ctx_str,
                    flags=fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE,
                )
                if rects:
                    try_map_issues_to_page_rects(
                        issues_for_ctx, rects, page_num_1_based, page
                    )
            except Exception as search_exc:
                # A single bad context string must not abort the whole mapping.
                print(f"Analyzer: Warning: Error searching for context "
                      f"'{ctx_str[:30].replace(chr(10), ' ')}' on page "
                      f"{page_num_1_based}: {search_exc}")


def _format_issue_for_output(issue_data: Dict[str, Any]) -> Dict[str, Any]:
    """Convert one internal issue dict into the public, JSON-friendly schema.

    Coordinates are emitted only when the issue was mapped and a complete
    [x0, y0, x1, y1] quadruple is available; otherwise an empty list.
    """
    coords_for_json: List[Any] = []
    if issue_data.get('is_mapped_to_pdf'):
        # 'or [{}]' also guards an *empty* coordinate list, which would have
        # raised IndexError with a plain .get(..., [{}])[0].
        first_rect = (issue_data.get('pdf_coordinates_list') or [{}])[0]
        if first_rect:
            coords_for_json = [first_rect.get(k) for k in ("x0", "y0", "x1", "y1")]
            coords_for_json = [c for c in coords_for_json if c is not None]
    return {
        "message": issue_data.get('message', 'N/A'),
        "context": issue_data.get('context_text', 'N/A'),
        "suggestions": issue_data.get('replacements_suggestion', []),
        "category": issue_data.get('category_name', 'Unknown'),
        "rule_id": issue_data.get('ruleId', 'N/A'),
        "offset": issue_data.get('offset_in_text', -1),
        "length": issue_data.get('error_length', 0),
        # Only a full quadruple is meaningful to the frontend highlighter.
        "coordinates": coords_for_json if len(coords_for_json) == 4 else [],
        "page": issue_data.get('mapped_page_number', 0) if issue_data.get('is_mapped_to_pdf') else 0,
        "source_check_type": issue_data.get('source_check_type', 'N/A'),
    }


def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
    """Run the full document-quality analysis pipeline on a PDF.

    Pipeline: (1) extract plain text from the original PDF for the
    structural/content checks, (2) extract majority-font markdown for the
    LanguageTool pass, (3) run regex + language checks, (4) map each issue
    back to page coordinates in the original PDF, (5) format everything
    into a JSON-friendly result.

    Args:
        filepath_or_stream: Path to the PDF on disk.  In the Gradio flow
            app.py always passes a string path (it converts streams to a
            temp file first); any non-string input fails the path check.

    Returns:
        (results_dict, None) on success, or ({"error": ...}, None) on any
        failure.  The second element exists for interface compatibility.
    """
    original_pdf_access_path = None
    doc_for_mapping = None  # fitz doc opened for coordinate mapping, closed in finally
    try:
        if isinstance(filepath_or_stream, str):
            original_pdf_access_path = filepath_or_stream
            print(f"Analyzer: Input is a string path: {original_pdf_access_path}")
        # Non-string input (e.g. a raw byte stream) is deliberately not
        # handled here: app.py writes streams to a temp file and passes the
        # path, so anything else fails the existence check below.
        if not original_pdf_access_path or not os.path.exists(original_pdf_access_path):
            return {"error": f"PDF path '{original_pdf_access_path}' (derived from input) does not exist or is invalid."}, None

        # 1. Plain text from the unmodified PDF (feeds the content checks).
        print(f"Analyzer: Extracting plain text from original PDF: {original_pdf_access_path}")
        raw_unfiltered_plain_text = extract_plain_text_from_original_pdf(original_pdf_access_path)
        pdf_size = os.path.getsize(original_pdf_access_path)
        if not raw_unfiltered_plain_text and pdf_size > 0:
            print("Analyzer: Warning: Raw unfiltered plain text extraction yielded empty result. PDF might be image-based or have extraction issues.")
        # Collapse all whitespace/newlines so regexes see one flat string.
        cleaned_unfiltered_plain_text = re.sub(r'\s+', ' ', raw_unfiltered_plain_text.replace('\n', ' ')).strip()

        # 2. Font-filtered markdown (majority font only) for LanguageTool.
        print(f"Analyzer: Extracting font-filtered markdown from: {original_pdf_access_path}")
        markdown_text_from_filtered_pdf = extract_majority_font_text_directly(original_pdf_access_path)
        if not markdown_text_from_filtered_pdf and pdf_size > 0:
            print("Analyzer: Warning: Font-filtered Markdown extraction yielded empty result.")

        # 3. Document-level checks against the flattened plain text.
        document_check_results = {
            "metadata": check_metadata(cleaned_unfiltered_plain_text),
            "disclosures": check_disclosures(cleaned_unfiltered_plain_text),
            "figures_and_tables": check_figures_and_tables(cleaned_unfiltered_plain_text),
            "references_summary": check_references_summary(cleaned_unfiltered_plain_text),
            "structure": check_structure(cleaned_unfiltered_plain_text),
            "figure_order_analysis": check_figure_order(cleaned_unfiltered_plain_text),
            "reference_order_analysis": check_reference_order(cleaned_unfiltered_plain_text),
            "plain_language_summary_present": bool(re.search(r'plain language summary', cleaned_unfiltered_plain_text, re.IGNORECASE)),
            "readability_issues_detected": False,
        }

        print("Analyzer: Performing regex checks...")
        regex_report = perform_regex_checks(cleaned_unfiltered_plain_text)
        if "error" in regex_report:
            print(f"Analyzer: Error in regex checks: {regex_report['error']}")
        regex_issues = regex_report.get("issues_list", [])

        print("Analyzer: Performing language checks...")
        lt_report = perform_language_checks(markdown_text_from_filtered_pdf)
        if "error" in lt_report:
            print(f"Analyzer: Error in LanguageTool checks: {lt_report['error']}")
        lt_issues = lt_report.get("issues_list", [])

        detailed_issues_for_mapping = regex_issues + lt_issues

        # 4. Coordinate mapping against the original PDF.  Mapping failures
        # are logged but never fatal -- unmapped issues are still reported.
        if detailed_issues_for_mapping:
            try:
                doc_for_mapping = fitz.open(original_pdf_access_path)
                if doc_for_mapping.page_count > 0:
                    print(f"Analyzer: Mapping {len(detailed_issues_for_mapping)} issues to PDF coordinates...")
                    _map_issues_to_pdf_coordinates(doc_for_mapping, detailed_issues_for_mapping)
                    total_mapped = sum(1 for iss in detailed_issues_for_mapping if iss.get('is_mapped_to_pdf'))
                    print(f"Analyzer: Finished coordinate mapping. Mapped issues: {total_mapped}/{len(detailed_issues_for_mapping)}.")
            except Exception as e_map:
                print(f"Analyzer: Error during PDF coordinate mapping: {e_map}\n{traceback.format_exc()}")
                # doc_for_mapping (if opened) is closed in the finally block.
        else:
            print("Analyzer: No detailed issues from regex or language checks to map.")

        # 5. Shape every issue into the public result schema.
        final_formatted_issues_list = [
            _format_issue_for_output(issue) for issue in detailed_issues_for_mapping
        ]

        results = {
            "issues": final_formatted_issues_list,
            "document_checks": document_check_results,
        }
        return results, None
    except Exception as e:
        print(f"Overall analysis error in analyze_pdf: {e}\n{traceback.format_exc()}")
        return {"error": f"Overall analysis error: {str(e)}"}, None
    finally:
        # Close only resources opened *within* analyze_pdf.  Any temp input
        # file created by app.py is owned and cleaned up by app.py.
        if doc_for_mapping:
            doc_for_mapping.close()
            print("Analyzer: Closed fitz document used for mapping.")