# main_analyzer.py
import fitz  # PyMuPDF
import os
import tempfile  # not strictly needed here; in the Gradio flow, app.py handles temp-file creation for the input
import re
import traceback
from typing import Tuple, Dict, Any, List
from collections import defaultdict

from pdf_processing import (
    extract_majority_font_text_directly,
    extract_plain_text_from_original_pdf,
    try_map_issues_to_page_rects
)
from content_analysis import (
    check_metadata, check_disclosures, check_figures_and_tables,
    check_references_summary, check_structure,
    check_figure_order, check_reference_order
)
from language_checker import perform_language_checks
from regex_checker import perform_regex_checks

def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
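    """Run all document checks on a PDF and return (results, None).

    `filepath_or_stream` is expected to be a filesystem path string in the
    Gradio flow (app.py hands over a temp-file path). The results dict has
    "issues" (regex and LanguageTool findings, mapped to PDF coordinates
    where possible) and "document_checks" (per-check summaries), or an
    "error" key if analysis fails.
    """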
    original_pdf_access_path = None
    # temp_file_for_stream_path is for a scenario where analyze_pdf itself
    # might convert a stream input to a temp file. In the Gradio flow,
    # app.py provides a path, so this remains None.
    temp_file_for_stream_path = None
    doc_for_mapping = None

    try:
        if isinstance(filepath_or_stream, str):
            original_pdf_access_path = filepath_or_stream
            print(f"Analyzer: Input is a string path: {original_pdf_access_path}")
        # NOTE: If filepath_or_stream is NOT a string (e.g., a byte stream was passed directly
        # to analyze_pdf without app.py's temp file step), then original_pdf_access_path
        # would remain None here, and the check below would fail.
        # The fix in app.py ensures original_pdf_access_path gets the temp file path.
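        # A minimal sketch of the assumed caller side (illustrative only, not
        # the actual app.py code): the uploaded bytes are written to a temp
        # file and its path is what reaches this function.
        #
        #     with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        #         tmp.write(uploaded_pdf_bytes)
        #     results, _ = analyze_pdf(tmp.name)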

        if not original_pdf_access_path or not os.path.exists(original_pdf_access_path):
             return {"error": f"PDF path '{original_pdf_access_path}' (derived from input) does not exist or is invalid."}, None

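        # 1. Plain text from the original PDF (for document-level and regex checks)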
        print(f"Analyzer: Extracting plain text from original PDF: {original_pdf_access_path}")
        raw_unfiltered_plain_text = extract_plain_text_from_original_pdf(original_pdf_access_path)

        pdf_size = os.path.getsize(original_pdf_access_path)
        if not raw_unfiltered_plain_text and pdf_size > 0:
            print("Analyzer: Warning: Raw unfiltered plain text extraction yielded empty result. PDF might be image-based or have extraction issues.")

        cleaned_unfiltered_plain_text = re.sub(r'\s+', ' ', raw_unfiltered_plain_text.replace('\n', ' ')).strip()

        # 2. Font-Filtered Markdown (for LanguageTool checks)
        print(f"Analyzer: Extracting font-filtered markdown from: {original_pdf_access_path}")
        markdown_text_from_filtered_pdf = extract_majority_font_text_directly(original_pdf_access_path)
        print("markdown font print kar raha hun", markdown_text_from_filtered_pdf) # User's debug print
        if not markdown_text_from_filtered_pdf and pdf_size > 0:
            print("Analyzer: Warning: Font-filtered Markdown extraction yielded empty result.")

        # 3. Perform all checks
        document_check_results = {
            "metadata": check_metadata(cleaned_unfiltered_plain_text),
            "disclosures": check_disclosures(cleaned_unfiltered_plain_text),
            "figures_and_tables": check_figures_and_tables(cleaned_unfiltered_plain_text),
            "references_summary": check_references_summary(cleaned_unfiltered_plain_text),
            "structure": check_structure(cleaned_unfiltered_plain_text),
            "figure_order_analysis": check_figure_order(cleaned_unfiltered_plain_text),
            "reference_order_analysis": check_reference_order(cleaned_unfiltered_plain_text),
            "plain_language_summary_present": bool(re.search(r'plain language summary', cleaned_unfiltered_plain_text, re.IGNORECASE)),
            "readability_issues_detected": False,
        }

        print("Analyzer: Performing regex checks...")
        regex_report = perform_regex_checks(cleaned_unfiltered_plain_text)
        if "error" in regex_report: print(f"Analyzer: Error in regex checks: {regex_report['error']}")
        regex_issues = regex_report.get("issues_list", [])

        print("Analyzer: Performing language checks...")
        lt_report = perform_language_checks(markdown_text_from_filtered_pdf)
        if "error" in lt_report: print(f"Analyzer: Error in LanguageTool checks: {lt_report['error']}")
        lt_issues = lt_report.get("issues_list", [])

        detailed_issues_for_mapping = regex_issues + lt_issues
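        # Each issue dict is expected to carry at least 'context_text' and
        # 'is_mapped_to_pdf'; the mapping step below (via try_map_issues_to_page_rects)
        # is expected to set 'pdf_coordinates_list' and 'mapped_page_number',
        # which the final formatting reads.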

        # 4. Coordinate Mapping (against the original PDF)
        if detailed_issues_for_mapping:
            try:
                # Use original_pdf_access_path which now holds the path to the (potentially temporary) PDF
                doc_for_mapping = fitz.open(original_pdf_access_path)
                if doc_for_mapping.page_count > 0:
                    print(f"Analyzer: Mapping {len(detailed_issues_for_mapping)} issues to PDF coordinates...")
                    # Walk every page; for each still-unmapped issue, search the page for its context text and record the match rectangles.
                    for page_idx in range(doc_for_mapping.page_count):
                        page = doc_for_mapping[page_idx]
                        current_page_num_1_based = page_idx + 1
                        unmapped_issues_on_this_page_by_context = defaultdict(list)
                        for issue_dict in detailed_issues_for_mapping:
                            if not issue_dict['is_mapped_to_pdf']:
                                unmapped_issues_on_this_page_by_context[issue_dict['context_text']].append(issue_dict)

                        if not unmapped_issues_on_this_page_by_context:
                            if all(iss['is_mapped_to_pdf'] for iss in detailed_issues_for_mapping): break
                            continue

                        for ctx_str, issues_for_ctx in unmapped_issues_on_this_page_by_context.items():
                            if not ctx_str or not ctx_str.strip(): continue
                            try:
                                pdf_rects = page.search_for(ctx_str, flags=fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE)
                                if pdf_rects:
                                    try_map_issues_to_page_rects(issues_for_ctx, pdf_rects, current_page_num_1_based)
                            except Exception as search_exc:
                                print(f"Analyzer: Warning: Error searching for context '{ctx_str[:30].replace(chr(10),' ')}' on page {current_page_num_1_based}: {search_exc}")
                    total_mapped = sum(1 for iss in detailed_issues_for_mapping if iss['is_mapped_to_pdf'])
                    print(f"Analyzer: Finished coordinate mapping. Mapped issues: {total_mapped}/{len(detailed_issues_for_mapping)}.")
            except Exception as e_map:
                print(f"Analyzer: Error during PDF coordinate mapping: {e_map}\n{traceback.format_exc()}")
            # ensure doc_for_mapping is closed in the main finally block
        else:
            print("Analyzer: No detailed issues from regex or language checks to map.")

        # 5. Format final list of issues
        final_formatted_issues_list = []
        # Flatten each issue into a JSON-friendly dict; only the first matched rectangle (if any) is exported as "coordinates".
        for issue_data in detailed_issues_for_mapping:
            coords = issue_data.get('pdf_coordinates_list', [{}])[0] if issue_data.get('is_mapped_to_pdf') else {}
            coords_for_json = [coords.get("x0"), coords.get("y0"), coords.get("x1"), coords.get("y1")] if coords else []
            coords_for_json = [c for c in coords_for_json if c is not None]

            final_formatted_issues_list.append({
                "message": issue_data.get('message', 'N/A'),
                "context": issue_data.get('context_text', 'N/A'),
                "suggestions": issue_data.get('replacements_suggestion', []),
                "category": issue_data.get('category_name', 'Unknown'),
                "rule_id": issue_data.get('ruleId', 'N/A'),
                "offset": issue_data.get('offset_in_text', -1),
                "length": issue_data.get('error_length', 0),
                "coordinates": coords_for_json if len(coords_for_json) == 4 else [],
                "page": issue_data.get('mapped_page_number', 0) if issue_data.get('is_mapped_to_pdf') else 0,
                "source_check_type": issue_data.get('source_check_type', 'N/A')
            })

        results = {
            "issues": final_formatted_issues_list,
            "document_checks": document_check_results
        }
        return results, None
    except Exception as e:
        print(f"Overall analysis error in analyze_pdf: {e}\n{traceback.format_exc()}")
        return {"error": f"Overall analysis error: {str(e)}"}, None
    finally:
        # This finally block is for resources opened *within* analyze_pdf.
        # The temp file created by app.py is managed by app.py.
        # The temp_file_for_stream_path logic was for a temp file created by analyze_pdf
        # itself if it received a stream; this isn't happening in the Gradio flow.
        if doc_for_mapping: # Ensure the fitz document for mapping is closed
            doc_for_mapping.close()
            print(f"Analyzer: Closed fitz document used for mapping.")

        # No temp-file cleanup is needed here: temp_file_for_stream_path is never
        # assigned in the current flow (app.py owns and cleans up its own temp file).
        # If analyze_pdf ever creates its own temp file for a stream input, remove it here.
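
if __name__ == "__main__":
    # Minimal local smoke test (the real entry point is app.py, which passes
    # the uploaded PDF's temp-file path). "example.pdf" is a placeholder.
    import sys
    target = sys.argv[1] if len(sys.argv) > 1 else "example.pdf"
    report, _ = analyze_pdf(target)
    if "error" in report:
        print("Analysis failed:", report["error"])
    else:
        print(f"Issues found: {len(report['issues'])}")
        print(f"Document checks run: {', '.join(report['document_checks'].keys())}")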