# pdf_processing.py
import fitz  # PyMuPDF
import pymupdf4llm
import os
import traceback
from typing import Any, Dict, List # Use standard List, Dict
from collections import Counter

def convert_rect_to_dict(rect: fitz.Rect) -> Dict[str, float] | None:
    """Turn a fitz.Rect into a plain dictionary of its geometry.

    Returns a dict with the keys "x0", "y0", "x1", "y1", "width" and
    "height" read from the rectangle's attributes of the same names.
    Returns None when ``rect`` is falsy (e.g. None) or is not a
    ``fitz.Rect`` instance.
    """
    # NOTE(review): a Rect that PyMuPDF itself considers falsy is rejected
    # here as well -- confirm that this is the intended behaviour.
    usable = rect and isinstance(rect, fitz.Rect)
    if not usable:
        # Invalid input is rejected silently; logging every bad rect is noisy.
        return None

    attribute_names = ("x0", "y0", "x1", "y1", "width", "height")
    return {name: getattr(rect, name) for name in attribute_names}

def try_map_issues_to_page_rects(
    issues_to_map_for_context: List[Dict[str, Any]],
    pdf_rects: List[fitz.Rect],
    page_number_for_mapping: int
) -> int:
    """Attach PDF rectangles to issues by position and count the new mappings.

    The i-th issue is paired with the i-th rectangle; pairing stops at the
    end of the shorter list. An issue whose 'is_mapped_to_pdf' entry is
    already truthy is left untouched (its positional rectangle is not
    reused for another issue). For every other issue whose rectangle
    converts successfully, the issue dict is updated in place with:

    * 'pdf_coordinates_list'  -- a one-element list holding the rect dict
    * 'is_mapped_to_pdf'      -- True
    * 'mapped_page_number'    -- ``page_number_for_mapping``

    Returns the number of issues mapped by this call.
    """
    newly_mapped = 0
    for issue, rect in zip(issues_to_map_for_context, pdf_rects):
        if issue['is_mapped_to_pdf']:
            continue

        coordinates = convert_rect_to_dict(rect)
        if not coordinates:
            # Rectangle could not be converted; leave the issue unmapped.
            continue

        issue['pdf_coordinates_list'] = [coordinates]
        issue['is_mapped_to_pdf'] = True
        issue['mapped_page_number'] = page_number_for_mapping
        newly_mapped += 1
    return newly_mapped


import fitz  # PyMuPDF
import os
import traceback
from typing import Any, Dict, List
from collections import Counter


# NOTE: the helper functions (convert_rect_to_dict, try_map_issues_to_page_rects) are defined earlier in this module.

import fitz  # PyMuPDF
import os
import traceback
from typing import Any, Dict, List  # Use standard List, Dict
from collections import Counter


# NOTE: the helpers above (convert_rect_to_dict, etc.) share this module's scope with the extraction functions below, so other parts of the code can use them from here.

def extract_majority_font_text_directly(pdf_path: str) -> str:
    """
    Extracts text from PDF, identifies the majority font and size,
    and then directly assembles a plain text string containing only the text
    that matches this majority font, attempting to preserve basic structure.
    This method does NOT create an intermediate PDF document.
    """
    original_doc = None
    try:
        # 1. Open PDF and Perform Font Analysis (similar to before)
        original_doc = fitz.open(pdf_path)
        if not original_doc.page_count:
            print("FontFilter (Direct): PDF has no pages.")
            return ""

        font_char_counts: Counter = Counter()
        pdf_basename = os.path.basename(pdf_path)
        print(f"FontFilter (Direct): Analyzing fonts in '{pdf_basename}' ({original_doc.page_count} pages)...")

        # First pass: Analyze fonts to find the majority
        for page_num_analysis in range(original_doc.page_count):
            page_analysis = original_doc[page_num_analysis]
            # Using TEXTFLAGS_TEXT for potentially cleaner text from spans
            text_dict_analysis = page_analysis.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)
            for block_analysis in text_dict_analysis.get("blocks", []):
                if block_analysis.get("type") == 0:  # type 0 is a text block
                    for line_analysis in block_analysis.get("lines", []):
                        for span_analysis in line_analysis.get("spans", []):
                            font_name = span_analysis["font"]
                            font_size = span_analysis.get("size")
                            if font_size is None: continue  # Skip if size is not available

                            font_size_rounded = int(round(font_size))
                            text = span_analysis["text"]
                            if not text.strip(): continue  # Skip purely whitespace spans

                            font_char_counts[(font_name, font_size_rounded)] += len(text)

        if not font_char_counts:
            print("FontFilter (Direct): No text with font information found in PDF.")
            return ""

        majority_font_tuple_info = font_char_counts.most_common(1)[0]
        (majority_font_name, majority_font_size_rounded) = majority_font_tuple_info[0]
        char_count_for_majority = majority_font_tuple_info[1]
        print(
            f"FontFilter (Direct): Majority font identified: Name='{majority_font_name}', RoundedSize={majority_font_size_rounded}pt ({char_count_for_majority} chars).")

        # 2. Second Pass: Extract and Assemble Text Based on Majority Font
        print(
            f"FontFilter (Direct): Extracting text matching majority font (Name='{majority_font_name}', RoundedSize={majority_font_size_rounded}pt)...")
        all_pages_collected_text = []  # List to hold text from each page (as a list of block texts)

        for page_num_extraction in range(original_doc.page_count):
            page = original_doc[page_num_extraction]
            # Using flags for potentially better whitespace and ligature handling in extracted text
            text_page_dict = page.get_text("dict",
                                           flags=fitz.TEXTFLAGS_TEXT | fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE)

            page_blocks_text_parts = []  # Collect text from blocks on this page

            for block in text_page_dict.get("blocks", []):
                if block.get("type") == 0:  # Text block
                    current_block_lines_text_parts = []
                    for line in block.get("lines", []):
                        current_line_spans_text_parts = []
                        for span in line.get("spans", []):
                            # Check if this span matches the majority font
                            current_span_font_name = span["font"]
                            current_span_font_size = span.get("size")

                            if current_span_font_size is not None and \
                                    current_span_font_name == majority_font_name and \
                                    int(round(current_span_font_size)) == majority_font_size_rounded:
                                current_line_spans_text_parts.append(span["text"])

                        if current_line_spans_text_parts:
                            # Join text from selected spans within a line with a single space
                            line_text = " ".join(current_line_spans_text_parts)
                            current_block_lines_text_parts.append(line_text)

                    if current_block_lines_text_parts:
                        # Join lines within a block with a single newline
                        block_text = "\n".join(current_block_lines_text_parts)
                        page_blocks_text_parts.append(block_text)

            if page_blocks_text_parts:
                # Join blocks on a page with a double newline (simulating paragraph breaks)
                all_pages_collected_text.append("\n\n".join(page_blocks_text_parts))

        if not all_pages_collected_text:
            print("FontFilter (Direct): No text matching the majority font was found to extract.")
            return ""

        # Join text from all pages.
        # A page break is already handled by the \n\n between blocks of different pages.
        # If more distinct page separation is needed, a custom separator could be added here.
        final_text = "\n\n".join(all_pages_collected_text)
        print(f"FontFilter (Direct): Successfully extracted text. Total length: {len(final_text)} characters.")
        return final_text

    except Exception as e:
        print(f"Error in extract_majority_font_text_directly for '{pdf_path}': {e}\n{traceback.format_exc()}")
        return ""
    finally:
        if original_doc: original_doc.close()


def extract_plain_text_from_original_pdf(pdf_path: str) -> str:
    """
    Extract the raw plain text of every page of a PDF, without any filtering.

    The "text" output of each page is concatenated in page order with no
    extra separator between pages.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated text of all pages, or "" if the file cannot be
        opened or read (the error and its traceback are printed, not raised).
    """
    doc_orig_text = None
    try:
        doc_orig_text = fitz.open(pdf_path)
        # The per-page texts are no longer printed: that was a debugging
        # leftover that dumped the entire document content to stdout.
        return "".join(page.get_text("text") for page in doc_orig_text)
    except Exception as e:
        print(f"Error extracting plain text from original PDF '{pdf_path}': {e}\n{traceback.format_exc()}")
        return ""
    finally:
        # Compare against None explicitly: a PyMuPDF Document's truthiness
        # follows len(doc) (its page count), so `if doc_orig_text:` would
        # never close a document that has zero pages.
        if doc_orig_text is not None:
            doc_orig_text.close()