Samyak-Meesho committed
Commit 2c6cadb · Parent(s): b690306
changed code

Files changed:
- language_checker.py (+38 −6)
- main_analyzer.py (+3 −22)
- pdf_processing.py (+107 −65)
language_checker.py CHANGED

@@ -62,25 +62,57 @@ def perform_language_checks(markdown_text_from_filtered_pdf: str) -> Dict[str, Any]:
 
     lt_issues_in_range = 0
     for idx, match in enumerate(raw_lt_matches):
-        if match.ruleId == "EN_SPLIT_WORDS_HYPHEN": continue
+        if match.ruleId == "EN_SPLIT_WORDS_HYPHEN": continue  # Common rule to ignore
 
         if not (content_start_index <= match.offset < content_end_index):
             continue
-        lt_issues_in_range +=1
-        context_str = text_for_lt_analysis[match.offset : match.offset + match.errorLength]
+        lt_issues_in_range += 1
+
+        # Current context extraction:
+        # context_str = text_for_lt_analysis[match.offset : match.offset + match.errorLength]
+
+        # New context extraction for ~10 words:
+        words_around = 1  # Number of words to try and get on each side
+
+        # Text before the error
+        pre_error_text = text_for_lt_analysis[:match.offset]
+        words_before = pre_error_text.split()[-words_around:]
+
+        # Text of the error itself
+        error_text = text_for_lt_analysis[match.offset: match.offset + match.errorLength]
+
+        # Text after the error
+        post_error_text = text_for_lt_analysis[match.offset + match.errorLength:]
+        words_after = post_error_text.split()[:words_around]
+
+        # Combine to form the new wider context
+        context_parts = []
+        if words_before:
+            context_parts.append(" ".join(words_before))
+        context_parts.append(error_text)  # The actual error phrase
+        if words_after:
+            context_parts.append(" ".join(words_after))
+
+        wider_context_str = " ".join(context_parts)
+        # Ensure there's a small buffer around the error to make it ~10 words total if error is short.
+        # This can be refined further based on average word length or by counting words more precisely.
+        # A simpler approach using character offsets could also be used, e.g.:
+        # context_start_char = max(0, match.offset - 50)  # Approx 50 chars before
+        # context_end_char = min(len(text_for_lt_analysis), match.offset + match.errorLength + 50)  # Approx 50 chars after
+        # wider_context_str = text_for_lt_analysis[context_start_char:context_end_char]
 
         processed_lt_issues.append({
             '_internal_id': f"lt_{idx}",
             'ruleId': match.ruleId,
             'message': match.message,
-            'context_text': context_str,
+            'context_text': wider_context_str,  # Use the new wider context
             'offset_in_text': match.offset,
             'error_length': match.errorLength,
             'replacements_suggestion': match.replacements[:3] if match.replacements else [],
             'category_name': match.category,
             'source_check_type': 'LanguageTool',
             'is_mapped_to_pdf': False,
             'pdf_coordinates_list': [],
             'mapped_page_number': -1
         })
     print(f"LT_Checker: LanguageTool found {len(raw_lt_matches)} raw issues, {lt_issues_in_range} issues within defined content range of its text.")
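The change above replaces the exact-span context with a word window around each match. A minimal standalone sketch of the same word-window logic (the helper name and sample values are hypothetical, not part of the commit):

def wider_context(text: str, offset: int, length: int, words_around: int = 1) -> str:
    """Return the flagged span plus up to `words_around` words on each side."""
    words_before = text[:offset].split()[-words_around:]          # words to the left of the error
    error_text = text[offset:offset + length]                     # the flagged span itself
    words_after = text[offset + length:].split()[:words_around]   # words to the right of the error
    return " ".join([*words_before, error_text, *words_after])

sample = "This is a sentnce with a typo in it."
print(wider_context(sample, 10, 7))  # -> "a sentnce with"

Note that with words_around = 1 the window is three words wide at most; the "~10 words" in the commit's comments would require a larger words_around value.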
main_analyzer.py CHANGED

@@ -8,7 +8,7 @@ from typing import Tuple, Dict, Any, List
 from collections import defaultdict
 
 from pdf_processing import (
-
+    extract_majority_font_text_directly,
     extract_plain_text_from_original_pdf,
     try_map_issues_to_page_rects
 )

@@ -29,30 +29,10 @@ def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
     if isinstance(filepath_or_stream, str):
         original_pdf_access_path = filepath_or_stream
         print(f"Analyzer: Input is a string path: {original_pdf_access_path}")
-    # Check for objects like Gradio's NamedString or TemporaryFileWrapper's .name attribute
-    elif hasattr(filepath_or_stream, 'name') and isinstance(getattr(filepath_or_stream, 'name'), str) and \
-            os.path.exists(getattr(filepath_or_stream, 'name')):  # Ensure the .name path is valid
-        original_pdf_access_path = filepath_or_stream.name
-        print(f"Analyzer: Input is an object with .name attribute, using path: {original_pdf_access_path}")
-        # If this object also has a .read method, it might be a TemporaryFileWrapper.
-        # The next elif would handle it if we prefer processing it as a stream,
-        # but using its .name path is usually fine and simpler.
-    elif hasattr(filepath_or_stream, 'read') and callable(filepath_or_stream.read):
-        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file_obj:
-            temp_file_for_stream_path = temp_file_obj.name
-            if hasattr(filepath_or_stream, 'seek') and callable(filepath_or_stream.seek):
-                filepath_or_stream.seek(0)
-            temp_file_obj.write(filepath_or_stream.read())
-        original_pdf_access_path = temp_file_for_stream_path
-        print(f"Analyzer: Input stream saved to temp file: {original_pdf_access_path}")
-    else:
-        return {"error": f"Invalid PDF input type: {type(filepath_or_stream)}. Must be path string, an object with a .name attribute as path, or file-like stream object."}, None
 
     if not original_pdf_access_path or not os.path.exists(original_pdf_access_path):
         return {"error": f"PDF path '{original_pdf_access_path}' (derived from input) does not exist or is invalid."}, None
 
-    # --- The rest of the function remains the same as the previous complete listing ---
-    # 1. Unfiltered Plain Text (for general and regex checks)
     print(f"Analyzer: Extracting plain text from original PDF: {original_pdf_access_path}")
     raw_unfiltered_plain_text = extract_plain_text_from_original_pdf(original_pdf_access_path)

@@ -64,7 +44,8 @@ def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
 
     # 2. Font-Filtered Markdown (for LanguageTool checks)
     print(f"Analyzer: Extracting font-filtered markdown from: {original_pdf_access_path}")
-    markdown_text_from_filtered_pdf =
+    markdown_text_from_filtered_pdf = extract_majority_font_text_directly(original_pdf_access_path)
+    print("markdown font print kar raha hun", markdown_text_from_filtered_pdf)
    if not markdown_text_from_filtered_pdf and pdf_size > 0 :
         print("Analyzer: Warning: Font-filtered Markdown extraction yielded empty result.")
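With the .name and stream branches removed, analyze_pdf now only recognizes a filesystem path string; any other input presumably leaves original_pdf_access_path unset and falls through to the path-existence check, which returns the error dict. A usage sketch under that assumption (the file name is made up):

from main_analyzer import analyze_pdf

# analyze_pdf returns (results_dict, None); a path string is now the only
# input the isinstance check accepts after this commit.
results, _ = analyze_pdf("example.pdf")  # hypothetical path, not from the repo
if "error" in results:
    print(results["error"])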
pdf_processing.py CHANGED

@@ -35,90 +35,132 @@ def try_map_issues_to_page_rects(
         mapped_count += 1
     return mapped_count
 
-
-… (old function signature and surrounding removed lines were not preserved in this view)
+
+import fitz  # PyMuPDF
+import os
+import traceback
+from typing import Any, Dict, List
+from collections import Counter
+
+
+# Assuming your helper functions (convert_rect_to_dict, etc.) are present if needed elsewhere.
+
+import fitz  # PyMuPDF
+import os
+import traceback
+from typing import Any, Dict, List  # Use standard List, Dict
+from collections import Counter
+
+
+# Assuming your other helper functions (convert_rect_to_dict, etc.) are in the same scope if needed by other parts of your code.
+
+def extract_majority_font_text_directly(pdf_path: str) -> str:
     """
-    Extracts text from PDF
-
-
+    Extracts text from PDF, identifies the majority font and size,
+    and then directly assembles a plain text string containing only the text
+    that matches this majority font, attempting to preserve basic structure.
+    This method does NOT create an intermediate PDF document.
     """
     original_doc = None
-    new_doc = None
     try:
+        # 1. Open PDF and Perform Font Analysis (similar to before)
         original_doc = fitz.open(pdf_path)
         if not original_doc.page_count:
-            print("FontFilter: PDF has no pages.")
+            print("FontFilter (Direct): PDF has no pages.")
             return ""
 
-        all_spans_details: List[Dict[str, Any]] = []
         font_char_counts: Counter = Counter()
-
         pdf_basename = os.path.basename(pdf_path)
-        print(f"FontFilter: Analyzing fonts in '{pdf_basename}' ({original_doc.page_count} pages)...")
-        … (old font-analysis loop: removed lines not preserved in this view)
+        print(f"FontFilter (Direct): Analyzing fonts in '{pdf_basename}' ({original_doc.page_count} pages)...")
+
+        # First pass: Analyze fonts to find the majority
+        for page_num_analysis in range(original_doc.page_count):
+            page_analysis = original_doc[page_num_analysis]
+            # Using TEXTFLAGS_TEXT for potentially cleaner text from spans
+            text_dict_analysis = page_analysis.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)
+            for block_analysis in text_dict_analysis.get("blocks", []):
+                if block_analysis.get("type") == 0:  # type 0 is a text block
+                    for line_analysis in block_analysis.get("lines", []):
+                        for span_analysis in line_analysis.get("spans", []):
+                            font_name = span_analysis["font"]
+                            font_size = span_analysis.get("size")
+                            if font_size is None: continue  # Skip if size is not available
+
+                            font_size_rounded = int(round(font_size))
+                            text = span_analysis["text"]
+                            if not text.strip(): continue  # Skip purely whitespace spans
+
                             font_char_counts[(font_name, font_size_rounded)] += len(text)
 
         if not font_char_counts:
-            print("FontFilter: No text with font information found in PDF.")
+            print("FontFilter (Direct): No text with font information found in PDF.")
             return ""
 
         majority_font_tuple_info = font_char_counts.most_common(1)[0]
         (majority_font_name, majority_font_size_rounded) = majority_font_tuple_info[0]
-
-        print(
-        #
-        … (old filtered-document assembly: removed lines not preserved in this view)
+        char_count_for_majority = majority_font_tuple_info[1]
+        print(
+            f"FontFilter (Direct): Majority font identified: Name='{majority_font_name}', RoundedSize={majority_font_size_rounded}pt ({char_count_for_majority} chars).")
+
+        # 2. Second Pass: Extract and Assemble Text Based on Majority Font
+        print(
+            f"FontFilter (Direct): Extracting text matching majority font (Name='{majority_font_name}', RoundedSize={majority_font_size_rounded}pt)...")
+        all_pages_collected_text = []  # List to hold text from each page (as a list of block texts)
+
+        for page_num_extraction in range(original_doc.page_count):
+            page = original_doc[page_num_extraction]
+            # Using flags for potentially better whitespace and ligature handling in extracted text
+            text_page_dict = page.get_text("dict",
+                                           flags=fitz.TEXTFLAGS_TEXT | fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE)
+
+            page_blocks_text_parts = []  # Collect text from blocks on this page
+
+            for block in text_page_dict.get("blocks", []):
+                if block.get("type") == 0:  # Text block
+                    current_block_lines_text_parts = []
+                    for line in block.get("lines", []):
+                        current_line_spans_text_parts = []
+                        for span in line.get("spans", []):
+                            # Check if this span matches the majority font
+                            current_span_font_name = span["font"]
+                            current_span_font_size = span.get("size")
+
+                            if current_span_font_size is not None and \
+                               current_span_font_name == majority_font_name and \
+                               int(round(current_span_font_size)) == majority_font_size_rounded:
+                                current_line_spans_text_parts.append(span["text"])
+
+                        if current_line_spans_text_parts:
+                            # Join text from selected spans within a line with a single space
+                            line_text = " ".join(current_line_spans_text_parts)
+                            current_block_lines_text_parts.append(line_text)
+
+                    if current_block_lines_text_parts:
+                        # Join lines within a block with a single newline
+                        block_text = "\n".join(current_block_lines_text_parts)
+                        page_blocks_text_parts.append(block_text)
+
+            if page_blocks_text_parts:
+                # Join blocks on a page with a double newline (simulating paragraph breaks)
+                all_pages_collected_text.append("\n\n".join(page_blocks_text_parts))
+
+        if not all_pages_collected_text:
+            print("FontFilter (Direct): No text matching the majority font was found to extract.")
+            return ""
+
+        # Join text from all pages.
+        # A page break is already handled by the \n\n between blocks of different pages.
+        # If more distinct page separation is needed, a custom separator could be added here.
+        final_text = "\n\n".join(all_pages_collected_text)
+        print(f"FontFilter (Direct): Successfully extracted text. Total length: {len(final_text)} characters.")
+        return final_text
+
     except Exception as e:
-        print(f"Error in
+        print(f"Error in extract_majority_font_text_directly for '{pdf_path}': {e}\n{traceback.format_exc()}")
         return ""
     finally:
         if original_doc: original_doc.close()
 

@@ -129,7 +171,7 @@ def extract_plain_text_from_original_pdf(pdf_path: str) -> str:
     try:
         doc_orig_text = fitz.open(pdf_path)
         full_text_parts = [page.get_text("text") for page in doc_orig_text]
-
+        print(full_text_parts)
         return "".join(full_text_parts)
     except Exception as e:
         print(f"Error extracting plain text from original PDF '{pdf_path}': {e}\n{traceback.format_exc()}")