texmetrics-regex-checks-gradio-1-devtesting

Running

App Files Files Community

samyak152002 committed on 23 days ago

Commit

b690306

verified ·

1 Parent(s): d37148b

Update main_analyzer.py

Browse files

Files changed (1) hide show

main_analyzer.py +17 -10

main_analyzer.py CHANGED Viewed

@@ -19,8 +19,6 @@ from content_analysis import (
 )
 from language_checker import perform_language_checks
 from regex_checker import perform_regex_checks
-# text_utils.convert_markdown_to_plain_text is used by language_checker
-# config.py is imported in app.py
 def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
     original_pdf_access_path = None
@@ -28,21 +26,32 @@ def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
     doc_for_mapping = None
     try:
-        if isinstance(filepath_or_stream, str):
             original_pdf_access_path = filepath_or_stream
         elif hasattr(filepath_or_stream, 'read') and callable(filepath_or_stream.read):
             with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file_obj:
                 temp_file_for_stream_path = temp_file_obj.name
-                filepath_or_stream.seek(0)
                 temp_file_obj.write(filepath_or_stream.read())
             original_pdf_access_path = temp_file_for_stream_path
-            print(f"Analyzer: Original PDF stream saved to temp file: {original_pdf_access_path}")
         else:
-            return {"error": "Invalid PDF input type. Must be path or file-like object."}, None
         if not original_pdf_access_path or not os.path.exists(original_pdf_access_path):
-             return {"error": f"PDF path '{original_pdf_access_path}' does not exist or is invalid."}, None
         # 1. Unfiltered Plain Text (for general and regex checks)
         print(f"Analyzer: Extracting plain text from original PDF: {original_pdf_access_path}")
         raw_unfiltered_plain_text = extract_plain_text_from_original_pdf(original_pdf_access_path)
@@ -124,10 +133,8 @@ def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
         for issue_data in detailed_issues_for_mapping:
             coords = issue_data.get('pdf_coordinates_list', [{}])[0] if issue_data.get('is_mapped_to_pdf') else {}
             coords_for_json = [coords.get("x0"), coords.get("y0"), coords.get("x1"), coords.get("y1")] if coords else []
-            # Filter out None coordinates that might arise from empty coords dict
             coords_for_json = [c for c in coords_for_json if c is not None]
             final_formatted_issues_list.append({
                 "message": issue_data.get('message', 'N/A'),
                 "context": issue_data.get('context_text', 'N/A'),
@@ -136,7 +143,7 @@ def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
                 "rule_id": issue_data.get('ruleId', 'N/A'),
                 "offset": issue_data.get('offset_in_text', -1),
                 "length": issue_data.get('error_length', 0),
-                "coordinates": coords_for_json if len(coords_for_json) == 4 else [], # Ensure 4 coords or empty
                 "page": issue_data.get('mapped_page_number', 0) if issue_data.get('is_mapped_to_pdf') else 0,
                 "source_check_type": issue_data.get('source_check_type', 'N/A')
             })

 )
 from language_checker import perform_language_checks
 from regex_checker import perform_regex_checks
 def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
     original_pdf_access_path = None
     doc_for_mapping = None
     try:
+        if isinstance(filepath_or_stream, str):
             original_pdf_access_path = filepath_or_stream
+            print(f"Analyzer: Input is a string path: {original_pdf_access_path}")
+        # Check for objects like Gradio's NamedString or TemporaryFileWrapper's .name attribute
+        elif hasattr(filepath_or_stream, 'name') and isinstance(getattr(filepath_or_stream, 'name'), str) and \
+             os.path.exists(getattr(filepath_or_stream, 'name')): # Ensure the .name path is valid
+            original_pdf_access_path = filepath_or_stream.name
+            print(f"Analyzer: Input is an object with .name attribute, using path: {original_pdf_access_path}")
+             # If this object also has a .read method, it might be a TemporaryFileWrapper.
+             # The next elif would handle it if we prefer processing it as a stream,
+             # but using its .name path is usually fine and simpler.
         elif hasattr(filepath_or_stream, 'read') and callable(filepath_or_stream.read):
             with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file_obj:
                 temp_file_for_stream_path = temp_file_obj.name
+                if hasattr(filepath_or_stream, 'seek') and callable(filepath_or_stream.seek):
+                    filepath_or_stream.seek(0)
                 temp_file_obj.write(filepath_or_stream.read())
             original_pdf_access_path = temp_file_for_stream_path
+            print(f"Analyzer: Input stream saved to temp file: {original_pdf_access_path}")
         else:
+            return {"error": f"Invalid PDF input type: {type(filepath_or_stream)}. Must be path string, an object with a .name attribute as path, or file-like stream object."}, None
         if not original_pdf_access_path or not os.path.exists(original_pdf_access_path):
+             return {"error": f"PDF path '{original_pdf_access_path}' (derived from input) does not exist or is invalid."}, None
+        # --- The rest of the function remains the same as the previous complete listing ---
         # 1. Unfiltered Plain Text (for general and regex checks)
         print(f"Analyzer: Extracting plain text from original PDF: {original_pdf_access_path}")
         raw_unfiltered_plain_text = extract_plain_text_from_original_pdf(original_pdf_access_path)
         for issue_data in detailed_issues_for_mapping:
             coords = issue_data.get('pdf_coordinates_list', [{}])[0] if issue_data.get('is_mapped_to_pdf') else {}
             coords_for_json = [coords.get("x0"), coords.get("y0"), coords.get("x1"), coords.get("y1")] if coords else []
             coords_for_json = [c for c in coords_for_json if c is not None]
             final_formatted_issues_list.append({
                 "message": issue_data.get('message', 'N/A'),
                 "context": issue_data.get('context_text', 'N/A'),
                 "rule_id": issue_data.get('ruleId', 'N/A'),
                 "offset": issue_data.get('offset_in_text', -1),
                 "length": issue_data.get('error_length', 0),
+                "coordinates": coords_for_json if len(coords_for_json) == 4 else [],
                 "page": issue_data.get('mapped_page_number', 0) if issue_data.get('is_mapped_to_pdf') else 0,
                 "source_check_type": issue_data.get('source_check_type', 'N/A')
             })