Spaces:

samyak152002
/

texmetrics-regex-checks-gradio-1

Running

App Files Community

code alignment

by samyak152002 - opened May 13

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

+24

-79

This PR is in draft mode

Files changed (2) hide show

README.md +1 -1
app.py +23 -78

README.md CHANGED Viewed

@@ -4,7 +4,7 @@ emoji: 🏆
 colorFrom: pink
 colorTo: purple
 sdk: gradio
-sdk_version: 5.31.0
 app_file: app.py
 pinned: false
 ---

 colorFrom: pink
 colorTo: purple
 sdk: gradio
+sdk_version: 5.29.0
 app_file: app.py
 pinned: false
 ---

app.py CHANGED Viewed

@@ -19,10 +19,6 @@ import gradio as gr
 # Set JAVA_HOME environment variable (from target script)
 os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
-global_constants = {
-    "CONTEXT_LENGTH" : 3
-}
 # --- Functions for PDF to Markdown to Plain Text ---
 def convert_markdown_to_plain_text(markdown_text: str) -> str:
@@ -179,7 +175,6 @@ def check_structure(plain_text: str) -> Dict[str, bool]:
         "abstract_structure": "structured abstract" in text_lower
     }
 def check_language_issues_and_regex(markdown_text_from_pdf: str) -> Dict[str, Any]:
     """
     Performs LanguageTool and specific regex checks on text derived from PDF's Markdown.
@@ -233,31 +228,20 @@ def check_language_issues_and_regex(markdown_text_from_pdf: str) -> Dict[str, An
     tool = None
     processed_issues: List[Dict[str, Any]] = []
     try:
-        tool = language_tool_python.LanguageTool('en-US')
-        print(text_for_analysis)
         raw_lt_matches = tool.check(text_for_analysis)
-        # Define a set of rule IDs to ignore
-        rules_to_ignore = {
-            "EN_SPLIT_WORDS_HYPHEN",      # Existing rule to ignore
-            "MORFOLOGIK_RULE_EN_US"       # New rule to ignore for spelling mistakes
-        }
         lt_issues_in_range = 0
         for idx, match in enumerate(raw_lt_matches):
-            # Skip if the ruleId is in our set of ignored rules
-            if match.ruleId in rules_to_ignore:
-                continue
             # Filter by content boundaries
             if not (content_start_index <= match.offset < content_end_index):
                 continue
             lt_issues_in_range +=1
-            context_str = text_for_analysis[match.offset - global_constants["CONTEXT_LENGTH"] : match.offset + match.errorLength + global_constants["CONTEXT_LENGTH"]]
             processed_issues.append({
                 '_internal_id': f"lt_{idx}",
                 'ruleId': match.ruleId,
@@ -314,69 +298,30 @@ def check_language_issues_and_regex(markdown_text_from_pdf: str) -> Dict[str, An
 def check_figure_order(plain_text: str) -> Dict[str, Any]:
     figure_pattern = r'(?:Fig(?:ure)?\.?|Figure)\s*(\d+)'
-    # Find all matches; re.IGNORECASE ensures "figure", "Figure", "FIGURE" are caught
-    figure_references_raw = re.findall(figure_pattern, plain_text, re.IGNORECASE)
-    # Convert captured numbers (group 1 of the regex) to integers
-    # Only include if the captured string is indeed a digit.
-    valid_figure_numbers_int: List[int] = []
-    for num_str in figure_references_raw:
         if num_str.isdigit():
             valid_figure_numbers_int.append(int(num_str))
-        # else:
-            # Optional: log or handle non-digit captures if the regex could allow them
-            # print(f"Warning: Figure regex captured non-digit '{num_str}'")
-    if not valid_figure_numbers_int:
-        # No valid figure references found in the text
-        return {
-            "sequential_order_of_unique_figures": True,  # Vacuously true as no figures to be out of order
-            "figure_count_unique": 0,
-            "missing_figures_in_sequence_to_max": [],
-            "figure_order_as_encountered": [],
-            "duplicate_references_to_same_figure_number": [],
-            "figures_mentioned_only_once": [] # New: No figures, so none are mentioned only once
-        }
-    # Get unique figure numbers, sorted
-    unique_sorted_figures: List[int] = sorted(list(set(valid_figure_numbers_int)))
-    # Check 1: Are the unique, sorted figures consecutive?
-    # e.g., [1, 2, 3] is sequential. [1, 3] is not. [2, 3, 4] is sequential by this definition.
-    is_sequential = True # Assume true initially
-    if len(unique_sorted_figures) > 1: # Only check if there's more than one unique figure
-        is_sequential = all(unique_sorted_figures[i] + 1 == unique_sorted_figures[i+1]
-                            for i in range(len(unique_sorted_figures) - 1))
-    # Check 2: Missing figures in the sequence from 1 up to the highest figure number mentioned.
-    # This assumes figures should ideally start from 1 and be continuous up to the max.
-    missing_figures: List[int] = []
-    # max_found_figure will not error as unique_sorted_figures is non-empty at this point
-    max_found_figure = unique_sorted_figures[-1] # Since it's sorted and non-empty
-    expected_figures_up_to_max = set(range(1, max_found_figure + 1))
-    actual_figures_found_set = set(unique_sorted_figures)
-    missing_figures = sorted(list(expected_figures_up_to_max - actual_figures_found_set))
-    # Check 3: Count occurrences of each figure reference for duplicates and single mentions
     counts = Counter(valid_figure_numbers_int)
-    # Figures mentioned more than once (duplicates in terms of referencing the same figure number)
-    duplicate_refs: List[int] = sorted([num for num, count in counts.items() if count > 1])
-    # New Check: Figures mentioned exactly once.
-    # The requirement is "each figure should have atleast more than 1 mention".
-    # So, if a figure's mention count is 1, it fails this condition.
-    figures_mentioned_only_once: List[int] = sorted([
-        num for num, count in counts.items() if count == 1
-    ])
     return {
-        "sequential_order_of_unique_figures": is_sequential,
         "figure_count_unique": len(unique_sorted_figures),
-        "missing_figures_in_sequence_to_max": missing_figures,
-        "figure_order_as_encountered": valid_figure_numbers_int, # Original list of all found figure numbers in order of appearance
-        "duplicate_references_to_same_figure_number": duplicate_refs,
-        "figures_mentioned_only_once": figures_mentioned_only_once # NEWLY ADDED
     }
 def check_reference_order(plain_text: str) -> Dict[str, Any]:

 # Set JAVA_HOME environment variable (from target script)
 os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
 # --- Functions for PDF to Markdown to Plain Text ---
 def convert_markdown_to_plain_text(markdown_text: str) -> str:
         "abstract_structure": "structured abstract" in text_lower
     }
 def check_language_issues_and_regex(markdown_text_from_pdf: str) -> Dict[str, Any]:
     """
     Performs LanguageTool and specific regex checks on text derived from PDF's Markdown.
     tool = None
     processed_issues: List[Dict[str, Any]] = []
     try:
+        tool = language_tool_python.LanguageTool('en-US')
         raw_lt_matches = tool.check(text_for_analysis)
         lt_issues_in_range = 0
         for idx, match in enumerate(raw_lt_matches):
+            if match.ruleId == "EN_SPLIT_WORDS_HYPHEN": continue
             # Filter by content boundaries
             if not (content_start_index <= match.offset < content_end_index):
                 continue
             lt_issues_in_range +=1
+            context_str = text_for_analysis[match.offset : match.offset + match.errorLength]
             processed_issues.append({
                 '_internal_id': f"lt_{idx}",
                 'ruleId': match.ruleId,
 def check_figure_order(plain_text: str) -> Dict[str, Any]:
     figure_pattern = r'(?:Fig(?:ure)?\.?|Figure)\s*(\d+)'
+    figure_references_str = re.findall(figure_pattern, plain_text, re.IGNORECASE)
+    valid_figure_numbers_int = []
+    for num_str in figure_references_str:
         if num_str.isdigit():
             valid_figure_numbers_int.append(int(num_str))
+    unique_sorted_figures = sorted(list(set(valid_figure_numbers_int)))
+    is_sequential = all(unique_sorted_figures[i] + 1 == unique_sorted_figures[i+1] for i in range(len(unique_sorted_figures)-1))
+    missing_figures = []
+    if unique_sorted_figures:
+        expected_figures = set(range(1, max(unique_sorted_figures) + 1))
+        missing_figures = sorted(list(expected_figures - set(unique_sorted_figures)))
     counts = Counter(valid_figure_numbers_int)
+    duplicate_refs = [num for num, count in counts.items() if count > 1]
     return {
+        "sequential_order_of_unique_figures": is_sequential,
         "figure_count_unique": len(unique_sorted_figures),
+        "missing_figures_in_sequence_to_max": missing_figures,
+        "figure_order_as_encountered": valid_figure_numbers_int,
+        "duplicate_references_to_same_figure_number": duplicate_refs
     }
 def check_reference_order(plain_text: str) -> Dict[str, Any]: