code alignment

#2
by samyak152002 - opened
Files changed (2) hide show
  1. README.md +1 -1
  2. app.py +23 -78
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🏆
4
  colorFrom: pink
5
  colorTo: purple
6
  sdk: gradio
7
- sdk_version: 5.31.0
8
  app_file: app.py
9
  pinned: false
10
  ---
 
4
  colorFrom: pink
5
  colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 5.29.0
8
  app_file: app.py
9
  pinned: false
10
  ---
app.py CHANGED
@@ -19,10 +19,6 @@ import gradio as gr
19
  # Set JAVA_HOME environment variable (from target script)
20
  os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
21
 
22
- global_constants = {
23
- "CONTEXT_LENGTH" : 3
24
- }
25
-
26
 
27
  # --- Functions for PDF to Markdown to Plain Text ---
28
  def convert_markdown_to_plain_text(markdown_text: str) -> str:
@@ -179,7 +175,6 @@ def check_structure(plain_text: str) -> Dict[str, bool]:
179
  "abstract_structure": "structured abstract" in text_lower
180
  }
181
 
182
-
183
  def check_language_issues_and_regex(markdown_text_from_pdf: str) -> Dict[str, Any]:
184
  """
185
  Performs LanguageTool and specific regex checks on text derived from PDF's Markdown.
@@ -233,31 +228,20 @@ def check_language_issues_and_regex(markdown_text_from_pdf: str) -> Dict[str, An
233
 
234
  tool = None
235
  processed_issues: List[Dict[str, Any]] = []
236
-
237
  try:
238
-
239
- tool = language_tool_python.LanguageTool('en-US')
240
- print(text_for_analysis)
241
  raw_lt_matches = tool.check(text_for_analysis)
242
-
243
- # Define a set of rule IDs to ignore
244
- rules_to_ignore = {
245
- "EN_SPLIT_WORDS_HYPHEN", # Existing rule to ignore
246
- "MORFOLOGIK_RULE_EN_US" # New rule to ignore for spelling mistakes
247
- }
248
-
249
  lt_issues_in_range = 0
250
  for idx, match in enumerate(raw_lt_matches):
251
- # Skip if the ruleId is in our set of ignored rules
252
- if match.ruleId in rules_to_ignore:
253
- continue
254
-
255
  # Filter by content boundaries
256
  if not (content_start_index <= match.offset < content_end_index):
257
  continue
258
  lt_issues_in_range +=1
259
 
260
- context_str = text_for_analysis[match.offset - global_constants["CONTEXT_LENGTH"] : match.offset + match.errorLength + global_constants["CONTEXT_LENGTH"]]
261
  processed_issues.append({
262
  '_internal_id': f"lt_{idx}",
263
  'ruleId': match.ruleId,
@@ -314,69 +298,30 @@ def check_language_issues_and_regex(markdown_text_from_pdf: str) -> Dict[str, An
314
 
315
  def check_figure_order(plain_text: str) -> Dict[str, Any]:
316
  figure_pattern = r'(?:Fig(?:ure)?\.?|Figure)\s*(\d+)'
317
- # Find all matches; re.IGNORECASE ensures "figure", "Figure", "FIGURE" are caught
318
- figure_references_raw = re.findall(figure_pattern, plain_text, re.IGNORECASE)
319
-
320
- # Convert captured numbers (group 1 of the regex) to integers
321
- # Only include if the captured string is indeed a digit.
322
- valid_figure_numbers_int: List[int] = []
323
- for num_str in figure_references_raw:
324
  if num_str.isdigit():
325
  valid_figure_numbers_int.append(int(num_str))
326
- # else:
327
- # Optional: log or handle non-digit captures if the regex could allow them
328
- # print(f"Warning: Figure regex captured non-digit '{num_str}'")
329
-
330
- if not valid_figure_numbers_int:
331
- # No valid figure references found in the text
332
- return {
333
- "sequential_order_of_unique_figures": True, # Vacuously true as no figures to be out of order
334
- "figure_count_unique": 0,
335
- "missing_figures_in_sequence_to_max": [],
336
- "figure_order_as_encountered": [],
337
- "duplicate_references_to_same_figure_number": [],
338
- "figures_mentioned_only_once": [] # New: No figures, so none are mentioned only once
339
- }
340
 
341
- # Get unique figure numbers, sorted
342
- unique_sorted_figures: List[int] = sorted(list(set(valid_figure_numbers_int)))
343
-
344
- # Check 1: Are the unique, sorted figures consecutive?
345
- # e.g., [1, 2, 3] is sequential. [1, 3] is not. [2, 3, 4] is sequential by this definition.
346
- is_sequential = True # Assume true initially
347
- if len(unique_sorted_figures) > 1: # Only check if there's more than one unique figure
348
- is_sequential = all(unique_sorted_figures[i] + 1 == unique_sorted_figures[i+1]
349
- for i in range(len(unique_sorted_figures) - 1))
350
-
351
- # Check 2: Missing figures in the sequence from 1 up to the highest figure number mentioned.
352
- # This assumes figures should ideally start from 1 and be continuous up to the max.
353
- missing_figures: List[int] = []
354
- # max_found_figure will not error as unique_sorted_figures is non-empty at this point
355
- max_found_figure = unique_sorted_figures[-1] # Since it's sorted and non-empty
356
- expected_figures_up_to_max = set(range(1, max_found_figure + 1))
357
- actual_figures_found_set = set(unique_sorted_figures)
358
- missing_figures = sorted(list(expected_figures_up_to_max - actual_figures_found_set))
359
-
360
- # Check 3: Count occurrences of each figure reference for duplicates and single mentions
361
  counts = Counter(valid_figure_numbers_int)
362
-
363
- # Figures mentioned more than once (duplicates in terms of referencing the same figure number)
364
- duplicate_refs: List[int] = sorted([num for num, count in counts.items() if count > 1])
365
-
366
- # New Check: Figures mentioned exactly once.
367
- # The requirement is "each figure should have atleast more than 1 mention".
368
- # So, if a figure's mention count is 1, it fails this condition.
369
- figures_mentioned_only_once: List[int] = sorted([
370
- num for num, count in counts.items() if count == 1
371
- ])
372
-
373
  return {
374
- "sequential_order_of_unique_figures": is_sequential,
375
  "figure_count_unique": len(unique_sorted_figures),
376
- "missing_figures_in_sequence_to_max": missing_figures,
377
- "figure_order_as_encountered": valid_figure_numbers_int, # Original list of all found figure numbers in order of appearance
378
- "duplicate_references_to_same_figure_number": duplicate_refs,
379
- "figures_mentioned_only_once": figures_mentioned_only_once # NEWLY ADDED
380
  }
381
 
382
  def check_reference_order(plain_text: str) -> Dict[str, Any]:
 
19
  # Set JAVA_HOME environment variable (from target script)
20
  os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
21
 
 
 
 
 
22
 
23
  # --- Functions for PDF to Markdown to Plain Text ---
24
  def convert_markdown_to_plain_text(markdown_text: str) -> str:
 
175
  "abstract_structure": "structured abstract" in text_lower
176
  }
177
 
 
178
  def check_language_issues_and_regex(markdown_text_from_pdf: str) -> Dict[str, Any]:
179
  """
180
  Performs LanguageTool and specific regex checks on text derived from PDF's Markdown.
 
228
 
229
  tool = None
230
  processed_issues: List[Dict[str, Any]] = []
 
231
  try:
232
+ tool = language_tool_python.LanguageTool('en-US')
 
 
233
  raw_lt_matches = tool.check(text_for_analysis)
234
+
 
 
 
 
 
 
235
  lt_issues_in_range = 0
236
  for idx, match in enumerate(raw_lt_matches):
237
+ if match.ruleId == "EN_SPLIT_WORDS_HYPHEN": continue
238
+
 
 
239
  # Filter by content boundaries
240
  if not (content_start_index <= match.offset < content_end_index):
241
  continue
242
  lt_issues_in_range +=1
243
 
244
+ context_str = text_for_analysis[match.offset : match.offset + match.errorLength]
245
  processed_issues.append({
246
  '_internal_id': f"lt_{idx}",
247
  'ruleId': match.ruleId,
 
298
 
299
  def check_figure_order(plain_text: str) -> Dict[str, Any]:
300
  figure_pattern = r'(?:Fig(?:ure)?\.?|Figure)\s*(\d+)'
301
+ figure_references_str = re.findall(figure_pattern, plain_text, re.IGNORECASE)
302
+
303
+ valid_figure_numbers_int = []
304
+ for num_str in figure_references_str:
 
 
 
305
  if num_str.isdigit():
306
  valid_figure_numbers_int.append(int(num_str))
307
+
308
+ unique_sorted_figures = sorted(list(set(valid_figure_numbers_int)))
309
+ is_sequential = all(unique_sorted_figures[i] + 1 == unique_sorted_figures[i+1] for i in range(len(unique_sorted_figures)-1))
 
 
 
 
 
 
 
 
 
 
 
310
 
311
+ missing_figures = []
312
+ if unique_sorted_figures:
313
+ expected_figures = set(range(1, max(unique_sorted_figures) + 1))
314
+ missing_figures = sorted(list(expected_figures - set(unique_sorted_figures)))
315
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
316
  counts = Counter(valid_figure_numbers_int)
317
+ duplicate_refs = [num for num, count in counts.items() if count > 1]
318
+
 
 
 
 
 
 
 
 
 
319
  return {
320
+ "sequential_order_of_unique_figures": is_sequential,
321
  "figure_count_unique": len(unique_sorted_figures),
322
+ "missing_figures_in_sequence_to_max": missing_figures,
323
+ "figure_order_as_encountered": valid_figure_numbers_int,
324
+ "duplicate_references_to_same_figure_number": duplicate_refs
 
325
  }
326
 
327
  def check_reference_order(plain_text: str) -> Dict[str, Any]: