code alignment
#2
by
samyak152002
- opened
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: π
|
|
4 |
colorFrom: pink
|
5 |
colorTo: purple
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 5.
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
---
|
|
|
4 |
colorFrom: pink
|
5 |
colorTo: purple
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 5.29.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
---
|
app.py
CHANGED
@@ -19,10 +19,6 @@ import gradio as gr
|
|
19 |
# Set JAVA_HOME environment variable (from target script)
|
20 |
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
|
21 |
|
22 |
-
global_constants = {
|
23 |
-
"CONTEXT_LENGTH" : 3
|
24 |
-
}
|
25 |
-
|
26 |
|
27 |
# --- Functions for PDF to Markdown to Plain Text ---
|
28 |
def convert_markdown_to_plain_text(markdown_text: str) -> str:
|
@@ -179,7 +175,6 @@ def check_structure(plain_text: str) -> Dict[str, bool]:
|
|
179 |
"abstract_structure": "structured abstract" in text_lower
|
180 |
}
|
181 |
|
182 |
-
|
183 |
def check_language_issues_and_regex(markdown_text_from_pdf: str) -> Dict[str, Any]:
|
184 |
"""
|
185 |
Performs LanguageTool and specific regex checks on text derived from PDF's Markdown.
|
@@ -233,31 +228,20 @@ def check_language_issues_and_regex(markdown_text_from_pdf: str) -> Dict[str, An
|
|
233 |
|
234 |
tool = None
|
235 |
processed_issues: List[Dict[str, Any]] = []
|
236 |
-
|
237 |
try:
|
238 |
-
|
239 |
-
tool = language_tool_python.LanguageTool('en-US')
|
240 |
-
print(text_for_analysis)
|
241 |
raw_lt_matches = tool.check(text_for_analysis)
|
242 |
-
|
243 |
-
# Define a set of rule IDs to ignore
|
244 |
-
rules_to_ignore = {
|
245 |
-
"EN_SPLIT_WORDS_HYPHEN", # Existing rule to ignore
|
246 |
-
"MORFOLOGIK_RULE_EN_US" # New rule to ignore for spelling mistakes
|
247 |
-
}
|
248 |
-
|
249 |
lt_issues_in_range = 0
|
250 |
for idx, match in enumerate(raw_lt_matches):
|
251 |
-
|
252 |
-
|
253 |
-
continue
|
254 |
-
|
255 |
# Filter by content boundaries
|
256 |
if not (content_start_index <= match.offset < content_end_index):
|
257 |
continue
|
258 |
lt_issues_in_range +=1
|
259 |
|
260 |
-
context_str = text_for_analysis[match.offset
|
261 |
processed_issues.append({
|
262 |
'_internal_id': f"lt_{idx}",
|
263 |
'ruleId': match.ruleId,
|
@@ -314,69 +298,30 @@ def check_language_issues_and_regex(markdown_text_from_pdf: str) -> Dict[str, An
|
|
314 |
|
315 |
def check_figure_order(plain_text: str) -> Dict[str, Any]:
|
316 |
figure_pattern = r'(?:Fig(?:ure)?\.?|Figure)\s*(\d+)'
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
# Only include if the captured string is indeed a digit.
|
322 |
-
valid_figure_numbers_int: List[int] = []
|
323 |
-
for num_str in figure_references_raw:
|
324 |
if num_str.isdigit():
|
325 |
valid_figure_numbers_int.append(int(num_str))
|
326 |
-
|
327 |
-
|
328 |
-
|
329 |
-
|
330 |
-
if not valid_figure_numbers_int:
|
331 |
-
# No valid figure references found in the text
|
332 |
-
return {
|
333 |
-
"sequential_order_of_unique_figures": True, # Vacuously true as no figures to be out of order
|
334 |
-
"figure_count_unique": 0,
|
335 |
-
"missing_figures_in_sequence_to_max": [],
|
336 |
-
"figure_order_as_encountered": [],
|
337 |
-
"duplicate_references_to_same_figure_number": [],
|
338 |
-
"figures_mentioned_only_once": [] # New: No figures, so none are mentioned only once
|
339 |
-
}
|
340 |
|
341 |
-
|
342 |
-
unique_sorted_figures:
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
is_sequential = True # Assume true initially
|
347 |
-
if len(unique_sorted_figures) > 1: # Only check if there's more than one unique figure
|
348 |
-
is_sequential = all(unique_sorted_figures[i] + 1 == unique_sorted_figures[i+1]
|
349 |
-
for i in range(len(unique_sorted_figures) - 1))
|
350 |
-
|
351 |
-
# Check 2: Missing figures in the sequence from 1 up to the highest figure number mentioned.
|
352 |
-
# This assumes figures should ideally start from 1 and be continuous up to the max.
|
353 |
-
missing_figures: List[int] = []
|
354 |
-
# max_found_figure will not error as unique_sorted_figures is non-empty at this point
|
355 |
-
max_found_figure = unique_sorted_figures[-1] # Since it's sorted and non-empty
|
356 |
-
expected_figures_up_to_max = set(range(1, max_found_figure + 1))
|
357 |
-
actual_figures_found_set = set(unique_sorted_figures)
|
358 |
-
missing_figures = sorted(list(expected_figures_up_to_max - actual_figures_found_set))
|
359 |
-
|
360 |
-
# Check 3: Count occurrences of each figure reference for duplicates and single mentions
|
361 |
counts = Counter(valid_figure_numbers_int)
|
362 |
-
|
363 |
-
|
364 |
-
duplicate_refs: List[int] = sorted([num for num, count in counts.items() if count > 1])
|
365 |
-
|
366 |
-
# New Check: Figures mentioned exactly once.
|
367 |
-
# The requirement is "each figure should have atleast more than 1 mention".
|
368 |
-
# So, if a figure's mention count is 1, it fails this condition.
|
369 |
-
figures_mentioned_only_once: List[int] = sorted([
|
370 |
-
num for num, count in counts.items() if count == 1
|
371 |
-
])
|
372 |
-
|
373 |
return {
|
374 |
-
"sequential_order_of_unique_figures": is_sequential,
|
375 |
"figure_count_unique": len(unique_sorted_figures),
|
376 |
-
"missing_figures_in_sequence_to_max": missing_figures,
|
377 |
-
"figure_order_as_encountered": valid_figure_numbers_int,
|
378 |
-
"duplicate_references_to_same_figure_number": duplicate_refs
|
379 |
-
"figures_mentioned_only_once": figures_mentioned_only_once # NEWLY ADDED
|
380 |
}
|
381 |
|
382 |
def check_reference_order(plain_text: str) -> Dict[str, Any]:
|
|
|
19 |
# Set JAVA_HOME environment variable (from target script)
|
20 |
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
|
21 |
|
|
|
|
|
|
|
|
|
22 |
|
23 |
# --- Functions for PDF to Markdown to Plain Text ---
|
24 |
def convert_markdown_to_plain_text(markdown_text: str) -> str:
|
|
|
175 |
"abstract_structure": "structured abstract" in text_lower
|
176 |
}
|
177 |
|
|
|
178 |
def check_language_issues_and_regex(markdown_text_from_pdf: str) -> Dict[str, Any]:
|
179 |
"""
|
180 |
Performs LanguageTool and specific regex checks on text derived from PDF's Markdown.
|
|
|
228 |
|
229 |
tool = None
|
230 |
processed_issues: List[Dict[str, Any]] = []
|
|
|
231 |
try:
|
232 |
+
tool = language_tool_python.LanguageTool('en-US')
|
|
|
|
|
233 |
raw_lt_matches = tool.check(text_for_analysis)
|
234 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
235 |
lt_issues_in_range = 0
|
236 |
for idx, match in enumerate(raw_lt_matches):
|
237 |
+
if match.ruleId == "EN_SPLIT_WORDS_HYPHEN": continue
|
238 |
+
|
|
|
|
|
239 |
# Filter by content boundaries
|
240 |
if not (content_start_index <= match.offset < content_end_index):
|
241 |
continue
|
242 |
lt_issues_in_range +=1
|
243 |
|
244 |
+
context_str = text_for_analysis[match.offset : match.offset + match.errorLength]
|
245 |
processed_issues.append({
|
246 |
'_internal_id': f"lt_{idx}",
|
247 |
'ruleId': match.ruleId,
|
|
|
298 |
|
299 |
def check_figure_order(plain_text: str) -> Dict[str, Any]:
|
300 |
figure_pattern = r'(?:Fig(?:ure)?\.?|Figure)\s*(\d+)'
|
301 |
+
figure_references_str = re.findall(figure_pattern, plain_text, re.IGNORECASE)
|
302 |
+
|
303 |
+
valid_figure_numbers_int = []
|
304 |
+
for num_str in figure_references_str:
|
|
|
|
|
|
|
305 |
if num_str.isdigit():
|
306 |
valid_figure_numbers_int.append(int(num_str))
|
307 |
+
|
308 |
+
unique_sorted_figures = sorted(list(set(valid_figure_numbers_int)))
|
309 |
+
is_sequential = all(unique_sorted_figures[i] + 1 == unique_sorted_figures[i+1] for i in range(len(unique_sorted_figures)-1))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
310 |
|
311 |
+
missing_figures = []
|
312 |
+
if unique_sorted_figures:
|
313 |
+
expected_figures = set(range(1, max(unique_sorted_figures) + 1))
|
314 |
+
missing_figures = sorted(list(expected_figures - set(unique_sorted_figures)))
|
315 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
316 |
counts = Counter(valid_figure_numbers_int)
|
317 |
+
duplicate_refs = [num for num, count in counts.items() if count > 1]
|
318 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
319 |
return {
|
320 |
+
"sequential_order_of_unique_figures": is_sequential,
|
321 |
"figure_count_unique": len(unique_sorted_figures),
|
322 |
+
"missing_figures_in_sequence_to_max": missing_figures,
|
323 |
+
"figure_order_as_encountered": valid_figure_numbers_int,
|
324 |
+
"duplicate_references_to_same_figure_number": duplicate_refs
|
|
|
325 |
}
|
326 |
|
327 |
def check_reference_order(plain_text: str) -> Dict[str, Any]:
|