samyak152002 committed
Commit 8b15528 · verified · 1 Parent(s): eb20090

Create language_checker.py

Files changed (1)
  1. language_checker.py +98 -0
language_checker.py ADDED
@@ -0,0 +1,98 @@
+ # language_checker.py
+ import re
+ import traceback
+ from typing import List, Dict, Any
+ import language_tool_python
+
+ from text_utils import convert_markdown_to_plain_text
+ # config.py (setting JAVA_HOME) should be imported early in app.py
+
+ def perform_language_checks(markdown_text_from_filtered_pdf: str) -> Dict[str, Any]:
+     """
+     Performs LanguageTool checks on plain text derived from font-filtered Markdown.
+     Filters issues to only include those between "abstract" and "references/bibliography"
+     found within this specific text.
+     """
+     if not markdown_text_from_filtered_pdf or not markdown_text_from_filtered_pdf.strip():
+         print("LT_Checker: Input Markdown text is empty.")
+         return {"total_issues": 0, "issues_list": [], "text_used_for_analysis": ""}
+
+     plain_text_from_markdown = convert_markdown_to_plain_text(markdown_text_from_filtered_pdf)
+     text_for_lt_analysis = plain_text_from_markdown.replace('\n', ' ')
+     text_for_lt_analysis = re.sub(r'\s+', ' ', text_for_lt_analysis).strip()
+
+     if not text_for_lt_analysis:
+         print("LT_Checker: Plain text derived from Markdown is empty after cleaning.")
+         return {"total_issues": 0, "issues_list": [], "text_used_for_analysis": ""}
+
+     text_for_lt_analysis_lower = text_for_lt_analysis.lower()
+
+     abstract_match = re.search(r'\babstract\b', text_for_lt_analysis_lower)
+     content_start_index = abstract_match.start() if abstract_match else 0
+     if abstract_match:
+         print(f"LT_Checker: Found 'abstract' at index {content_start_index} in its text.")
+     else:
+         print("LT_Checker: Did not find 'abstract', LT analysis from index 0 of its text.")
+
+     # Determine end boundary (references or bibliography)
+     references_match = re.search(r'\breferences\b', text_for_lt_analysis_lower)
+     bibliography_match = re.search(r'\bbibliography\b', text_for_lt_analysis_lower)
+     content_end_index = len(text_for_lt_analysis)
+
+     if references_match and bibliography_match:
+         content_end_index = min(references_match.start(), bibliography_match.start())
+         print(f"LT_Checker: Found 'references' at {references_match.start()} and 'bibliography' at {bibliography_match.start()}. Using {content_end_index} as end boundary.")
+     elif references_match:
+         content_end_index = references_match.start()
+         print(f"LT_Checker: Found 'references' at {content_end_index}. Using it as end boundary.")
+     elif bibliography_match:
+         content_end_index = bibliography_match.start()
+         print(f"LT_Checker: Found 'bibliography' at {content_end_index}. Using it as end boundary.")
+     else:
+         print(f"LT_Checker: Did not find 'references' or 'bibliography'. LT analysis up to end of its text (index {content_end_index}).")
+
+     if content_start_index >= content_end_index:
+         print(f"LT_Checker: Warning: Content start index ({content_start_index}) is not before end index ({content_end_index}) in its text. No LT issues will be reported from this range.")
+
+     tool = None
+     processed_lt_issues: List[Dict[str, Any]] = []
+     try:
+         tool = language_tool_python.LanguageTool('en-US')
+         raw_lt_matches = tool.check(text_for_lt_analysis)
+
+         lt_issues_in_range = 0
+         for idx, match in enumerate(raw_lt_matches):
+             if match.ruleId == "EN_SPLIT_WORDS_HYPHEN": continue  # Common rule to ignore
+
+             if not (content_start_index <= match.offset < content_end_index):
+                 continue
+             lt_issues_in_range += 1
+
+             context_str = text_for_lt_analysis[match.offset : match.offset + match.errorLength]
+             processed_lt_issues.append({
+                 '_internal_id': f"lt_{idx}",
+                 'ruleId': match.ruleId,
+                 'message': match.message,
+                 'context_text': context_str,
+                 'offset_in_text': match.offset,
+                 'error_length': match.errorLength,
+                 'replacements_suggestion': match.replacements[:3] if match.replacements else [],
+                 'category_name': match.category,
+                 'source_check_type': 'LanguageTool',
+                 'is_mapped_to_pdf': False,
+                 'pdf_coordinates_list': [],
+                 'mapped_page_number': -1
+             })
+         print(f"LT_Checker: LanguageTool found {len(raw_lt_matches)} raw issues, {lt_issues_in_range} issues within defined content range of its text.")
+
+         return {
+             "total_issues": len(processed_lt_issues),
+             "issues_list": processed_lt_issues,
+             "text_used_for_analysis": text_for_lt_analysis
+         }
+     except Exception as e:
+         print(f"Error in perform_language_checks: {e}\n{traceback.format_exc()}")
+         return {"error": str(e), "total_issues": 0, "issues_list": [], "text_used_for_analysis": text_for_lt_analysis}
+     finally:
+         if tool:
+             tool.close()
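
For reference, below is a minimal sketch of how this checker might be invoked. The call site, the input path, and the wiring are assumptions for illustration and are not part of this commit; app.py and config.py are only referenced by the comments in the file above, and language_tool_python needs a Java runtime, which config.py is expected to make available via JAVA_HOME.

# Hypothetical call site for perform_language_checks (sketch only; the real
# app.py wiring and the input path below are assumptions, not part of this commit).
import config  # per the note in language_checker.py: sets JAVA_HOME before LanguageTool starts
from language_checker import perform_language_checks

with open("paper_filtered.md", encoding="utf-8") as f:  # assumed font-filtered Markdown output
    markdown_text = f.read()

result = perform_language_checks(markdown_text)
if result.get("error"):
    print("LanguageTool check failed:", result["error"])
else:
    print(f"{result['total_issues']} issue(s) between the abstract and references")
    for issue in result["issues_list"]:
        print(f"- [{issue['ruleId']}] {issue['message']} "
              f"(offset {issue['offset_in_text']}, "
              f"suggestions: {issue['replacements_suggestion']})")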