# language_checker.py
import re
import traceback
from typing import List, Dict, Any
import language_tool_python
from text_utils import convert_markdown_to_plain_text
# config.py (setting JAVA_HOME) should be imported early in app.py
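# A minimal sketch of what that config.py might contain (the Java path shown is an
# assumption and depends on the deployment environment):
#
#   # config.py
#   import os
#   # language_tool_python runs a local LanguageTool server on Java, so the JVM
#   # must be discoverable before LanguageTool('en-US') is instantiated anywhere.
#   os.environ.setdefault("JAVA_HOME", "/usr/lib/jvm/java-17-openjdk-amd64")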
def perform_language_checks(markdown_text_from_filtered_pdf: str) -> Dict[str, Any]:
"""
Performs LanguageTool checks on plain text derived from font-filtered Markdown.
Filters issues to only include those between "abstract" and "references/bibliography"
found within this specific text.
"""
if not markdown_text_from_filtered_pdf or not markdown_text_from_filtered_pdf.strip():
print("LT_Checker: Input Markdown text is empty.")
return {"total_issues": 0, "issues_list": [], "text_used_for_analysis": ""}
plain_text_from_markdown = convert_markdown_to_plain_text(markdown_text_from_filtered_pdf)
text_for_lt_analysis = plain_text_from_markdown.replace('\n', ' ')
text_for_lt_analysis = re.sub(r'\s+', ' ', text_for_lt_analysis).strip()
if not text_for_lt_analysis:
print("LT_Checker: Plain text derived from Markdown is empty after cleaning.")
return {"total_issues": 0, "issues_list": [], "text_used_for_analysis": ""}
text_for_lt_analysis_lower = text_for_lt_analysis.lower()
abstract_match = re.search(r'\babstract\b', text_for_lt_analysis_lower)
content_start_index = abstract_match.start() if abstract_match else 0
if abstract_match:
print(f"LT_Checker: Found 'abstract' at index {content_start_index} in its text.")
else:
print(f"LT_Checker: Did not find 'abstract', LT analysis from index 0 of its text.")
# Determine end boundary (references or bibliography)
references_match = re.search(r'\breferences\b', text_for_lt_analysis_lower)
bibliography_match = re.search(r'\bbibliography\b', text_for_lt_analysis_lower)
content_end_index = len(text_for_lt_analysis)
if references_match and bibliography_match:
content_end_index = min(references_match.start(), bibliography_match.start())
print(f"LT_Checker: Found 'references' at {references_match.start()} and 'bibliography' at {bibliography_match.start()}. Using {content_end_index} as end boundary.")
elif references_match:
content_end_index = references_match.start()
print(f"LT_Checker: Found 'references' at {content_end_index}. Using it as end boundary.")
elif bibliography_match:
content_end_index = bibliography_match.start()
print(f"LT_Checker: Found 'bibliography' at {content_end_index}. Using it as end boundary.")
else:
print(f"LT_Checker: Did not find 'references' or 'bibliography'. LT analysis up to end of its text (index {content_end_index}).")
if content_start_index >= content_end_index:
print(f"LT_Checker: Warning: Content start index ({content_start_index}) is not before end index ({content_end_index}) in its text. No LT issues will be reported from this range.")
tool = None
processed_lt_issues: List[Dict[str, Any]] = []
try:
tool = language_tool_python.LanguageTool('en-US')
raw_lt_matches = tool.check(text_for_lt_analysis)
lt_issues_in_range = 0
for idx, match in enumerate(raw_lt_matches):
if match.ruleId == "EN_SPLIT_WORDS_HYPHEN": continue # Common rule to ignore
if not (content_start_index <= match.offset < content_end_index):
continue
lt_issues_in_range += 1
            # Previous context extraction (error span only):
            # context_str = text_for_lt_analysis[match.offset : match.offset + match.errorLength]
            # Wider context extraction: keep words_around words on each side of the error
            # (increase words_around, e.g. to 5, for a roughly 10-word window).
            words_around = 1  # Number of words to try and get on each side
            # Text before the error
            pre_error_text = text_for_lt_analysis[:match.offset]
            words_before = pre_error_text.split()[-words_around:]

            # Text of the error itself
            error_text = text_for_lt_analysis[match.offset: match.offset + match.errorLength]

            # Text after the error
            post_error_text = text_for_lt_analysis[match.offset + match.errorLength:]
            words_after = post_error_text.split()[:words_around]

            # Combine to form the new wider context
            context_parts = []
            if words_before:
                context_parts.append(" ".join(words_before))
            context_parts.append(error_text)  # The actual error phrase
            if words_after:
                context_parts.append(" ".join(words_after))
            wider_context_str = " ".join(context_parts)
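            # Illustrative example (hypothetical input): with words_around = 1 and the
            # text "the quick brwn fox jumps", a match on "brwn" would yield
            # wider_context_str == "quick brwn fox".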
            # The window could be widened further to roughly 10 words when the error phrase
            # is short, either by counting words more precisely or by using character
            # offsets instead, e.g.:
            # context_start_char = max(0, match.offset - 50)  # approx. 50 chars before
            # context_end_char = min(len(text_for_lt_analysis), match.offset + match.errorLength + 50)  # approx. 50 chars after
            # wider_context_str = text_for_lt_analysis[context_start_char:context_end_char]
            processed_lt_issues.append({
                '_internal_id': f"lt_{idx}",
                'ruleId': match.ruleId,
                'message': match.message,
                'context_text': wider_context_str,  # Use the new wider context
                'offset_in_text': match.offset,
                'error_length': match.errorLength,
                'replacements_suggestion': match.replacements[:3] if match.replacements else [],
                'category_name': match.category,
                'source_check_type': 'LanguageTool',
                'is_mapped_to_pdf': False,
                'pdf_coordinates_list': [],
                'mapped_page_number': -1
            })

        print(f"LT_Checker: LanguageTool found {len(raw_lt_matches)} raw issues, {lt_issues_in_range} issues within defined content range of its text.")
        return {
            "total_issues": len(processed_lt_issues),
            "issues_list": processed_lt_issues,
            "text_used_for_analysis": text_for_lt_analysis
        }
    except Exception as e:
        print(f"Error in perform_language_checks: {e}\n{traceback.format_exc()}")
        return {"error": str(e), "total_issues": 0, "issues_list": [], "text_used_for_analysis": text_for_lt_analysis}
    finally:
        if tool:
            tool.close()
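

# Minimal usage sketch (not part of the module's API): the sample Markdown below is
# purely illustrative, and convert_markdown_to_plain_text is assumed to behave as
# imported above from text_utils.
if __name__ == "__main__":
    sample_md = (
        "# A Paper Title\n\n"
        "Abstract\n\n"
        "This are a sentence with an grammar mistake.\n\n"
        "References\n\n"
        "[1] Some citation.\n"
    )
    result = perform_language_checks(sample_md)
    print(f"Total issues: {result['total_issues']}")
    for issue in result["issues_list"]:
        print(f"- {issue['ruleId']}: {issue['message']} | context: {issue['context_text']}")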