Update language_checker.py
language_checker.py  +5 -13
@@ -68,8 +68,8 @@ def perform_language_checks(markdown_text_from_filtered_pdf: str) -> Dict[str, Any]:
             continue
         lt_issues_in_range += 1

-        #
-
+        # Text of the error itself
+        error_text_verbatim = match.matchedText  # The actual text that LanguageTool flagged

         # New context extraction for ~10 words:
         words_around = 1  # Number of words to try and get on each side
@@ -78,9 +78,6 @@ def perform_language_checks(markdown_text_from_filtered_pdf: str) -> Dict[str, Any]:
         pre_error_text = text_for_lt_analysis[:match.offset]
         words_before = pre_error_text.split()[-words_around:]

-        # Text of the error itself
-        error_text = text_for_lt_analysis[match.offset: match.offset + match.errorLength]
-
         # Text after the error
         post_error_text = text_for_lt_analysis[match.offset + match.errorLength:]
         words_after = post_error_text.split()[:words_around]
@@ -89,23 +86,18 @@ def perform_language_checks(markdown_text_from_filtered_pdf: str) -> Dict[str, Any]:
         context_parts = []
         if words_before:
             context_parts.append(" ".join(words_before))
-        context_parts.append(error_text)
+        context_parts.append(error_text_verbatim)  # The actual error phrase
         if words_after:
             context_parts.append(" ".join(words_after))

         wider_context_str = " ".join(context_parts)
-        # Ensure there's a small buffer around the error to make it ~10 words total if error is short
-        # This can be refined further based on average word length or by counting words more precisely.
-        # A simpler approach using character offsets could also be used, e.g.:
-        # context_start_char = max(0, match.offset - 50) # Approx 50 chars before
-        # context_end_char = min(len(text_for_lt_analysis), match.offset + match.errorLength + 50) # Approx 50 chars after
-        # wider_context_str = text_for_lt_analysis[context_start_char:context_end_char]

         processed_lt_issues.append({
             '_internal_id': f"lt_{idx}",
             'ruleId': match.ruleId,
             'message': match.message,
-            'context_text': wider_context_str,
+            'context_text': wider_context_str,
+            'error_text_verbatim': error_text_verbatim,  # Store the verbatim error text
             'offset_in_text': match.offset,
             'error_length': match.errorLength,
             'replacements_suggestion': match.replacements[:3] if match.replacements else [],