samyak152002 committed on
Commit
3770ab0
·
verified ·
1 Parent(s): 82c3ba5

Update language_checker.py

Browse files
Files changed (1) hide show
  1. language_checker.py +5 -13
language_checker.py CHANGED
@@ -68,8 +68,8 @@ def perform_language_checks(markdown_text_from_filtered_pdf: str) -> Dict[str, A
68
  continue
69
  lt_issues_in_range += 1
70
 
71
- # Current context extraction:
72
- # context_str = text_for_lt_analysis[match.offset : match.offset + match.errorLength]
73
 
74
  # New context extraction for ~10 words:
75
  words_around = 1 # Number of words to try and get on each side
@@ -78,9 +78,6 @@ def perform_language_checks(markdown_text_from_filtered_pdf: str) -> Dict[str, A
78
  pre_error_text = text_for_lt_analysis[:match.offset]
79
  words_before = pre_error_text.split()[-words_around:]
80
 
81
- # Text of the error itself
82
- error_text = text_for_lt_analysis[match.offset: match.offset + match.errorLength]
83
-
84
  # Text after the error
85
  post_error_text = text_for_lt_analysis[match.offset + match.errorLength:]
86
  words_after = post_error_text.split()[:words_around]
@@ -89,23 +86,18 @@ def perform_language_checks(markdown_text_from_filtered_pdf: str) -> Dict[str, A
89
  context_parts = []
90
  if words_before:
91
  context_parts.append(" ".join(words_before))
92
- context_parts.append(error_text) # The actual error phrase
93
  if words_after:
94
  context_parts.append(" ".join(words_after))
95
 
96
  wider_context_str = " ".join(context_parts)
97
- # Ensure there's a small buffer around the error to make it ~10 words total if error is short
98
- # This can be refined further based on average word length or by counting words more precisely.
99
- # A simpler approach using character offsets could also be used, e.g.:
100
- # context_start_char = max(0, match.offset - 50) # Approx 50 chars before
101
- # context_end_char = min(len(text_for_lt_analysis), match.offset + match.errorLength + 50) # Approx 50 chars after
102
- # wider_context_str = text_for_lt_analysis[context_start_char:context_end_char]
103
 
104
  processed_lt_issues.append({
105
  '_internal_id': f"lt_{idx}",
106
  'ruleId': match.ruleId,
107
  'message': match.message,
108
- 'context_text': wider_context_str, # Use the new wider context
 
109
  'offset_in_text': match.offset,
110
  'error_length': match.errorLength,
111
  'replacements_suggestion': match.replacements[:3] if match.replacements else [],
 
68
  continue
69
  lt_issues_in_range += 1
70
 
71
+ # Text of the error itself
72
+ error_text_verbatim = match.matchedText # The actual text that LanguageTool flagged
73
 
74
  # New context extraction for ~10 words:
75
  words_around = 1 # Number of words to try and get on each side
 
78
  pre_error_text = text_for_lt_analysis[:match.offset]
79
  words_before = pre_error_text.split()[-words_around:]
80
 
 
 
 
81
  # Text after the error
82
  post_error_text = text_for_lt_analysis[match.offset + match.errorLength:]
83
  words_after = post_error_text.split()[:words_around]
 
86
  context_parts = []
87
  if words_before:
88
  context_parts.append(" ".join(words_before))
89
+ context_parts.append(error_text_verbatim) # The actual error phrase
90
  if words_after:
91
  context_parts.append(" ".join(words_after))
92
 
93
  wider_context_str = " ".join(context_parts)
 
 
 
 
 
 
94
 
95
  processed_lt_issues.append({
96
  '_internal_id': f"lt_{idx}",
97
  'ruleId': match.ruleId,
98
  'message': match.message,
99
+ 'context_text': wider_context_str,
100
+ 'error_text_verbatim': error_text_verbatim, # Store the verbatim error text
101
  'offset_in_text': match.offset,
102
  'error_length': match.errorLength,
103
  'replacements_suggestion': match.replacements[:3] if match.replacements else [],