File size: 10,102 Bytes
8b15528
 
 
 
 
36623de
8b15528
 
 
 
36623de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b15528
 
 
 
 
36623de
8b15528
 
 
 
 
 
 
 
 
 
 
 
 
36623de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b15528
 
 
 
36623de
8b15528
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36623de
 
8b15528
2c6cadb
 
36623de
 
 
 
 
8b15528
 
2c6cadb
 
36623de
 
2c6cadb
 
 
 
 
36623de
 
 
2c6cadb
8b15528
 
 
 
 
3770ab0
36623de
8b15528
 
 
 
 
 
2c6cadb
8b15528
 
36623de
 
 
 
 
8b15528
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
# language_checker.py
import re
import traceback
from typing import List, Dict, Any
import language_tool_python
import logging # For more persistent error messages

from text_utils import convert_markdown_to_plain_text
# config.py (setting JAVA_HOME) should be imported early in app.py

# Import SpanMarkerModel
# Optional dependency: span_marker powers acronym detection. When it is not
# installed we degrade gracefully (acronym filtering simply stays off).
try:
    from span_marker import SpanMarkerModel
except ImportError:
    SpanMarkerModel = None  # Placeholder so later references are defined
    SPAN_MARKER_AVAILABLE = False
    print("LT_Checker: Warning: span_marker library not found. Acronym filtering will be disabled.")
    print("LT_Checker: Please install it via 'pip install span_marker'")
else:
    SPAN_MARKER_AVAILABLE = True


# --- Global SpanMarker Model for Acronyms ---
# Lazy-singleton state shared by _load_span_marker_model_if_needed() and
# perform_language_checks(). The pair of flags distinguishes "never tried"
# from "tried and failed" so a failed download is not retried on every call.
_span_marker_model_acronyms = None  # loaded SpanMarkerModel instance, or None
_span_marker_model_loaded_successfully = False  # True only after from_pretrained() succeeds
_span_marker_model_load_attempted = False  # set on first load attempt, success or not

# Hugging Face model id used for acronym span detection.
SPAN_MARKER_ACRONYM_MODEL_NAME = "tomaarsen/span-marker-bert-base-uncased-acronyms"

def _load_span_marker_model_if_needed():
    """Load the SpanMarker acronym model exactly once, recording the outcome.

    Side effects only: updates the module-level flags/model reference. Safe to
    call repeatedly — after the first attempt (successful or not) it returns
    immediately. All failures are caught and logged; nothing is raised.
    """
    global _span_marker_model_acronyms, _span_marker_model_loaded_successfully, _span_marker_model_load_attempted

    # Nothing to do if the library is missing, or we already tried once.
    if not SPAN_MARKER_AVAILABLE:
        return
    if _span_marker_model_load_attempted:
        return

    _span_marker_model_load_attempted = True
    try:
        print(f"LT_Checker: Attempting to load SpanMarker model '{SPAN_MARKER_ACRONYM_MODEL_NAME}' for acronym detection...")
        # Ensure you have torch installed, or the appropriate backend for SpanMarkerModel
        _span_marker_model_acronyms = SpanMarkerModel.from_pretrained(SPAN_MARKER_ACRONYM_MODEL_NAME)
    except Exception as e:
        _span_marker_model_loaded_successfully = False
        print(f"LT_Checker: CRITICAL ERROR loading SpanMarker model '{SPAN_MARKER_ACRONYM_MODEL_NAME}': {e}")
        print(f"LT_Checker: Acronym filtering will be disabled. Please check your installation and model availability.")
        logging.error(f"Failed to load SpanMarker model '{SPAN_MARKER_ACRONYM_MODEL_NAME}': {e}", exc_info=True)
    else:
        _span_marker_model_loaded_successfully = True
        print(f"LT_Checker: SpanMarker model '{SPAN_MARKER_ACRONYM_MODEL_NAME}' loaded successfully.")

# Attempt to load the model when the module is first imported.
# This might slightly delay the initial import if the model is large.
# Failures are swallowed inside the loader; the module-level flags above
# record whether the model is usable.
_load_span_marker_model_if_needed()


def _is_text_acronym_related(text_to_check: str, acronym_entities: List[Dict[str, Any]]) -> bool:
    """
    Checks if the text_to_check contains any of the acronyms (long or short form)
    identified by the SpanMarker model.
    """
    if not acronym_entities or not text_to_check:
        return False
    
    text_to_check_lower = text_to_check.lower()
    for entity in acronym_entities:
        acronym_span = entity.get('span', '')
        if acronym_span: # Ensure span is not empty
            # Check if the identified acronym span is present in the text flagged by LanguageTool
            if acronym_span.lower() in text_to_check_lower:
                # print(f"Debug AcronymFilter: Text '{text_to_check}' (from LT) contains detected acronym '{acronym_span}'. Filtering.")
                return True
    return False


def perform_language_checks(markdown_text_from_filtered_pdf: str) -> Dict[str, Any]:
    """
    Run LanguageTool (en-US) over plain text derived from font-filtered
    Markdown and return a structured issue report.

    Processing steps:
      1. Convert the Markdown to plain text and collapse all whitespace into
         single spaces, so LanguageTool offsets index one flat string.
      2. If the SpanMarker model loaded, predict acronym entities on that
         string; LT matches whose text contains a detected acronym are dropped.
      3. Restrict reported issues to offsets between the first occurrence of
         the word "abstract" and the first "references"/"bibliography"
         (case-insensitive) in the analyzed text.

    Args:
        markdown_text_from_filtered_pdf: Markdown produced from the
            font-filtered PDF extraction.

    Returns:
        Dict with keys:
          - "total_issues": number of kept issues.
          - "issues_list": one dict per kept LanguageTool match (rule id,
            message, verbatim error text, offsets into the analyzed string,
            up to 3 suggestions, plus PDF-mapping placeholder fields).
          - "text_used_for_analysis": the exact string the offsets refer to.
        On failure, an "error" key is added and the lists are empty.
    """
    # Guard: nothing to analyze.
    if not markdown_text_from_filtered_pdf or not markdown_text_from_filtered_pdf.strip():
        print("LT_Checker: Input Markdown text is empty.")
        return {"total_issues": 0, "issues_list": [], "text_used_for_analysis": ""}

    plain_text_from_markdown = convert_markdown_to_plain_text(markdown_text_from_filtered_pdf)
    # Flatten newlines then collapse runs of whitespace: offsets below are
    # relative to this normalized single-line string.
    text_for_lt_analysis = plain_text_from_markdown.replace('\n', ' ')
    text_for_lt_analysis = re.sub(r'\s+', ' ', text_for_lt_analysis).strip()

    if not text_for_lt_analysis:
        print("LT_Checker: Plain text derived from Markdown is empty after cleaning.")
        return {"total_issues": 0, "issues_list": [], "text_used_for_analysis": ""}

    # --- Acronym Detection using SpanMarker ---
    # Best-effort: any prediction failure disables acronym filtering for this
    # run rather than failing the whole check.
    acronym_entities = []
    if _span_marker_model_loaded_successfully and _span_marker_model_acronyms:
        try:
            # print(f"LT_Checker: Running SpanMarker on text of length {len(text_for_lt_analysis)} for acronyms.")
            acronym_entities = _span_marker_model_acronyms.predict(text_for_lt_analysis)
            # if acronym_entities:
            #     print(f"LT_Checker: SpanMarker found {len(acronym_entities)} acronym entities. Examples: {[e['span'] for e in acronym_entities[:3]]}")
        except Exception as sm_e:
            print(f"LT_Checker: Error during SpanMarker prediction: {sm_e}")
            logging.warning(f"SpanMarker prediction failed: {sm_e}", exc_info=True)
            # Proceed without acronym filtering if prediction fails
            acronym_entities = []
    elif SPAN_MARKER_AVAILABLE and not _span_marker_model_loaded_successfully:
        print("LT_Checker: SpanMarker model was available but not loaded successfully. Acronym filtering disabled for this run.")


    # --- Content-range boundaries ---
    # Start at the first "abstract" (or index 0 if absent); end at the
    # earliest of "references"/"bibliography" (or end of text if absent).
    text_for_lt_analysis_lower = text_for_lt_analysis.lower()
    
    abstract_match = re.search(r'\babstract\b', text_for_lt_analysis_lower)
    content_start_index = abstract_match.start() if abstract_match else 0
    # ... (rest of abstract/references boundary logic as before) ...
    if abstract_match:
        print(f"LT_Checker: Found 'abstract' at index {content_start_index} in its text.")
    else:
        print(f"LT_Checker: Did not find 'abstract', LT analysis from index 0 of its text.")

    references_match = re.search(r'\breferences\b', text_for_lt_analysis_lower)
    bibliography_match = re.search(r'\bbibliography\b', text_for_lt_analysis_lower)
    content_end_index = len(text_for_lt_analysis)

    if references_match and bibliography_match:
        content_end_index = min(references_match.start(), bibliography_match.start())
        print(f"LT_Checker: Found 'references' at {references_match.start()} and 'bibliography' at {bibliography_match.start()}. Using {content_end_index} as end boundary.")
    elif references_match:
        content_end_index = references_match.start()
        print(f"LT_Checker: Found 'references' at {content_end_index}. Using it as end boundary.")
    elif bibliography_match:
        content_end_index = bibliography_match.start()
        print(f"LT_Checker: Found 'bibliography' at {content_end_index}. Using it as end boundary.")
    else:
        print(f"LT_Checker: Did not find 'references' or 'bibliography'. LT analysis up to end of its text (index {content_end_index}).")

    # An inverted/empty range is not an error: the offset filter below simply
    # rejects every match, so zero issues are reported.
    if content_start_index >= content_end_index:
        print(f"LT_Checker: Warning: Content start index ({content_start_index}) is not before end index ({content_end_index}) in its text. No LT issues will be reported from this range.")
    
    tool = None
    processed_lt_issues: List[Dict[str, Any]] = []
    try:
        tool = language_tool_python.LanguageTool('en-US') 
        raw_lt_matches = tool.check(text_for_lt_analysis)
        
        lt_issues_in_range = 0
        filtered_acronym_issues = 0

        for idx, match in enumerate(raw_lt_matches):
            if match.ruleId == "EN_SPLIT_WORDS_HYPHEN": continue  # Common rule to ignore

            # --- Acronym Filtering Step ---
            # NOTE: this runs before the range filter, so the acronym counter
            # also counts matches that lie outside the content range.
            if acronym_entities and _is_text_acronym_related(match.matchedText, acronym_entities):
                filtered_acronym_issues += 1
                continue # Skip this LanguageTool match as it's related to a detected acronym

            if not (content_start_index <= match.offset < content_end_index):
                continue
            lt_issues_in_range += 1

            error_text_verbatim = match.matchedText 
            # Build a short context string: up to `words_around` whole words on
            # each side of the flagged text.
            words_around = 1
            pre_error_text = text_for_lt_analysis[:match.offset]
            words_before = pre_error_text.split()[-words_around:]
            post_error_text = text_for_lt_analysis[match.offset + match.errorLength:]
            words_after = post_error_text.split()[:words_around]
            context_parts = []
            if words_before: context_parts.append(" ".join(words_before))
            context_parts.append(error_text_verbatim)
            if words_after: context_parts.append(" ".join(words_after))
            wider_context_str = " ".join(context_parts)

            # PDF-mapping fields are placeholders here; a later stage
            # presumably fills them in — TODO confirm against caller.
            processed_lt_issues.append({
                '_internal_id': f"lt_{idx}",
                'ruleId': match.ruleId,
                'message': match.message,
                'context_text': wider_context_str,
                'error_text_verbatim': error_text_verbatim,
                'offset_in_text': match.offset,
                'error_length': match.errorLength,
                'replacements_suggestion': match.replacements[:3] if match.replacements else [],
                'category_name': match.category,
                'source_check_type': 'LanguageTool',
                'is_mapped_to_pdf': False,
                'pdf_coordinates_list': [],
                'mapped_page_number': -1
            })
        
        print(f"LT_Checker: LanguageTool found {len(raw_lt_matches)} raw issues.")
        if acronym_entities:
            print(f"LT_Checker: Filtered out {filtered_acronym_issues} LT issues due to acronym detection.")
        print(f"LT_Checker: {lt_issues_in_range} LT issues within defined content range (after acronym filtering).")
        
        return {
            "total_issues": len(processed_lt_issues),
            "issues_list": processed_lt_issues,
            "text_used_for_analysis": text_for_lt_analysis 
        }
    except Exception as e:
        print(f"Error in perform_language_checks: {e}\n{traceback.format_exc()}")
        return {"error": str(e), "total_issues": 0, "issues_list": [], "text_used_for_analysis": text_for_lt_analysis}
    finally:
        # Always release the LanguageTool server/process, even on failure.
        if tool:
            tool.close()