# language_checker.py
import re
import traceback
from typing import List, Dict, Any
import language_tool_python

from text_utils import convert_markdown_to_plain_text
# config.py (which sets JAVA_HOME) should be imported early in app.py

def perform_language_checks(markdown_text_from_filtered_pdf: str) -> Dict[str, Any]:
    """
    Performs LanguageTool checks on plain text derived from font-filtered Markdown.
    Filters issues to only include those between "abstract" and "references/bibliography"
    found within this specific text.
    """
    if not markdown_text_from_filtered_pdf or not markdown_text_from_filtered_pdf.strip():
        print("LT_Checker: Input Markdown text is empty.")
        return {"total_issues": 0, "issues_list": [], "text_used_for_analysis": ""}

    plain_text_from_markdown = convert_markdown_to_plain_text(markdown_text_from_filtered_pdf)
    text_for_lt_analysis = plain_text_from_markdown.replace('\n', ' ')
    text_for_lt_analysis = re.sub(r'\s+', ' ', text_for_lt_analysis).strip()

    if not text_for_lt_analysis:
        print("LT_Checker: Plain text derived from Markdown is empty after cleaning.")
        return {"total_issues": 0, "issues_list": [], "text_used_for_analysis": ""}

    text_for_lt_analysis_lower = text_for_lt_analysis.lower()
    
    abstract_match = re.search(r'\babstract\b', text_for_lt_analysis_lower)
    content_start_index = abstract_match.start() if abstract_match else 0
    if abstract_match:
        print(f"LT_Checker: Found 'abstract' at index {content_start_index}; starting LT analysis there.")
    else:
        print("LT_Checker: Did not find 'abstract'; starting LT analysis from index 0.")

    # Determine end boundary (references or bibliography)
    references_match = re.search(r'\breferences\b', text_for_lt_analysis_lower)
    bibliography_match = re.search(r'\bbibliography\b', text_for_lt_analysis_lower)
    content_end_index = len(text_for_lt_analysis)

    if references_match and bibliography_match:
        content_end_index = min(references_match.start(), bibliography_match.start())
        print(f"LT_Checker: Found 'references' at {references_match.start()} and 'bibliography' at {bibliography_match.start()}. Using {content_end_index} as end boundary.")
    elif references_match:
        content_end_index = references_match.start()
        print(f"LT_Checker: Found 'references' at {content_end_index}. Using it as end boundary.")
    elif bibliography_match:
        content_end_index = bibliography_match.start()
        print(f"LT_Checker: Found 'bibliography' at {content_end_index}. Using it as end boundary.")
    else:
        print(f"LT_Checker: Did not find 'references' or 'bibliography'. LT analysis up to end of its text (index {content_end_index}).")

    if content_start_index >= content_end_index:
        print(f"LT_Checker: Warning: content start index ({content_start_index}) is not before end index ({content_end_index}); no LT issues will be reported from this range.")
    
    tool = None
    processed_lt_issues: List[Dict[str, Any]] = []
    try:
        tool = language_tool_python.LanguageTool('en-US') 
        raw_lt_matches = tool.check(text_for_lt_analysis)
        
        lt_issues_in_range = 0
        for idx, match in enumerate(raw_lt_matches):
            # EN_SPLIT_WORDS_HYPHEN is frequently a false positive on text
            # extracted from PDFs, so skip it.
            if match.ruleId == "EN_SPLIT_WORDS_HYPHEN":
                continue

            if not (content_start_index <= match.offset < content_end_index):
                continue
            lt_issues_in_range += 1

            # Build a wider context of roughly ten words centered on the error,
            # rather than reporting just the flagged span itself.
            words_around = 5  # words to keep on each side of the error

            # Text before the error
            pre_error_text = text_for_lt_analysis[:match.offset]
            words_before = pre_error_text.split()[-words_around:]

            # Text of the error itself
            error_text = text_for_lt_analysis[match.offset: match.offset + match.errorLength]

            # Text after the error
            post_error_text = text_for_lt_analysis[match.offset + match.errorLength:]
            words_after = post_error_text.split()[:words_around]

            # Combine to form the new wider context
            context_parts = []
            if words_before:
                context_parts.append(" ".join(words_before))
            context_parts.append(error_text)  # The actual error phrase
            if words_after:
                context_parts.append(" ".join(words_after))

            wider_context_str = " ".join(context_parts)
            # A simpler alternative would be a fixed character window, e.g.:
            #   start = max(0, match.offset - 50)
            #   end = min(len(text_for_lt_analysis), match.offset + match.errorLength + 50)
            #   wider_context_str = text_for_lt_analysis[start:end]

            processed_lt_issues.append({
                '_internal_id': f"lt_{idx}",
                'ruleId': match.ruleId,
                'message': match.message,
                'context_text': wider_context_str,  # Use the new wider context
                'offset_in_text': match.offset,
                'error_length': match.errorLength,
                'replacements_suggestion': match.replacements[:3],  # top suggestions; [] when there are none
                'category_name': match.category,
                'source_check_type': 'LanguageTool',
                'is_mapped_to_pdf': False,
                'pdf_coordinates_list': [],
                'mapped_page_number': -1
            })
        print(f"LT_Checker: LanguageTool found {len(raw_lt_matches)} raw issues, {lt_issues_in_range} issues within defined content range of its text.")
        
        return {
            "total_issues": len(processed_lt_issues),
            "issues_list": processed_lt_issues,
            "text_used_for_analysis": text_for_lt_analysis 
        }
    except Exception as e:
        print(f"Error in perform_language_checks: {e}\n{traceback.format_exc()}")
        return {"error": str(e), "total_issues": 0, "issues_list": [], "text_used_for_analysis": text_for_lt_analysis}
    finally:
        if tool:
            tool.close()
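

# A minimal usage sketch (illustration only): "sample_markdown" below is an
# assumed input, and a real run needs Java available plus the LanguageTool
# model that language_tool_python downloads on first use.
if __name__ == "__main__":
    sample_markdown = (
        "# A Paper Title\n\n"
        "Abstract\n\n"
        "This sentence have a subject-verb agreement error.\n\n"
        "References\n\n"
        "[1] An example citation.\n"
    )
    results = perform_language_checks(sample_markdown)
    print(f"Total issues: {results['total_issues']}")
    for issue in results["issues_list"]:
        print(f"  [{issue['ruleId']}] {issue['message']} | context: {issue['context_text']!r}")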