File size: 10,602 Bytes
966e948
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
# content_analysis.py
import re
from typing import List, Dict, Any
from collections import Counter
import language_tool_python
import traceback

# Import utility from text_utils
from text_utils import convert_markdown_to_plain_text

def check_text_presence(full_text: str, search_terms: List[str]) -> Dict[str, bool]:
    """Report, for each search term, whether it occurs in the text.

    Matching is a case-insensitive substring test.

    Args:
        full_text: Text to search in.
        search_terms: Terms to look for.

    Returns:
        A dict keyed by the terms exactly as given, each mapped to ``True``
        when the term occurs in ``full_text`` and ``False`` otherwise.
    """
    haystack = full_text.lower()
    presence: Dict[str, bool] = {}
    for needle in search_terms:
        presence[needle] = needle.lower() in haystack
    return presence

def label_authors(full_text: str) -> str:
    """Prefix the presumed author line of a document with ``"Authors: "``.

    The author line is located with a regex: it is the second line of the
    first pair of consecutive lines that is immediately followed by a blank
    line (e.g. a "title / authors / blank line" layout).

    The label is inserted once, at the position of the matched line. Other
    occurrences of the same author string elsewhere in the document are left
    untouched.

    Args:
        full_text: The document text.

    Returns:
        ``full_text`` with ``"Authors: "`` inserted in front of the author
        line, or ``full_text`` unchanged when no candidate line is found or
        the candidate line is empty / whitespace-only.
    """
    author_line_regex = r"^(?:.*\n)(.*?)(?:\n\n)"
    match = re.search(author_line_regex, full_text, re.MULTILINE)
    if match is None:
        return full_text

    raw_line = match.group(1)
    authors = raw_line.strip()
    if not authors:
        # An empty capture must not be passed to str.replace: replacing ""
        # would insert the label between every character of the document.
        return full_text

    # Insert at the first non-whitespace character of the captured line so
    # any leading indentation stays in front of the label.
    leading_ws = len(raw_line) - len(raw_line.lstrip())
    insert_at = match.start(1) + leading_ws
    return f"{full_text[:insert_at]}Authors: {full_text[insert_at:]}"

def check_metadata(plain_text: str) -> Dict[str, Any]:
    """Detect basic manuscript metadata in plain text.

    Args:
        plain_text: The document as plain text.

    Returns:
        A dict with:
          * ``"author_email"``: ``True`` if something shaped like an e-mail
            address occurs anywhere in the text.
          * ``"list_of_authors"``: ``True`` if ``Author:`` / ``Authors:``
            occurs (case-insensitive).
          * ``"keywords_list"``: ``True`` if ``Keyword:`` / ``Keywords:``
            occurs (case-insensitive).
          * ``"word_count"``: number of whitespace-separated tokens, or the
            string ``"Missing"`` when the text contains none.
    """
    # The domain part accepts hyphens and any number of dot-separated labels,
    # so addresses such as "user@uni-heidelberg.de" are recognised; "+" is
    # accepted in the local part for tagged addresses.
    email_pattern = r'\b[\w.+-]+@[\w-]+(?:\.[\w-]+)+\b'
    return {
        "author_email": bool(re.search(email_pattern, plain_text)),
        "list_of_authors": bool(re.search(r'Authors?:', plain_text, re.IGNORECASE)),
        "keywords_list": bool(re.search(r'Keywords?:', plain_text, re.IGNORECASE)),
        "word_count": len(plain_text.split()) or "Missing"
    }

def check_disclosures(plain_text: str) -> Dict[str, bool]:
    """Check which disclosure statements are mentioned in the text.

    All checks are case-insensitive substring tests on the statement names.

    Args:
        plain_text: The document as plain text.

    Returns:
        A dict mapping each statement name to whether it was found. The
        ``"author contribution statement"`` entry is ``True`` when either the
        singular ("contribution") or plural ("contributions") wording occurs.
    """
    required_statements = [
        "conflict of interest statement",
        "ethics statement",
        "funding statement",
        "data access statement"
    ]
    contribution_wordings = (
        "author contribution statement",
        "author contributions statement",
    )

    found = check_text_presence(plain_text, required_statements)

    lowered = plain_text.lower()
    found["author contribution statement"] = any(
        wording in lowered for wording in contribution_wordings
    )
    return found

def check_figures_and_tables(plain_text: str) -> Dict[str, bool]:
    """Run keyword heuristics for figure/table citations and legends.

    Each check looks for ``Figure <n>`` or ``Table <n>`` followed later on the
    same line by the word "citation" or "legend" (case-insensitive).

    Args:
        plain_text: The document as plain text.

    Returns:
        A dict with boolean entries ``"figures_with_citations"``,
        ``"figures_legends"`` and ``"tables_legends"``.
    """
    heuristics = {
        "figures_with_citations": r'Figure \d+.*?citation',
        "figures_legends": r'Figure \d+.*?legend',
        "tables_legends": r'Table \d+.*?legend',
    }
    outcome: Dict[str, bool] = {}
    for label, pattern in heuristics.items():
        outcome[label] = re.search(pattern, plain_text, re.IGNORECASE) is not None
    return outcome

def check_references_summary(plain_text: str) -> Dict[str, Any]:
    """Summarise citation-related signals found in the text.

    Args:
        plain_text: The document as plain text.

    Returns:
        A dict with:
          * ``"old_references"``: ``True`` if a standalone four-digit number
            from 1900 to 1999 occurs anywhere in the text.
          * ``"citations_in_abstract"``: ``True`` if the first 2000 characters
            (used as a stand-in for the abstract) contain a ``[n]`` marker or
            the word "citation" / "cited".
          * ``"reference_count"``: number of bracketed numeric citation
            markers in the text; a grouped marker such as ``[1, 2]`` counts
            once.
          * ``"self_citations"``: ``True`` if the phrase "Self-citation"
            occurs (case-insensitive).
    """
    abstract_window = plain_text[:2000]

    abstract_citation_patterns = (r'\[\d+\]', r'\bcit(?:ation|ed)\b')
    cites_in_abstract = any(
        re.search(pattern, abstract_window, re.IGNORECASE) is not None
        for pattern in abstract_citation_patterns
    )

    has_1900s_year = re.search(r'\b19[0-9]{2}\b', plain_text) is not None
    marker_total = sum(1 for _ in re.finditer(r'\[\d+(?:,\s*\d+)*\]', plain_text))
    mentions_self_citation = re.search(r'Self-citation', plain_text, re.IGNORECASE) is not None

    return {
        "old_references": has_1900s_year,
        "citations_in_abstract": cites_in_abstract,
        "reference_count": marker_total,
        "self_citations": mentions_self_citation,
    }

def check_structure(plain_text: str) -> Dict[str, bool]:
    """Check for IMRaD section keywords and a structured-abstract mention.

    Both checks are case-insensitive substring tests over the whole text.

    Args:
        plain_text: The document as plain text.

    Returns:
        A dict with:
          * ``"imrad_structure"``: ``True`` only if "introduction", "method",
            "result" and "discussion" all occur somewhere in the text.
          * ``"abstract_structure"``: ``True`` if the phrase
            "structured abstract" occurs.
    """
    lowered = plain_text.lower()

    has_all_sections = True
    for keyword in ("introduction", "method", "result", "discussion"):
        if keyword not in lowered:
            has_all_sections = False
            break

    return {
        "imrad_structure": has_all_sections,
        "abstract_structure": "structured abstract" in lowered,
    }

def check_language_issues_and_regex(markdown_text_from_pdf: str) -> Dict[str, Any]:
    """Collect LanguageTool findings and a citation-spacing check for a document.

    The Markdown input is converted to plain text and collapsed onto a single
    line. LanguageTool ('en-US') and a regex for ``word[n]`` (missing space
    before a bracketed number) are run over that whole string. Only findings
    whose offset lies between the first occurrence of the word "abstract" and
    the first occurrence of "references" / "bibliography" are kept.

    Args:
        markdown_text_from_pdf: Markdown text of the document.

    Returns:
        A dict with ``"total_issues"``, ``"issues_list"`` and
        ``"text_used_for_analysis"`` (the single-line text that the reported
        offsets refer to; empty when there was nothing to analyse). If an
        exception is raised during analysis, the dict additionally has an
        ``"error"`` key with the exception message and an empty issue list.

        Each entry of ``"issues_list"`` is a dict with the keys
        ``_internal_id``, ``ruleId``, ``message``, ``context_text``,
        ``offset_in_text``, ``error_length``, ``replacements_suggestion``,
        ``category_name``, ``is_mapped_to_pdf``, ``pdf_coordinates_list`` and
        ``mapped_page_number``; the last three are filled with placeholder
        values (``False``, ``[]``, ``-1``) here.
    """
    if not markdown_text_from_pdf.strip():
        return {"total_issues": 0, "issues_list": [], "text_used_for_analysis": ""}

    # convert_markdown_to_plain_text comes from text_utils; presumably it
    # strips Markdown markup — confirm against that module.
    plain_text_from_markdown = convert_markdown_to_plain_text(markdown_text_from_pdf)
    # Collapse the text onto one line with single spaces; all offsets reported
    # below index into this normalised string.
    text_for_analysis = plain_text_from_markdown.replace('\n', ' ')
    text_for_analysis = re.sub(r'\s+', ' ', text_for_analysis).strip()

    if not text_for_analysis:
        return {"total_issues": 0, "issues_list": [], "text_used_for_analysis": ""}

    text_for_analysis_lower = text_for_analysis.lower()
    
    # Start boundary: first standalone "abstract" (case-insensitive), else 0.
    abstract_match = re.search(r'\babstract\b', text_for_analysis_lower)
    content_start_index = abstract_match.start() if abstract_match else 0
    if abstract_match: print(f"Found 'abstract' at index {content_start_index}")
    else: print(f"Did not find 'abstract', starting language analysis from index 0")

    # End boundary: the earlier of the first standalone "references" and the
    # first standalone "bibliography"; end of text when neither occurs.
    references_match = re.search(r'\breferences\b', text_for_analysis_lower)
    bibliography_match = re.search(r'\bbibliography\b', text_for_analysis_lower)
    content_end_index = len(text_for_analysis)

    if references_match and bibliography_match:
        content_end_index = min(references_match.start(), bibliography_match.start())
        print(f"Found 'references' at {references_match.start()} and 'bibliography' at {bibliography_match.start()}. Using {content_end_index} as end boundary.")
    elif references_match:
        content_end_index = references_match.start()
        print(f"Found 'references' at {content_end_index}. Using it as end boundary.")
    elif bibliography_match:
        content_end_index = bibliography_match.start()
        print(f"Found 'bibliography' at {content_end_index}. Using it as end boundary.")
    else:
        print(f"Did not find 'references' or 'bibliography'. Language analysis up to end of text (index {content_end_index}).")

    # Only a warning: processing continues, and the range filters below then
    # reject every match because the range is empty.
    if content_start_index >= content_end_index:
        print(f"Warning: Content start index ({content_start_index}) is not before content end index ({content_end_index}). No language issues will be reported from this range.")
    
    tool = None
    processed_issues: List[Dict[str, Any]] = []
    try:
        tool = language_tool_python.LanguageTool('en-US') 
        # The whole normalised text is checked; matches are filtered by offset afterwards.
        raw_lt_matches = tool.check(text_for_analysis)
        
        lt_issues_in_range = 0
        for idx, match in enumerate(raw_lt_matches):
            # NOTE(review): this rule is skipped unconditionally; presumably it
            # yields false positives on PDF-extracted text — confirm.
            if match.ruleId == "EN_SPLIT_WORDS_HYPHEN": continue
            # Keep only matches that start inside [content_start_index, content_end_index).
            if not (content_start_index <= match.offset < content_end_index): continue
            lt_issues_in_range +=1
            context_str = text_for_analysis[match.offset : match.offset + match.errorLength]
            processed_issues.append({
                '_internal_id': f"lt_{idx}", 'ruleId': match.ruleId, 'message': match.message,
                'context_text': context_str, 'offset_in_text': match.offset, 'error_length': match.errorLength,
                # At most the first three suggested replacements are kept.
                'replacements_suggestion': match.replacements[:3] if match.replacements else [],
                'category_name': match.category, 'is_mapped_to_pdf': False,
                'pdf_coordinates_list': [], 'mapped_page_number': -1
            })
        print(f"LanguageTool found {len(raw_lt_matches)} raw issues, {lt_issues_in_range} issues within defined content range.")
        
        # Custom check: a word directly followed by "[<digits>]" with no space in between.
        regex_pattern = r'\b(\w+)\[(\d+)\]'
        regex_matches = list(re.finditer(regex_pattern, text_for_analysis))
        
        regex_issues_in_range = 0
        for reg_idx, match in enumerate(regex_matches):
            # Same range filter as for the LanguageTool matches.
            if not (content_start_index <= match.start() < content_end_index): continue
            regex_issues_in_range += 1
            word = match.group(1); number = match.group(2)
            processed_issues.append({
                '_internal_id': f"regex_{reg_idx}", 'ruleId': "SPACE_BEFORE_BRACKET",
                'message': f"Missing space before '[' in '{word}[{number}]'. Should be '{word} [{number}]'.",
                'context_text': text_for_analysis[match.start():match.end()],
                'offset_in_text': match.start(), 'error_length': match.end() - match.start(),
                'replacements_suggestion': [f"{word} [{number}]"], 'category_name': "Formatting",
                'is_mapped_to_pdf': False, 'pdf_coordinates_list': [], 'mapped_page_number': -1
            })
        print(f"Regex check found {len(regex_matches)} raw matches, {regex_issues_in_range} issues within defined content range.")
        
        return {
            "total_issues": len(processed_issues), "issues_list": processed_issues,
            "text_used_for_analysis": text_for_analysis 
        }
    except Exception as e:
        # Any failure is reported in the result dict instead of being raised;
        # issues collected before the failure are discarded.
        print(f"Error in check_language_issues_and_regex: {e}")
        traceback.print_exc()
        return {"error": str(e), "total_issues": 0, "issues_list": [], "text_used_for_analysis": text_for_analysis}
    finally:
        # Close the LanguageTool instance on every exit path, if one was created.
        if tool: tool.close()

def check_figure_order(plain_text: str) -> Dict[str, Any]:
    """Analyse the figure numbers referenced in the text.

    References are recognised case-insensitively in the forms ``Fig 1``,
    ``Fig. 1`` and ``Figure 1`` (with or without the space).

    Args:
        plain_text: The document as plain text.

    Returns:
        A dict with:
          * ``"sequential_order_of_unique_figures"``: ``True`` if the distinct
            figure numbers, sorted, form a run of consecutive integers
            (trivially ``True`` for zero or one distinct number).
          * ``"figure_count_unique"``: how many distinct numbers were seen.
          * ``"missing_figures_in_sequence_to_max"``: sorted numbers from 1 up
            to the largest one seen that are never referenced.
          * ``"figure_order_as_encountered"``: every referenced number in text
            order, repeats included.
          * ``"duplicate_references_to_same_figure_number"``: numbers that are
            referenced more than once, in order of first appearance.
    """
    figure_pattern = r'(?:Fig(?:ure)?\.?|Figure)\s*(\d+)'
    encountered = [
        int(hit.group(1))
        for hit in re.finditer(figure_pattern, plain_text, re.IGNORECASE)
        if hit.group(1).isdigit()
    ]

    distinct = sorted(set(encountered))
    # Compare each distinct number with its successor in sorted order.
    consecutive = all(
        lower + 1 == upper for lower, upper in zip(distinct, distinct[1:])
    )

    if distinct:
        present = set(distinct)
        gaps = [n for n in range(1, distinct[-1] + 1) if n not in present]
    else:
        gaps = []

    tally = Counter(encountered)
    repeated = [number for number, hits in tally.items() if hits > 1]

    return {
        "sequential_order_of_unique_figures": consecutive,
        "figure_count_unique": len(distinct),
        "missing_figures_in_sequence_to_max": gaps,
        "figure_order_as_encountered": encountered,
        "duplicate_references_to_same_figure_number": repeated,
    }

def check_reference_order(plain_text: str) -> Dict[str, Any]:
    """Analyse the order of single-number citation markers such as ``[3]``.

    Only markers containing exactly one number are considered; grouped
    markers like ``[1, 2]`` do not match the pattern.

    Args:
        plain_text: The document as plain text.

    Returns:
        A dict with:
          * ``"max_reference_number_cited"``: largest number cited, or 0 when
            there are no markers.
          * ``"out_of_order_citations_details"``: one entry per marker whose
            number is lower than the highest number cited before it, giving
            its 1-based occurrence position, its value, the previous maximum
            and a human-readable message.
          * ``"missing_references_up_to_max_cited"``: sorted numbers from 1 to
            the maximum that are never cited.
          * ``"is_citation_order_non_decreasing_in_text"``: ``True`` if no
            marker is followed directly by a marker with a smaller number.
    """
    cited = [
        int(token)
        for token in re.findall(r'\[(\d+)\]', plain_text)
        if token.isdigit()
    ]

    violations = []
    highest_so_far = 0
    for occurrence, number in enumerate(cited, start=1):
        if number < highest_so_far:
            violations.append({
                "position_in_text_occurrences": occurrence, "value": number,
                "previous_max_value_seen": highest_so_far,
                "message": f"Reference [{number}] appeared after a higher reference [{highest_so_far}] was already cited."
            })
        else:
            highest_so_far = number

    highest = max(cited, default=0)
    cited_set = set(cited)
    # range(1, 1) is empty, so nothing is reported missing when highest is 0.
    never_cited = [n for n in range(1, highest + 1) if n not in cited_set]

    non_decreasing = all(
        earlier <= later for earlier, later in zip(cited, cited[1:])
    )

    return {
        "max_reference_number_cited": highest,
        "out_of_order_citations_details": violations,
        "missing_references_up_to_max_cited": never_cited,
        "is_citation_order_non_decreasing_in_text": non_decreasing,
    }