File size: 5,695 Bytes
966e948 fab5be2 966e948 fab5be2 966e948 fab5be2 966e948 fab5be2 966e948 fab5be2 966e948 fab5be2 966e948 fab5be2 966e948 fab5be2 966e948 fab5be2 966e948 fab5be2 966e948 fab5be2 966e948 fab5be2 966e948 fab5be2 966e948 fab5be2 966e948 fab5be2 966e948 fab5be2 966e948 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 |
# content_analysis.py
import re
from typing import List, Dict, Any
from collections import Counter
def check_text_presence(full_text: str, search_terms: List[str]) -> Dict[str, bool]:
    """Report, for each search term, whether it occurs in the text.

    Matching is a case-insensitive substring test: both the text and each
    term are lowercased before comparison.

    Args:
        full_text: Text to search in.
        search_terms: Terms to look for.

    Returns:
        Mapping from each term (exactly as passed in) to True when it was
        found and False otherwise.
    """
    haystack = full_text.lower()
    presence: Dict[str, bool] = {}
    for needle in search_terms:
        presence[needle] = needle.lower() in haystack
    return presence
def check_metadata(plain_text: str) -> Dict[str, Any]:
    """Detect basic manuscript metadata in plain text.

    Args:
        plain_text: Text to inspect.

    Returns:
        A dict with:
          * "author_email": True if something shaped like an e-mail address
            is present.
          * "list_of_authors": True if "Author:" / "Authors:" occurs
            (case-insensitive).
          * "keywords_list": True if "Keyword:" / "Keywords:" occurs
            (case-insensitive).
          * "word_count": number of whitespace-separated tokens, or the
            string "Missing" when the text contains no tokens.
    """
    word_count_val = len(plain_text.split())
    # The domain part accepts hyphens and any number of dot-separated labels,
    # so addresses such as "jane@my-lab.org" are recognised; the local part
    # additionally accepts "+" (e.g. "name+tag@host.org").
    email_pattern = r'\b[\w.+-]+@[\w-]+(?:\.[\w-]+)+\b'
    return {
        "author_email": re.search(email_pattern, plain_text) is not None,
        "list_of_authors": bool(re.search(r'Authors?:', plain_text, re.IGNORECASE)),
        "keywords_list": bool(re.search(r'Keywords?:', plain_text, re.IGNORECASE)),
        "word_count": word_count_val if word_count_val > 0 else "Missing",
    }
def check_disclosures(plain_text: str) -> Dict[str, bool]:
    """Check which disclosure statements are mentioned in the text.

    Matching is case-insensitive. Runs of whitespace (spaces, tabs, line
    breaks) are collapsed to a single space before matching, so a phrase that
    is wrapped across lines or padded with extra spaces is still found.

    Args:
        plain_text: Text to inspect.

    Returns:
        Mapping from each disclosure phrase to True if it occurs. The
        "author contribution statement" entry is also True when the plural
        form "author contributions statement" occurs.
    """
    search_terms = [
        "conflict of interest statement", "ethics statement",
        "funding statement", "data access statement"
    ]
    # Collapse whitespace so "conflict of interest\nstatement" still matches.
    normalized_text = " ".join(plain_text.lower().split())
    results = {term: term in normalized_text for term in search_terms}
    results["author contribution statement"] = (
        "author contribution statement" in normalized_text
        or "author contributions statement" in normalized_text
    )
    return results
def check_figures_and_tables(plain_text: str) -> Dict[str, bool]:
    """Look for figure/table mentions followed by "citation" or "legend".

    Each check is a case-insensitive regex search for "Figure <n>" or
    "Table <n>" followed later on the same line by the keyword ("." does not
    cross line breaks here).

    Args:
        plain_text: Text to inspect.

    Returns:
        Dict with boolean entries "figures_with_citations",
        "figures_legends" and "tables_legends".
    """
    figure_citation = re.compile(r'Figure \d+.*?citation', re.IGNORECASE)
    figure_legend = re.compile(r'Figure \d+.*?legend', re.IGNORECASE)
    table_legend = re.compile(r'Table \d+.*?legend', re.IGNORECASE)

    summary: Dict[str, bool] = {}
    summary["figures_with_citations"] = figure_citation.search(plain_text) is not None
    summary["figures_legends"] = figure_legend.search(plain_text) is not None
    summary["tables_legends"] = table_legend.search(plain_text) is not None
    return summary
def check_references_summary(plain_text: str) -> Dict[str, Any]:
    """Summarise reference-related signals found in the text.

    Args:
        plain_text: Text to inspect.

    Returns:
        A dict with:
          * "old_references": True if a standalone four-digit number
            1900-1999 occurs anywhere in the text.
          * "citations_in_abstract": True if the first 2500 characters contain
            a bracketed numeric citation marker ("[3]" or "[1, 2]") or the
            word "citation" / "cited" (case-insensitive).
          * "reference_count": number of bracketed citation markers in the
            whole text; a grouped marker such as "[1, 2]" counts once.
          * "self_citations": True if "Self-citation" occurs
            (case-insensitive).
    """
    # One marker pattern for both the abstract check and the count, so a
    # grouped marker like "[1, 2]" is treated the same way in both places.
    citation_marker = r'\[\d+(?:,\s*\d+)*\]'
    abstract_candidate = plain_text[:2500]  # Slightly larger window for abstract
    marker_in_abstract = re.search(citation_marker, abstract_candidate) is not None
    citation_word_in_abstract = (
        re.search(r'\bcit(?:ation|ed)\b', abstract_candidate, re.IGNORECASE) is not None
    )
    return {
        "old_references": bool(re.search(r'\b19[0-9]{2}\b', plain_text)),
        "citations_in_abstract": marker_in_abstract or citation_word_in_abstract,
        "reference_count": len(re.findall(citation_marker, plain_text)),
        "self_citations": bool(re.search(r'Self-citation', plain_text, re.IGNORECASE)),
    }
def check_structure(plain_text: str) -> Dict[str, bool]:
    """Check for IMRAD section keywords and a structured-abstract mention.

    All tests are case-insensitive substring checks on the whole text.

    Args:
        plain_text: Text to inspect.

    Returns:
        Dict with "imrad_structure" (True only if "introduction", "method",
        "result" and "discussion" all occur) and "abstract_structure" (True
        if "structured abstract" occurs).
    """
    lowered = plain_text.lower()
    # A more robust IMRAD check might look for these as section headers
    required_keywords = ("introduction", "method", "result", "discussion")
    absent_keywords = [word for word in required_keywords if word not in lowered]
    has_structured_abstract = "structured abstract" in lowered
    return {
        "imrad_structure": not absent_keywords,
        "abstract_structure": has_structured_abstract,
    }
def check_figure_order(plain_text: str) -> Dict[str, Any]:
    """Analyse the figure numbers referenced in the text.

    A figure reference is "Fig", "Fig." or "Figure" (case-insensitive)
    followed by optional whitespace and a number. The keyword must start at a
    word boundary, so words that merely contain "fig" (for example
    "config 7" or "configure 3") are not counted as figure references.

    Args:
        plain_text: Text to inspect.

    Returns:
        A dict with:
          * "sequential_order_of_unique_figures": True if the distinct figure
            numbers, once sorted, have no gaps between neighbours (also True
            for zero or one distinct number).
          * "figure_count_unique": number of distinct figure numbers.
          * "missing_figures_in_sequence_to_max": sorted numbers in
            1..max that are never referenced.
          * "figure_order_as_encountered": every referenced number, in text
            order, repeats included.
          * "duplicate_references_to_same_figure_number": numbers referenced
            more than once, in order of first appearance.
    """
    figure_pattern = r'\bFig(?:ure)?\.?\s*(\d+)'
    figure_numbers = [
        int(number) for number in re.findall(figure_pattern, plain_text, re.IGNORECASE)
    ]
    unique_sorted_figures = sorted(set(figure_numbers))

    # Neighbouring pairs of the sorted unique numbers must differ by exactly 1;
    # with fewer than two numbers there are no pairs and all() yields True.
    is_sequential = all(
        later - earlier == 1
        for earlier, later in zip(unique_sorted_figures, unique_sorted_figures[1:])
    )

    missing_figures: List[int] = []
    if unique_sorted_figures:
        expected_figures = set(range(1, unique_sorted_figures[-1] + 1))
        missing_figures = sorted(expected_figures.difference(unique_sorted_figures))

    # Counter keeps first-seen order, so duplicates come out in text order.
    counts = Counter(figure_numbers)
    duplicate_refs = [num for num, count in counts.items() if count > 1]

    return {
        "sequential_order_of_unique_figures": is_sequential,
        "figure_count_unique": len(unique_sorted_figures),
        "missing_figures_in_sequence_to_max": missing_figures,
        "figure_order_as_encountered": figure_numbers,
        "duplicate_references_to_same_figure_number": duplicate_refs,
    }
def check_reference_order(plain_text: str) -> Dict[str, Any]:
    """Analyse the order of bracketed numeric citations in the text.

    Both single markers ("[3]") and comma-separated groups ("[2, 3]") are
    parsed; every number in a group counts as one citation occurrence, in
    left-to-right order. Numbers that are not greater than zero are ignored.

    Args:
        plain_text: Text to inspect.

    Returns:
        A dict with:
          * "max_reference_number_cited": highest number cited, 0 if none.
          * "out_of_order_citations_details": one entry per citation whose
            number is lower than the highest number seen before it. Each entry
            holds the 1-based occurrence position, the value, the highest
            value seen so far, and a human-readable message.
          * "missing_references_up_to_max_cited": sorted numbers in 1..max
            that are never cited.
          * "is_citation_order_non_decreasing_in_text": True if the cited
            numbers never decrease from one occurrence to the next.
    """
    reference_pattern = r'\[(\d+(?:,\s*\d+)*)\]'
    ref_numbers_int = [
        number
        for group in re.findall(reference_pattern, plain_text)
        # int() tolerates the whitespace that may follow each comma.
        for number in (int(part) for part in group.split(','))
        if number > 0  # Ensure ref > 0
    ]

    out_of_order_details = []
    highest_seen = 0
    for position, ref in enumerate(ref_numbers_int, start=1):
        if ref < highest_seen:
            out_of_order_details.append({
                "position_in_text_occurrences": position, "value": ref,
                "previous_max_value_seen": highest_seen,
                "message": f"Reference [{ref}] appeared after a higher reference [{highest_seen}] was already cited."
            })
        else:
            highest_seen = ref

    # After the scan the running maximum is the overall maximum (0 if empty).
    max_ref_val = highest_seen
    missing_refs_in_sequence_to_max = sorted(
        set(range(1, max_ref_val + 1)).difference(ref_numbers_int)
    )

    return {
        "max_reference_number_cited": max_ref_val,
        "out_of_order_citations_details": out_of_order_details,
        "missing_references_up_to_max_cited": missing_refs_in_sequence_to_max,
        # A sequence is non-decreasing exactly when no element falls below the
        # running maximum, i.e. when nothing was recorded above.
        "is_citation_order_non_decreasing_in_text": not out_of_order_details,
    }