texmetrics-regex-checks-gradio-1-devtesting

Running

texmetrics-regex-checks-gradio-1-devtesting

File size: 5,695 Bytes

966e948
 
 
 
 
 
fab5be2
 
966e948
 
fab5be2
966e948
 
 
 
fab5be2
966e948
 
 
 
fab5be2
 
966e948
fab5be2
 
 
 
 
 
966e948
 
 
 
 
 
 
 
 
 
fab5be2
966e948
fab5be2
 
966e948
fab5be2
966e948
 
 
 
 
fab5be2
 
966e948
fab5be2
966e948
 
 
 
 
 
 
 
 
 
fab5be2
 
 
966e948
 
 
fab5be2
 
966e948
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fab5be2
966e948
 
 
fab5be2
966e948
 
 
 
 
fab5be2
966e948
 
 
 
 
 
fab5be2
 
 
966e948

# content_analysis.py
import re
from typing import List, Dict, Any
from collections import Counter

def check_text_presence(full_text: str, search_terms: List[str]) -> Dict[str, bool]:
    """Report, case-insensitively, which search terms occur in the text.

    Args:
        full_text: Text to search.
        search_terms: Terms to look for as plain substrings.

    Returns:
        A dict keyed by each term exactly as supplied, mapping to ``True``
        when its lowercased form is a substring of the lowercased text.
    """
    haystack = full_text.lower()
    presence: Dict[str, bool] = {}
    for needle in search_terms:
        presence[needle] = needle.lower() in haystack
    return presence

def check_metadata(plain_text: str) -> Dict[str, Any]:
    """Detect basic manuscript metadata in plain text.

    Args:
        plain_text: Manuscript text to inspect.

    Returns:
        A dict with:
            ``author_email``: ``True`` if something shaped like an email
                address is present.
            ``list_of_authors``: ``True`` if ``Author:``/``Authors:`` appears
                (case-insensitive).
            ``keywords_list``: ``True`` if ``Keyword:``/``Keywords:`` appears
                (case-insensitive).
            ``word_count``: number of whitespace-separated tokens, or the
                string ``"Missing"`` when the text contains none.
    """
    word_count_val = len(plain_text.split())
    # Domain labels may contain hyphens and there may be several of them
    # (e.g. "jane@my-uni.ac.uk"); the local part may contain "+".
    email_pattern = r'\b[\w.+-]+@[\w-]+(?:\.[\w-]+)+\b'
    return {
        "author_email": re.search(email_pattern, plain_text) is not None,
        "list_of_authors": bool(re.search(r'Authors?:', plain_text, re.IGNORECASE)),
        "keywords_list": bool(re.search(r'Keywords?:', plain_text, re.IGNORECASE)),
        "word_count": word_count_val if word_count_val > 0 else "Missing",
    }

def check_disclosures(plain_text: str) -> Dict[str, bool]:
    """Check which disclosure statements are named in the text.

    Matching is a case-insensitive substring test on the statement title.

    Args:
        plain_text: Manuscript text to inspect.

    Returns:
        A dict mapping each statement title to whether it was found. The
        ``"author contribution statement"`` entry is also satisfied by the
        plural form "author contributions statement".
    """
    haystack = plain_text.lower()
    found: Dict[str, bool] = {}
    for statement in (
        "conflict of interest statement",
        "ethics statement",
        "funding statement",
        "data access statement",
    ):
        found[statement] = statement in haystack

    # Singular and plural spellings both count for the contribution statement.
    contribution_variants = (
        "author contribution statement",
        "author contributions statement",
    )
    found["author contribution statement"] = any(
        variant in haystack for variant in contribution_variants
    )
    return found

def check_figures_and_tables(plain_text: str) -> Dict[str, bool]:
    """Look for figure/table mentions followed by "citation" or "legend".

    Each check is a case-insensitive regex search. Because ``.`` does not
    match a newline here, the keyword must appear on the same line as the
    "Figure N" / "Table N" mention.

    Args:
        plain_text: Manuscript text to inspect.

    Returns:
        A dict of three booleans: ``figures_with_citations``,
        ``figures_legends`` and ``tables_legends``.
    """
    def _mentions(pattern: str) -> bool:
        return re.search(pattern, plain_text, re.IGNORECASE) is not None

    has_figure_citation = _mentions(r'Figure \d+.*?citation')
    has_figure_legend = _mentions(r'Figure \d+.*?legend')
    has_table_legend = _mentions(r'Table \d+.*?legend')
    return {
        "figures_with_citations": has_figure_citation,
        "figures_legends": has_figure_legend,
        "tables_legends": has_table_legend,
    }

def check_references_summary(plain_text: str) -> Dict[str, Any]:
    """Summarise reference-related signals found in the text.

    Args:
        plain_text: Manuscript text to inspect.

    Returns:
        A dict with:
            ``old_references``: ``True`` if any standalone four-digit number
                19xx occurs.
            ``citations_in_abstract``: ``True`` if the first 2500 characters
                contain a ``[N]`` marker or the word "citation"/"cited".
            ``reference_count``: number of bracketed citation markers in the
                whole text (``[1]`` and ``[1, 2]`` each count once).
            ``self_citations``: ``True`` if "Self-citation" occurs
                (case-insensitive).
    """
    # The abstract is approximated by the leading 2500 characters.
    head = plain_text[:2500]
    has_bracket_marker = re.search(r'\[\d+\]', head) is not None
    has_citation_word = (
        re.search(r'\bcit(?:ation|ed)\b', head, re.IGNORECASE) is not None
    )

    citation_markers = re.findall(r'\[\d+(?:,\s*\d+)*\]', plain_text)
    mentions_1900s = re.search(r'\b19[0-9]{2}\b', plain_text) is not None
    mentions_self_citation = (
        re.search(r'Self-citation', plain_text, re.IGNORECASE) is not None
    )

    return {
        "old_references": mentions_1900s,
        "citations_in_abstract": has_bracket_marker or has_citation_word,
        "reference_count": len(citation_markers),
        "self_citations": mentions_self_citation,
    }

def check_structure(plain_text: str) -> Dict[str, bool]:
    """Check for IMRAD section keywords and a "structured abstract" mention.

    Args:
        plain_text: Manuscript text to inspect.

    Returns:
        A dict with ``imrad_structure`` (all four IMRAD keywords occur
        somewhere in the text, case-insensitively) and ``abstract_structure``
        (the phrase "structured abstract" occurs).
    """
    lowered = plain_text.lower()

    # Plain substring test; a more robust IMRAD check might look for these
    # as section headers.
    imrad_found = True
    for keyword in ("introduction", "method", "result", "discussion"):
        if keyword not in lowered:
            imrad_found = False
            break

    has_structured_abstract = "structured abstract" in lowered
    return {
        "imrad_structure": imrad_found,
        "abstract_structure": has_structured_abstract,
    }

def check_figure_order(plain_text: str) -> Dict[str, Any]:
    """Analyse the figure numbers referenced in the text.

    References are matched case-insensitively as "Fig", "Fig." or "Figure"
    (optionally followed by a dot), optional whitespace, then a number.

    Args:
        plain_text: Manuscript text to inspect.

    Returns:
        A dict with:
            ``sequential_order_of_unique_figures``: ``True`` when the
                distinct figure numbers, once sorted, have no gaps between
                neighbours (also ``True`` for zero or one distinct number).
            ``figure_count_unique``: number of distinct figure numbers.
            ``missing_figures_in_sequence_to_max``: sorted numbers in
                ``1..max`` that are never referenced.
            ``figure_order_as_encountered``: every referenced number in text
                order, repeats included.
            ``duplicate_references_to_same_figure_number``: numbers
                referenced more than once, in order of first appearance.
    """
    raw_numbers = re.findall(
        r'(?:Fig(?:ure)?\.?|Figure)\s*(\d+)', plain_text, re.IGNORECASE
    )
    encountered = [int(token) for token in raw_numbers if token.isdigit()]

    referenced = set(encountered)
    distinct = sorted(referenced)

    # Compare each distinct number with its successor; an empty pairing
    # (zero or one figure) is trivially contiguous.
    contiguous = all(
        following - current == 1
        for current, following in zip(distinct, distinct[1:])
    )

    gaps: List[int] = []
    if distinct:
        highest = distinct[-1]
        gaps = [n for n in range(1, highest + 1) if n not in referenced]

    tally = Counter(encountered)
    repeated = [number for number, hits in tally.items() if hits > 1]

    return {
        "sequential_order_of_unique_figures": contiguous,
        "figure_count_unique": len(distinct),
        "missing_figures_in_sequence_to_max": gaps,
        "figure_order_as_encountered": encountered,
        "duplicate_references_to_same_figure_number": repeated,
    }

def check_reference_order(plain_text: str) -> Dict[str, Any]:
    """Check that numeric bracket citations first appear in ascending order.

    Both single citations (``[3]``) and comma-separated groups (``[1, 2]``)
    are recognised; every number inside a group counts as one occurrence, in
    the order written. Numbers that are not positive are ignored.

    Args:
        plain_text: Manuscript text to inspect.

    Returns:
        A dict with:
            ``max_reference_number_cited``: highest number cited, or ``0``
                when there are no citations.
            ``out_of_order_citations_details``: one entry per occurrence
                whose value is lower than the highest value seen before it.
                Each entry holds the 1-based occurrence position, the value,
                the previous maximum, and a human-readable message.
            ``missing_references_up_to_max_cited``: sorted numbers in
                ``1..max`` that are never cited.
            ``is_citation_order_non_decreasing_in_text``: ``True`` when the
                sequence of cited numbers never decreases.
    """
    # Same bracket grammar as the citation counter in
    # check_references_summary, so grouped citations such as "[1, 2]" are
    # not silently dropped (which would report 1 and 2 as missing).
    citation_groups = re.findall(r'\[(\d+(?:,\s*\d+)*)\]', plain_text)

    ref_numbers_int: List[int] = []
    for group in citation_groups:
        for token in re.findall(r'\d+', group):
            number = int(token)
            if number > 0:  # "[0]" is not a valid reference number
                ref_numbers_int.append(number)

    out_of_order_details = []
    highest_seen = 0
    for position, ref in enumerate(ref_numbers_int, start=1):
        if ref < highest_seen:
            out_of_order_details.append({
                "position_in_text_occurrences": position, "value": ref,
                "previous_max_value_seen": highest_seen,
                "message": f"Reference [{ref}] appeared after a higher reference [{highest_seen}] was already cited."
            })
        else:
            highest_seen = ref

    # After the scan the running maximum is the overall maximum (0 if empty).
    max_ref_val = highest_seen
    missing_refs_in_sequence_to_max = sorted(
        set(range(1, max_ref_val + 1)) - set(ref_numbers_int)
    )

    return {
        "max_reference_number_cited": max_ref_val,
        "out_of_order_citations_details": out_of_order_details,
        "missing_references_up_to_max_cited": missing_refs_in_sequence_to_max,
        # A sequence is non-decreasing exactly when no element falls below
        # the running maximum, i.e. when nothing was flagged above.
        "is_citation_order_non_decreasing_in_text": not out_of_order_details,
    }