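"""PDF Document Analyzer.

Gradio app that checks an uploaded manuscript PDF for common issues: disclosure
statements and metadata, figure/table and reference numbering, IMRaD structure,
and language problems reported by LanguageTool, with matches highlighted in an
annotated copy of the PDF. Built on PyMuPDF (fitz), pymupdf4llm,
language_tool_python and Gradio.
"""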

import re
import io
import json
import os
import tempfile
import traceback
from collections import Counter
from typing import List, Dict, Any, Tuple

import fitz  # PyMuPDF
import gradio as gr
import language_tool_python


if 'JAVA_HOME' not in os.environ:
    potential_java_homes = [
        '/usr/lib/jvm/java-11-openjdk-amd64',
        '/usr/lib/jvm/java-17-openjdk-amd64',
    ]

    user_java_home = os.environ.get('USER_JAVA_HOME_CONFIG')
    if user_java_home and os.path.exists(user_java_home):
        potential_java_homes.insert(0, user_java_home)

    for jh in potential_java_homes:
        if os.path.exists(jh):
            os.environ['JAVA_HOME'] = jh
            print(f"Set JAVA_HOME to: {jh}")
            break
    if 'JAVA_HOME' not in os.environ:
        print("Warning: JAVA_HOME not found or set. LanguageTool might fail.")
        print("Please set JAVA_HOME environment variable to your JDK (version 11+) installation path,")
        print("or ensure your LanguageTool setup (e.g., remote server) does not require it locally.")


def extract_pdf_text_for_general_checks(file_path_or_stream) -> str:
    """Extracts full text from a PDF file using PyMuPDF4LLM for general regex checks."""
    temp_file_path_holder = []
    pdf_path_for_pymupdf4llm = None
    try:
        if isinstance(file_path_or_stream, str) and os.path.exists(file_path_or_stream):
            pdf_path_for_pymupdf4llm = file_path_or_stream
        elif hasattr(file_path_or_stream, 'read'):
            # to_markdown is given a file path below, so spool streams/bytes to a temporary file.
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
                file_path_or_stream.seek(0)
                temp_file.write(file_path_or_stream.read())
                pdf_path_for_pymupdf4llm = temp_file.name
            temp_file_path_holder.append(pdf_path_for_pymupdf4llm)
        elif isinstance(file_path_or_stream, bytes):
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
                temp_file.write(file_path_or_stream)
                pdf_path_for_pymupdf4llm = temp_file.name
            temp_file_path_holder.append(pdf_path_for_pymupdf4llm)
        else:
            print(f"Unsupported input type for PyMuPDF4LLM: {type(file_path_or_stream)}")
            return ""

        if not pdf_path_for_pymupdf4llm:
            print("PDF path could not be determined for PyMuPDF4LLM.")
            return ""

        # Imported lazily so the rest of the app still loads if pymupdf4llm is absent.
        import pymupdf4llm
        full_text = pymupdf4llm.to_markdown(pdf_path_for_pymupdf4llm)
        return full_text

    except Exception as e:
        print(f"Error extracting text with PyMuPDF4LLM: {str(e)}")
        return ""
    finally:
        if temp_file_path_holder:
            try:
                os.remove(temp_file_path_holder[0])
            except OSError as e_os:
                print(f"Warning: Could not remove temp file {temp_file_path_holder[0]}: {e_os}")


def extract_word_data_and_text_for_lt(file_path_or_stream) -> Tuple[str, List[Dict[str, Any]]]:
    """Extract per-word text and coordinates with PyMuPDF, plus a plain-text string for LanguageTool."""
    doc = None
    try:
        if isinstance(file_path_or_stream, str) and os.path.exists(file_path_or_stream):
            doc = fitz.open(file_path_or_stream)
        elif hasattr(file_path_or_stream, 'read'):
            file_path_or_stream.seek(0)
            doc = fitz.open(stream=file_path_or_stream.read(), filetype="pdf")
        elif isinstance(file_path_or_stream, bytes):
            doc = fitz.open(stream=file_path_or_stream, filetype="pdf")
        else:
            print(f"Unsupported input type for extract_word_data_and_text_for_lt: {type(file_path_or_stream)}")
            return "", []
    except Exception as e:
        print(f"Error opening PDF in extract_word_data_and_text_for_lt: {e}")
        return "", []

    word_coords_data_intermediate = []
    for page_idx, page in enumerate(doc):
        # Each entry from get_text("words") is (x0, y0, x1, y1, word, block_no, line_no, word_no).
        words_on_page = page.get_text("words", sort=True)
        for w_info in words_on_page:
            word_text = w_info[4]
            word_rect = fitz.Rect(w_info[0:4])
            if word_text.strip():
                word_coords_data_intermediate.append({
                    'text': word_text,
                    'page_num': page_idx,
                    'rect': word_rect,
                })
    doc.close()

    text_for_lt = " ".join([item['text'] for item in word_coords_data_intermediate])

    # Record each word's start offset within text_for_lt so LanguageTool match offsets
    # can later be mapped back to page coordinates.
    word_coords_data_final = []
    current_char_pos_recalc = 0
    for i, item_data in enumerate(word_coords_data_intermediate):
        final_item = item_data.copy()
        final_item['start_offset'] = current_char_pos_recalc
        word_coords_data_final.append(final_item)

        current_char_pos_recalc += len(final_item['text'])
        if i < len(word_coords_data_intermediate) - 1:
            current_char_pos_recalc += 1  # account for the joining space

    return text_for_lt, word_coords_data_final
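

# Offset bookkeeping, by example (illustrative, not executed): for extracted words
# ["Results", "are", "shown"], text_for_lt is "Results are shown" and the recorded
# start offsets are 0, 8 and 12, so a LanguageTool match at offset 8 with length 3
# overlaps only the word "are".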


def check_text_presence(full_text: str, search_terms: List[str]) -> Dict[str, bool]:
    return {term: term.lower() in full_text.lower() for term in search_terms}


def check_metadata(full_text: str) -> Dict[str, Any]:
    return {
        "author_email_present": bool(re.search(r'\b[\w.-]+?@\w+?\.\w+?\b', full_text)),
        "authors_list_heading_present": bool(re.search(r'(?:Authors?|AUTHORS?):\s*', full_text)),
        "keywords_list_heading_present": bool(re.search(r'(?:Keywords?|KEYWORDS?):\s*', full_text, re.IGNORECASE)),
        "word_count_estimate": len(full_text.split()) if full_text else "Missing"
    }


def check_disclosures(full_text: str) -> Dict[str, bool]:
    search_terms = [
        "conflict of interest statement", "COI statement",
        "ethics statement", "ethical approval",
        "funding statement", "acknowledgment of funding",
        "data availability statement", "data access statement"
    ]

    results = {}
    results["conflict_of_interest_statement"] = any(term.lower() in full_text.lower() for term in search_terms[0:2])
    results["ethics_statement"] = any(term.lower() in full_text.lower() for term in search_terms[2:4])
    results["funding_statement"] = any(term.lower() in full_text.lower() for term in search_terms[4:6])
    results["data_access_statement"] = any(term.lower() in full_text.lower() for term in search_terms[6:8])

    has_author_contribution = ("author contribution" in full_text.lower() or
                               "authors contribution" in full_text.lower())
    results["author_contribution_statement"] = has_author_contribution
    return results


def check_figures_and_tables_overview(full_text: str) -> Dict[str, bool]:
    return {
        "figures_mentioned": bool(re.search(r'Fig(?:ure)?\s*\d+', full_text, re.IGNORECASE)),
        "tables_mentioned": bool(re.search(r'Table\s*\d+', full_text, re.IGNORECASE)),
    }


def check_references_overview(full_text: str) -> Dict[str, Any]:
    has_references_section = bool(re.search(r"^\s*(?:References|Bibliography)\s*$", full_text, re.IGNORECASE | re.MULTILINE))
    citations_in_text = re.findall(r'\[\d+(?:,\s*\d+)*(?:–\d+)?\]', full_text)

    reference_list_items = []
    if has_references_section:
        match_ref_sec = re.search(r"^\s*(?:References|Bibliography)\s*$(.*)", full_text, re.IGNORECASE | re.MULTILINE | re.DOTALL)
        if match_ref_sec:
            references_text_block = match_ref_sec.group(1)
            reference_list_items = re.findall(r"^\s*(?:\[\d+\]|\d+\.\s)", references_text_block, re.MULTILINE)

    return {
        "references_section_heading_present": has_references_section,
        "citations_in_text_count": len(citations_in_text),
        "reference_list_items_count_heuristic": len(reference_list_items),
        "old_references_present_pre_1995": bool(re.search(r'\b(?:19[0-8]\d|199[0-4])\b', full_text)),
    }


def check_structure_overview(full_text: str) -> Dict[str, Any]:
    imrad_found_count = 0
    if re.search(r"^\s*Introduction\b", full_text, re.IGNORECASE | re.MULTILINE):
        imrad_found_count += 1
    if re.search(r"^\s*(?:Methods|Materials\s+and\s+Methods)\b", full_text, re.IGNORECASE | re.MULTILINE):
        imrad_found_count += 1
    if re.search(r"^\s*Results\b", full_text, re.IGNORECASE | re.MULTILINE):
        imrad_found_count += 1
    if re.search(r"^\s*Discussion\b", full_text, re.IGNORECASE | re.MULTILINE):
        imrad_found_count += 1

    has_abstract_section = bool(re.search(r"^\s*Abstract\b", full_text, re.IGNORECASE | re.MULTILINE))

    return {
        "abstract_section_heading_present": has_abstract_section,
        "imrad_structure_partially_present": imrad_found_count >= 3,
        "imrad_sections_detected_count": imrad_found_count
    }


def check_language_issues(text_for_lt: str) -> Dict[str, Any]:
    """Run LanguageTool plus a few regex-based formatting checks over the extracted text."""
    try:
        tool_path = os.environ.get('LT_PATH')

        # Assumption: if LT_PATH is set it points at a local LanguageTool installation.
        # language_tool_python locates its LanguageTool directory via the LTP_PATH
        # environment variable, so the setting is forwarded there. Without LT_PATH,
        # a LanguageTool server already running on localhost:8081 is used instead.
        if tool_path:
            os.environ.setdefault('LTP_PATH', tool_path)
            language_tool = language_tool_python.LanguageTool('en-US')
        else:
            language_tool = language_tool_python.LanguageTool('en-US', remote_server='http://localhost:8081')

        matches = language_tool.check(text_for_lt)
        issues = []

        for match in matches:
            # Rules that produce mostly noise on PDF-extracted text are skipped.
            if match.ruleId in ["EN_SPLIT_WORDS_HYPHEN", "UPPERCASE_SENTENCE_START", "MORFOLOGIK_RULE_EN_US"]:
                continue
            issues.append({
                "message": match.message,
                "context": match.context.strip(),
                "error_text_segment": match.context[match.offsetInContext : match.offsetInContext + match.errorLength],
                "suggestions": match.replacements[:3] if match.replacements else [],
                "category": match.category,
                "rule_id": match.ruleId,
                "offset": match.offset,
                "length": match.errorLength,
            })

        # Additional formatting check: a citation bracket glued to the preceding word.
        regex_pattern = r'\b(\w+)\[(\d+)\]'
        regex_matches = list(re.finditer(regex_pattern, text_for_lt))

        for match_re in regex_matches:
            word = match_re.group(1)
            number = match_re.group(2)
            issues.append({
                "message": f"Missing space before '[' in '{word}[{number}]'. Suggestion: '{word} [{number}]'.",
                "context": text_for_lt[max(match_re.start() - 40, 0):min(match_re.end() + 40, len(text_for_lt))].strip(),
                "error_text_segment": match_re.group(0),
                "suggestions": [f"{word} [{number}]"],
                "category": "Formatting",
                "rule_id": "MISSING_SPACE_BEFORE_BRACKET_CITATION",
                "offset": match_re.start(),
                "length": match_re.end() - match_re.start(),
            })

        return {"total_issues": len(issues), "issues": issues}
    except ConnectionRefusedError:
        error_msg = "LanguageTool Error: Connection to LT server (e.g., http://localhost:8081) refused. Ensure it's running, or configure LT_PATH for local JAR usage."
        print(error_msg)
        return {"error": error_msg, "issues": []}
    except Exception as e:
        error_msg = f"Error checking language issues: {type(e).__name__} - {e}"
        print(error_msg)
        return {"error": error_msg, "issues": []}


def check_figure_table_order(full_text: str) -> Dict[str, Any]:
    """Check that figure and table numbers are consecutive and first mentioned in order."""
    fig_pattern = r'Fig(?:ure)?\.?\s*(\d+)'
    fig_refs_in_order = [int(num) for num in re.findall(fig_pattern, full_text, re.IGNORECASE)]

    tbl_pattern = r'Table\s*(\d+)'
    tbl_refs_in_order = [int(num) for num in re.findall(tbl_pattern, full_text, re.IGNORECASE)]

    def analyze_numbering(refs_list, item_type="Item"):
        issues = []
        key_prefix = item_type.lower()
        if not refs_list:
            return {
                f"{key_prefix}_references_in_order_of_appearance": [],
                f"{key_prefix}_numbering_issues": ["Not mentioned."],
            }

        unique_sorted_refs = sorted(set(refs_list))
        max_ref_num = unique_sorted_refs[-1]

        expected_sequence = list(range(1, max_ref_num + 1))
        missing_numbers = [num for num in expected_sequence if num not in unique_sorted_refs]
        if missing_numbers:
            issues.append(f"Missing {key_prefix}(s) in sequence up to {max_ref_num}: {missing_numbers}")

        # Order in which each number is first mentioned.
        first_occurrence_map = {}
        unique_refs_in_appearance_order = []
        for ref_num in refs_list:
            if ref_num not in first_occurrence_map:
                first_occurrence_map[ref_num] = True
                unique_refs_in_appearance_order.append(ref_num)

        if unique_refs_in_appearance_order != sorted(unique_refs_in_appearance_order):
            issues.append(f"{item_type}s may not be first mentioned in strict numerical order. Sequence of first mentions: {unique_refs_in_appearance_order}")

        return {
            f"{key_prefix}_references_in_order_of_appearance": refs_list,
            f"{key_prefix}_numbering_issues": issues if issues else ["Appears OK based on simple checks."],
        }

    # Keys are prefixed per item type so the figure and table results do not overwrite each other.
    fig_analysis = analyze_numbering(fig_refs_in_order, "Figure")
    tbl_analysis = analyze_numbering(tbl_refs_in_order, "Table")

    return {**fig_analysis, **tbl_analysis}
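

# Illustrative example (assumed input, not executed): if figures are mentioned as
# "Fig. 2", then "Figure 1", then "Fig 4", analyze_numbering([2, 1, 4], "Figure")
# reports figure 3 as missing from the sequence up to 4 and flags that the first
# mentions [2, 1, 4] are not in numerical order.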


def highlight_issues_in_pdf(
    pdf_file_or_stream,
    word_coords_data: List[Dict[str, Any]],
    language_issues_list: List[Dict[str, Any]]
) -> Tuple[List[Dict[str, Any]], bytes]:
    """Map language issues back onto page coordinates and highlight them in the PDF."""
    doc = None
    try:
        if isinstance(pdf_file_or_stream, str) and os.path.exists(pdf_file_or_stream):
            doc = fitz.open(pdf_file_or_stream)
        elif hasattr(pdf_file_or_stream, 'read'):
            pdf_file_or_stream.seek(0)
            doc = fitz.open(stream=pdf_file_or_stream.read(), filetype="pdf")
        elif isinstance(pdf_file_or_stream, bytes):
            doc = fitz.open(stream=pdf_file_or_stream, filetype="pdf")
        else:
            print(f"Unsupported PDF input type in highlight_issues_in_pdf: {type(pdf_file_or_stream)}")
            return language_issues_list, b""
    except Exception as e:
        print(f"Error opening PDF in highlight_issues_in_pdf: {e}")
        return language_issues_list, b""

    issues_with_coords_and_page = []

    for issue_details in language_issues_list:
        issue_offset = issue_details["offset"]
        issue_length = issue_details["length"]
        error_text_to_search = issue_details["error_text_segment"]

        current_issue_output = issue_details.copy()
        current_issue_output["page"] = 0
        current_issue_output["coordinates"] = []

        # Collect the PDF words whose character ranges overlap the issue's range.
        candidate_pdf_words_info = []
        for word_info in word_coords_data:
            word_start_offset = word_info['start_offset']
            word_end_offset = word_start_offset + len(word_info['text'])
            if word_start_offset < (issue_offset + issue_length) and issue_offset < word_end_offset:
                candidate_pdf_words_info.append(word_info)

        if not candidate_pdf_words_info:
            issues_with_coords_and_page.append(current_issue_output)
            continue

        page_num_for_issue = candidate_pdf_words_info[0]["page_num"]
        page_to_search_on = doc[page_num_for_issue]

        # Search only inside the union of the candidate word rectangles, slightly padded.
        clip_search_rect = fitz.Rect(candidate_pdf_words_info[0]['rect'])
        for i in range(1, len(candidate_pdf_words_info)):
            clip_search_rect.include_rect(candidate_pdf_words_info[i]['rect'])

        clip_search_rect.x0 -= 3
        clip_search_rect.y0 -= 3
        clip_search_rect.x1 += 3
        clip_search_rect.y1 += 3
        clip_search_rect.normalize()

        found_rects_on_page = []
        if error_text_to_search.strip():
            try:
                found_rects_on_page = page_to_search_on.search_for(error_text_to_search, clip=clip_search_rect, quads=False)
            except Exception as search_e:
                print(f"PyMuPDF search_for error: '{search_e}' for text '{error_text_to_search}' on page {page_num_for_issue + 1}. Skipping this highlight.")

        if found_rects_on_page:
            current_issue_output["page"] = page_num_for_issue + 1

            overall_bounds = fitz.Rect(found_rects_on_page[0])
            for r_idx in range(1, len(found_rects_on_page)):
                overall_bounds.include_rect(found_rects_on_page[r_idx])
            current_issue_output["coordinates"] = [
                round(overall_bounds.x0, 2), round(overall_bounds.y0, 2),
                round(overall_bounds.x1, 2), round(overall_bounds.y1, 2)
            ]

            for rect_to_highlight in found_rects_on_page:
                if not rect_to_highlight.is_empty and rect_to_highlight.width > 0.1 and rect_to_highlight.height > 0.1:
                    highlight_annot = page_to_search_on.add_highlight_annot(rect_to_highlight)
                    if highlight_annot:
                        highlight_annot.set_colors(stroke=(1, 1, 0))
                        highlight_annot.update(opacity=0.4)
        issues_with_coords_and_page.append(current_issue_output)

    output_pdf_bytes = io.BytesIO()
    try:
        doc.save(output_pdf_bytes, garbage=3, deflate=True)
        annotated_pdf_bytes_content = output_pdf_bytes.getvalue()
    except Exception as e:
        print(f"Error saving annotated PDF: {e}")
        annotated_pdf_bytes_content = b""
    finally:
        doc.close()
        output_pdf_bytes.close()

    return issues_with_coords_and_page, annotated_pdf_bytes_content


def analyze_pdf(pdf_input_data) -> Tuple[Dict[str, Any], bytes]:
    """Run all checks on a PDF (path, bytes or file-like object) and return (report, annotated PDF bytes)."""
    results = {"language_issues": [], "general_document_checks": {}, "analysis_errors": []}
    annotated_pdf_bytes = None

    # Normalise the input to raw bytes so each stage can work on its own stream.
    input_bytes_content = None
    if hasattr(pdf_input_data, 'read'):
        pdf_input_data.seek(0)
        input_bytes_content = pdf_input_data.read()
    elif isinstance(pdf_input_data, bytes):
        input_bytes_content = pdf_input_data
    elif isinstance(pdf_input_data, str) and os.path.exists(pdf_input_data):
        with open(pdf_input_data, "rb") as f_path:
            input_bytes_content = f_path.read()
    else:
        results["analysis_errors"].append(f"Invalid PDF input data type: {type(pdf_input_data)}")
        return results, None

    if not input_bytes_content:
        results["analysis_errors"].append("PDF input data is empty or unreadable.")
        return results, None

    try:
        # General document checks on the PyMuPDF4LLM markdown extraction.
        pdf_stream_for_general = io.BytesIO(input_bytes_content)
        full_text_for_general_checks = extract_pdf_text_for_general_checks(pdf_stream_for_general)
        pdf_stream_for_general.close()

        if full_text_for_general_checks:
            results["general_document_checks"] = {
                "metadata": check_metadata(full_text_for_general_checks),
                "disclosures": check_disclosures(full_text_for_general_checks),
                "figures_tables_overview": check_figures_and_tables_overview(full_text_for_general_checks),
                "references_overview": check_references_overview(full_text_for_general_checks),
                "structure_overview": check_structure_overview(full_text_for_general_checks),
                "figure_table_order": check_figure_table_order(full_text_for_general_checks),
            }
        else:
            results["analysis_errors"].append("Failed to extract text using PyMuPDF4LLM for general checks.")

        # Language checks on the word-level extraction (needed for highlighting).
        pdf_stream_for_lt = io.BytesIO(input_bytes_content)
        text_for_lt, word_coords_data = extract_word_data_and_text_for_lt(pdf_stream_for_lt)
        pdf_stream_for_lt.close()

        if not text_for_lt and not word_coords_data:
            results["analysis_errors"].append("Could not extract word data for language analysis and highlighting.")
        else:
            language_issues_result = check_language_issues(text_for_lt)
            if "error" in language_issues_result:
                results["analysis_errors"].append(f"Language check error: {language_issues_result['error']}")

            lt_issues_list = language_issues_result.get("issues", [])

            if lt_issues_list:
                pdf_stream_for_highlighting = io.BytesIO(input_bytes_content)
                updated_lt_issues_list, annotated_pdf_bytes = highlight_issues_in_pdf(
                    pdf_stream_for_highlighting,
                    word_coords_data,
                    lt_issues_list
                )
                pdf_stream_for_highlighting.close()
                results["language_issues"] = updated_lt_issues_list
            else:
                results["language_issues"] = lt_issues_list

        if not results["analysis_errors"]:
            del results["analysis_errors"]

        return results, annotated_pdf_bytes

    except Exception as e:
        error_msg = f"Critical error in analyze_pdf: {type(e).__name__} - {e}"
        print(error_msg)
        traceback.print_exc()

        current_errors = results.get("analysis_errors", [])
        current_errors.append(error_msg)
        results["analysis_errors"] = current_errors
        return results, None
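

# Minimal programmatic usage sketch (not wired into the Gradio app; the file names
# below are placeholders): run the full analysis on a PDF from disk and keep the
# annotated copy if any issues were highlighted.
def run_analysis_on_file(pdf_path: str, annotated_out_path: str = "annotated_output.pdf") -> Dict[str, Any]:
    report, annotated_bytes = analyze_pdf(pdf_path)
    if annotated_bytes:
        with open(annotated_out_path, "wb") as out_file:
            out_file.write(annotated_bytes)
        print(f"Annotated PDF written to {annotated_out_path}")
    return report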


def process_upload(file_bytes_from_gradio):
    """Gradio callback: run the analysis and return (JSON report, path to annotated PDF or None)."""
    if file_bytes_from_gradio is None:
        return json.dumps({"error_message": "No file uploaded"}, indent=2), None

    try:
        # gr.File(type="binary") delivers the upload as raw bytes.
        results, annotated_pdf_output_bytes = analyze_pdf(file_bytes_from_gradio)

        results_json = json.dumps(results, indent=2, ensure_ascii=False)

        if annotated_pdf_output_bytes:
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_annotated_pdf_file:
                tmp_annotated_pdf_file.write(annotated_pdf_output_bytes)
            return results_json, tmp_annotated_pdf_file.name
        else:
            return results_json, None

    except Exception as e:
        error_msg = f"Error processing file in Gradio interface: {type(e).__name__} - {e}"
        print(error_msg)
        return json.dumps({"error_message": error_msg}, indent=2), None


def create_interface():
    with gr.Blocks(title="PDF Analyzer", theme=gr.themes.Glass()) as interface:
        gr.Markdown("# PDF Document Analyzer")
        gr.Markdown(
            "Upload a PDF to check for common manuscript issues. "
            "Language checks use LanguageTool (EN-US). Ensure your LanguageTool setup is correct "
            "(e.g., local server on port 8081, or LT_PATH environment variable for local JAR)."
        )

        with gr.Row():
            file_input = gr.File(
                label="Upload PDF Document",
                file_types=[".pdf"],
                type="binary"
            )
            analyze_btn = gr.Button("Analyze PDF", variant="primary", scale=0)

        gr.Markdown("## Analysis Results")
        with gr.Tabs():
            with gr.TabItem("Detailed Report"):
                results_output = gr.JSON(label="JSON Report", scale=2)
            with gr.TabItem("Annotated PDF"):
                pdf_output_display = gr.File(label="Download Annotated PDF (if issues were highlighted)", interactive=False)

        analyze_btn.click(
            fn=process_upload,
            inputs=[file_input],
            outputs=[results_output, pdf_output_display]
        )
        gr.Markdown("---")
        gr.Markdown("Developed with PyMuPDF, LanguageTool, and Gradio. Alpha version.")
    return interface


if __name__ == "__main__":
    print("PDF Analyzer launching...")
    print("Ensure LanguageTool is accessible (e.g., server at http://localhost:8081 or LT_PATH set).")

    interface = create_interface()
    interface.launch(
        share=True,
    )