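"""PDF Document Analyzer.

Gradio app that checks an uploaded manuscript PDF for common issues: disclosure
statements and metadata, figure/table and reference numbering, IMRaD structure,
and language problems reported by LanguageTool, with matches highlighted in an
annotated copy of the PDF. Built on PyMuPDF (fitz), pymupdf4llm,
language_tool_python and Gradio.
"""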

import re
import io
import json
import os
import tempfile
import traceback
from collections import Counter
from typing import List, Dict, Any, Tuple

import fitz  # PyMuPDF
import gradio as gr
import language_tool_python


if 'JAVA_HOME' not in os.environ:
    potential_java_homes = [
        '/usr/lib/jvm/java-11-openjdk-amd64',
        '/usr/lib/jvm/java-17-openjdk-amd64',
    ]

    user_java_home = os.environ.get('USER_JAVA_HOME_CONFIG')
    if user_java_home and os.path.exists(user_java_home):
        potential_java_homes.insert(0, user_java_home)

    for jh in potential_java_homes:
        if os.path.exists(jh):
            os.environ['JAVA_HOME'] = jh
            print(f"Set JAVA_HOME to: {jh}")
            break
    if 'JAVA_HOME' not in os.environ:
        print("Warning: JAVA_HOME not found or set. LanguageTool might fail.")
        print("Please set JAVA_HOME environment variable to your JDK (version 11+) installation path,")
        print("or ensure your LanguageTool setup (e.g., remote server) does not require it locally.")


def extract_pdf_text_for_general_checks(file_path_or_stream) -> str:
    """Extracts full text from a PDF file using PyMuPDF4LLM for general regex checks."""
    temp_file_path_holder = []
    pdf_path_for_pymupdf4llm = None
    try:
        if isinstance(file_path_or_stream, str) and os.path.exists(file_path_or_stream):
            pdf_path_for_pymupdf4llm = file_path_or_stream
        elif hasattr(file_path_or_stream, 'read'):
            # to_markdown is given a file path below, so spool streams/bytes to a temporary file.
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
                file_path_or_stream.seek(0)
                temp_file.write(file_path_or_stream.read())
                pdf_path_for_pymupdf4llm = temp_file.name
            temp_file_path_holder.append(pdf_path_for_pymupdf4llm)
        elif isinstance(file_path_or_stream, bytes):
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
                temp_file.write(file_path_or_stream)
                pdf_path_for_pymupdf4llm = temp_file.name
            temp_file_path_holder.append(pdf_path_for_pymupdf4llm)
        else:
            print(f"Unsupported input type for PyMuPDF4LLM: {type(file_path_or_stream)}")
            return ""

        if not pdf_path_for_pymupdf4llm:
            print("PDF path could not be determined for PyMuPDF4LLM.")
            return ""

        # Imported lazily so the rest of the app still loads if pymupdf4llm is absent.
        import pymupdf4llm
        full_text = pymupdf4llm.to_markdown(pdf_path_for_pymupdf4llm)
        return full_text

    except Exception as e:
        print(f"Error extracting text with PyMuPDF4LLM: {str(e)}")
        return ""
    finally:
        if temp_file_path_holder:
            try:
                os.remove(temp_file_path_holder[0])
            except OSError as e_os:
                print(f"Warning: Could not remove temp file {temp_file_path_holder[0]}: {e_os}")


def extract_word_data_and_text_for_lt(file_path_or_stream) -> Tuple[str, List[Dict[str, Any]]]:
    """Extract per-word text and coordinates with PyMuPDF, plus a plain-text string for LanguageTool."""
    doc = None
    try:
        if isinstance(file_path_or_stream, str) and os.path.exists(file_path_or_stream):
            doc = fitz.open(file_path_or_stream)
        elif hasattr(file_path_or_stream, 'read'):
            file_path_or_stream.seek(0)
            doc = fitz.open(stream=file_path_or_stream.read(), filetype="pdf")
        elif isinstance(file_path_or_stream, bytes):
            doc = fitz.open(stream=file_path_or_stream, filetype="pdf")
        else:
            print(f"Unsupported input type for extract_word_data_and_text_for_lt: {type(file_path_or_stream)}")
            return "", []
    except Exception as e:
        print(f"Error opening PDF in extract_word_data_and_text_for_lt: {e}")
        return "", []

    word_coords_data_intermediate = []
    for page_idx, page in enumerate(doc):
        # Each entry from get_text("words") is (x0, y0, x1, y1, word, block_no, line_no, word_no).
        words_on_page = page.get_text("words", sort=True)
        for w_info in words_on_page:
            word_text = w_info[4]
            word_rect = fitz.Rect(w_info[0:4])
            if word_text.strip():
                word_coords_data_intermediate.append({
                    'text': word_text,
                    'page_num': page_idx,
                    'rect': word_rect,
                })
    doc.close()

    text_for_lt = " ".join([item['text'] for item in word_coords_data_intermediate])

    # Record each word's start offset within text_for_lt so LanguageTool match offsets
    # can later be mapped back to page coordinates.
    word_coords_data_final = []
    current_char_pos_recalc = 0
    for i, item_data in enumerate(word_coords_data_intermediate):
        final_item = item_data.copy()
        final_item['start_offset'] = current_char_pos_recalc
        word_coords_data_final.append(final_item)

        current_char_pos_recalc += len(final_item['text'])
        if i < len(word_coords_data_intermediate) - 1:
            current_char_pos_recalc += 1  # account for the joining space

    return text_for_lt, word_coords_data_final
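

# Offset bookkeeping, by example (illustrative, not executed): for extracted words
# ["Results", "are", "shown"], text_for_lt is "Results are shown" and the recorded
# start offsets are 0, 8 and 12, so a LanguageTool match at offset 8 with length 3
# overlaps only the word "are".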


def check_text_presence(full_text: str, search_terms: List[str]) -> Dict[str, bool]:
    return {term: term.lower() in full_text.lower() for term in search_terms}


def check_metadata(full_text: str) -> Dict[str, Any]:
    return {
        "author_email_present": bool(re.search(r'\b[\w.-]+?@\w+?\.\w+?\b', full_text)),
        "authors_list_heading_present": bool(re.search(r'(?:Authors?|AUTHORS?):\s*', full_text)),
        "keywords_list_heading_present": bool(re.search(r'(?:Keywords?|KEYWORDS?):\s*', full_text, re.IGNORECASE)),
        "word_count_estimate": len(full_text.split()) if full_text else "Missing"
    }


def check_disclosures(full_text: str) -> Dict[str, bool]:
    search_terms = [
        "conflict of interest statement", "COI statement",
        "ethics statement", "ethical approval",
        "funding statement", "acknowledgment of funding",
        "data availability statement", "data access statement"
    ]

    results = {}
    results["conflict_of_interest_statement"] = any(term.lower() in full_text.lower() for term in search_terms[0:2])
    results["ethics_statement"] = any(term.lower() in full_text.lower() for term in search_terms[2:4])
    results["funding_statement"] = any(term.lower() in full_text.lower() for term in search_terms[4:6])
    results["data_access_statement"] = any(term.lower() in full_text.lower() for term in search_terms[6:8])

    has_author_contribution = ("author contribution" in full_text.lower() or
                               "authors contribution" in full_text.lower())
    results["author_contribution_statement"] = has_author_contribution
    return results


def check_figures_and_tables_overview(full_text: str) -> Dict[str, bool]:
    return {
        "figures_mentioned": bool(re.search(r'Fig(?:ure)?\s*\d+', full_text, re.IGNORECASE)),
        "tables_mentioned": bool(re.search(r'Table\s*\d+', full_text, re.IGNORECASE)),
    }


def check_references_overview(full_text: str) -> Dict[str, Any]:
    has_references_section = bool(re.search(r"^\s*(?:References|Bibliography)\s*$", full_text, re.IGNORECASE | re.MULTILINE))
    citations_in_text = re.findall(r'\[\d+(?:,\s*\d+)*(?:–\d+)?\]', full_text)

    reference_list_items = []
    if has_references_section:
        match_ref_sec = re.search(r"^\s*(?:References|Bibliography)\s*$(.*)", full_text, re.IGNORECASE | re.MULTILINE | re.DOTALL)
        if match_ref_sec:
            references_text_block = match_ref_sec.group(1)
            reference_list_items = re.findall(r"^\s*(?:\[\d+\]|\d+\.\s)", references_text_block, re.MULTILINE)

    return {
        "references_section_heading_present": has_references_section,
        "citations_in_text_count": len(citations_in_text),
        "reference_list_items_count_heuristic": len(reference_list_items),
        "old_references_present_pre_1995": bool(re.search(r'\b(?:19[0-8]\d|199[0-4])\b', full_text)),
    }


def check_structure_overview(full_text: str) -> Dict[str, Any]:
    imrad_found_count = 0
    if re.search(r"^\s*Introduction\b", full_text, re.IGNORECASE | re.MULTILINE):
        imrad_found_count += 1
    if re.search(r"^\s*(?:Methods|Materials\s+and\s+Methods)\b", full_text, re.IGNORECASE | re.MULTILINE):
        imrad_found_count += 1
    if re.search(r"^\s*Results\b", full_text, re.IGNORECASE | re.MULTILINE):
        imrad_found_count += 1
    if re.search(r"^\s*Discussion\b", full_text, re.IGNORECASE | re.MULTILINE):
        imrad_found_count += 1

    has_abstract_section = bool(re.search(r"^\s*Abstract\b", full_text, re.IGNORECASE | re.MULTILINE))

    return {
        "abstract_section_heading_present": has_abstract_section,
        "imrad_structure_partially_present": imrad_found_count >= 3,
        "imrad_sections_detected_count": imrad_found_count
    }


def check_language_issues(text_for_lt: str) -> Dict[str, Any]:
    """Run LanguageTool plus a few regex-based formatting checks over the extracted text."""
    try:
        tool_path = os.environ.get('LT_PATH')

        # Assumption: if LT_PATH is set it points at a local LanguageTool installation.
        # language_tool_python locates its LanguageTool directory via the LTP_PATH
        # environment variable, so the setting is forwarded there. Without LT_PATH,
        # a LanguageTool server already running on localhost:8081 is used instead.
        if tool_path:
            os.environ.setdefault('LTP_PATH', tool_path)
            language_tool = language_tool_python.LanguageTool('en-US')
        else:
            language_tool = language_tool_python.LanguageTool('en-US', remote_server='http://localhost:8081')

        matches = language_tool.check(text_for_lt)
        issues = []

        for match in matches:
            # Rules that produce mostly noise on PDF-extracted text are skipped.
            if match.ruleId in ["EN_SPLIT_WORDS_HYPHEN", "UPPERCASE_SENTENCE_START", "MORFOLOGIK_RULE_EN_US"]:
                continue
            issues.append({
                "message": match.message,
                "context": match.context.strip(),
                "error_text_segment": match.context[match.offsetInContext : match.offsetInContext + match.errorLength],
                "suggestions": match.replacements[:3] if match.replacements else [],
                "category": match.category,
                "rule_id": match.ruleId,
                "offset": match.offset,
                "length": match.errorLength,
            })

        # Additional formatting check: a citation bracket glued to the preceding word.
        regex_pattern = r'\b(\w+)\[(\d+)\]'
        regex_matches = list(re.finditer(regex_pattern, text_for_lt))

        for match_re in regex_matches:
            word = match_re.group(1)
            number = match_re.group(2)
            issues.append({
                "message": f"Missing space before '[' in '{word}[{number}]'. Suggestion: '{word} [{number}]'.",
                "context": text_for_lt[max(match_re.start() - 40, 0):min(match_re.end() + 40, len(text_for_lt))].strip(),
                "error_text_segment": match_re.group(0),
                "suggestions": [f"{word} [{number}]"],
                "category": "Formatting",
                "rule_id": "MISSING_SPACE_BEFORE_BRACKET_CITATION",
                "offset": match_re.start(),
                "length": match_re.end() - match_re.start(),
            })

        return {"total_issues": len(issues), "issues": issues}
    except ConnectionRefusedError:
        error_msg = "LanguageTool Error: Connection to LT server (e.g., http://localhost:8081) refused. Ensure it's running, or configure LT_PATH for local JAR usage."
        print(error_msg)
        return {"error": error_msg, "issues": []}
    except Exception as e:
        error_msg = f"Error checking language issues: {type(e).__name__} - {e}"
        print(error_msg)
        return {"error": error_msg, "issues": []}


def check_figure_table_order(full_text: str) -> Dict[str, Any]:
    """Check that figure and table numbers are consecutive and first mentioned in order."""
    fig_pattern = r'Fig(?:ure)?\.?\s*(\d+)'
    fig_refs_in_order = [int(num) for num in re.findall(fig_pattern, full_text, re.IGNORECASE)]

    tbl_pattern = r'Table\s*(\d+)'
    tbl_refs_in_order = [int(num) for num in re.findall(tbl_pattern, full_text, re.IGNORECASE)]

    def analyze_numbering(refs_list, item_type="Item"):
        issues = []
        key_prefix = item_type.lower()
        if not refs_list:
            return {
                f"{key_prefix}_references_in_order_of_appearance": [],
                f"{key_prefix}_numbering_issues": ["Not mentioned."],
            }

        unique_sorted_refs = sorted(set(refs_list))
        max_ref_num = unique_sorted_refs[-1]

        expected_sequence = list(range(1, max_ref_num + 1))
        missing_numbers = [num for num in expected_sequence if num not in unique_sorted_refs]
        if missing_numbers:
            issues.append(f"Missing {key_prefix}(s) in sequence up to {max_ref_num}: {missing_numbers}")

        # Order in which each number is first mentioned.
        first_occurrence_map = {}
        unique_refs_in_appearance_order = []
        for ref_num in refs_list:
            if ref_num not in first_occurrence_map:
                first_occurrence_map[ref_num] = True
                unique_refs_in_appearance_order.append(ref_num)

        if unique_refs_in_appearance_order != sorted(unique_refs_in_appearance_order):
            issues.append(f"{item_type}s may not be first mentioned in strict numerical order. Sequence of first mentions: {unique_refs_in_appearance_order}")

        return {
            f"{key_prefix}_references_in_order_of_appearance": refs_list,
            f"{key_prefix}_numbering_issues": issues if issues else ["Appears OK based on simple checks."],
        }

    # Keys are prefixed per item type so the figure and table results do not overwrite each other.
    fig_analysis = analyze_numbering(fig_refs_in_order, "Figure")
    tbl_analysis = analyze_numbering(tbl_refs_in_order, "Table")

    return {**fig_analysis, **tbl_analysis}
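

# Illustrative example (assumed input, not executed): if figures are mentioned as
# "Fig. 2", then "Figure 1", then "Fig 4", analyze_numbering([2, 1, 4], "Figure")
# reports figure 3 as missing from the sequence up to 4 and flags that the first
# mentions [2, 1, 4] are not in numerical order.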


def highlight_issues_in_pdf(
    pdf_file_or_stream,
    word_coords_data: List[Dict[str, Any]],
    language_issues_list: List[Dict[str, Any]]
) -> Tuple[List[Dict[str, Any]], bytes]:
    """Map language issues back onto page coordinates and highlight them in the PDF."""
    doc = None
    try:
        if isinstance(pdf_file_or_stream, str) and os.path.exists(pdf_file_or_stream):
            doc = fitz.open(pdf_file_or_stream)
        elif hasattr(pdf_file_or_stream, 'read'):
            pdf_file_or_stream.seek(0)
            doc = fitz.open(stream=pdf_file_or_stream.read(), filetype="pdf")
        elif isinstance(pdf_file_or_stream, bytes):
            doc = fitz.open(stream=pdf_file_or_stream, filetype="pdf")
        else:
            print(f"Unsupported PDF input type in highlight_issues_in_pdf: {type(pdf_file_or_stream)}")
            return language_issues_list, b""
    except Exception as e:
        print(f"Error opening PDF in highlight_issues_in_pdf: {e}")
        return language_issues_list, b""

    issues_with_coords_and_page = []

    for issue_details in language_issues_list:
        issue_offset = issue_details["offset"]
        issue_length = issue_details["length"]
        error_text_to_search = issue_details["error_text_segment"]

        current_issue_output = issue_details.copy()
        current_issue_output["page"] = 0
        current_issue_output["coordinates"] = []

        # Collect the PDF words whose character ranges overlap the issue's range.
        candidate_pdf_words_info = []
        for word_info in word_coords_data:
            word_start_offset = word_info['start_offset']
            word_end_offset = word_start_offset + len(word_info['text'])
            if word_start_offset < (issue_offset + issue_length) and issue_offset < word_end_offset:
                candidate_pdf_words_info.append(word_info)

        if not candidate_pdf_words_info:
            issues_with_coords_and_page.append(current_issue_output)
            continue

        page_num_for_issue = candidate_pdf_words_info[0]["page_num"]
        page_to_search_on = doc[page_num_for_issue]

        # Search only inside the union of the candidate word rectangles, slightly padded.
        clip_search_rect = fitz.Rect(candidate_pdf_words_info[0]['rect'])
        for i in range(1, len(candidate_pdf_words_info)):
            clip_search_rect.include_rect(candidate_pdf_words_info[i]['rect'])

        clip_search_rect.x0 -= 3
        clip_search_rect.y0 -= 3
        clip_search_rect.x1 += 3
        clip_search_rect.y1 += 3
        clip_search_rect.normalize()

        found_rects_on_page = []
        if error_text_to_search.strip():
            try:
                found_rects_on_page = page_to_search_on.search_for(error_text_to_search, clip=clip_search_rect, quads=False)
            except Exception as search_e:
                print(f"PyMuPDF search_for error: '{search_e}' for text '{error_text_to_search}' on page {page_num_for_issue + 1}. Skipping this highlight.")

        if found_rects_on_page:
            current_issue_output["page"] = page_num_for_issue + 1

            overall_bounds = fitz.Rect(found_rects_on_page[0])
            for r_idx in range(1, len(found_rects_on_page)):
                overall_bounds.include_rect(found_rects_on_page[r_idx])
            current_issue_output["coordinates"] = [
                round(overall_bounds.x0, 2), round(overall_bounds.y0, 2),
                round(overall_bounds.x1, 2), round(overall_bounds.y1, 2)
            ]

            for rect_to_highlight in found_rects_on_page:
                if not rect_to_highlight.is_empty and rect_to_highlight.width > 0.1 and rect_to_highlight.height > 0.1:
                    highlight_annot = page_to_search_on.add_highlight_annot(rect_to_highlight)
                    if highlight_annot:
                        highlight_annot.set_colors(stroke=(1, 1, 0))
                        highlight_annot.update(opacity=0.4)
        issues_with_coords_and_page.append(current_issue_output)

    output_pdf_bytes = io.BytesIO()
    try:
        doc.save(output_pdf_bytes, garbage=3, deflate=True)
        annotated_pdf_bytes_content = output_pdf_bytes.getvalue()
    except Exception as e:
        print(f"Error saving annotated PDF: {e}")
        annotated_pdf_bytes_content = b""
    finally:
        doc.close()
        output_pdf_bytes.close()

    return issues_with_coords_and_page, annotated_pdf_bytes_content


def analyze_pdf(pdf_input_data) -> Tuple[Dict[str, Any], bytes]:
    """Run all checks on a PDF (path, bytes or file-like object) and return (report, annotated PDF bytes)."""
    results = {"language_issues": [], "general_document_checks": {}, "analysis_errors": []}
    annotated_pdf_bytes = None

    # Normalise the input to raw bytes so each stage can work on its own stream.
    input_bytes_content = None
    if hasattr(pdf_input_data, 'read'):
        pdf_input_data.seek(0)
        input_bytes_content = pdf_input_data.read()
    elif isinstance(pdf_input_data, bytes):
        input_bytes_content = pdf_input_data
    elif isinstance(pdf_input_data, str) and os.path.exists(pdf_input_data):
        with open(pdf_input_data, "rb") as f_path:
            input_bytes_content = f_path.read()
    else:
        results["analysis_errors"].append(f"Invalid PDF input data type: {type(pdf_input_data)}")
        return results, None

    if not input_bytes_content:
        results["analysis_errors"].append("PDF input data is empty or unreadable.")
        return results, None

    try:
        # General document checks on the PyMuPDF4LLM markdown extraction.
        pdf_stream_for_general = io.BytesIO(input_bytes_content)
        full_text_for_general_checks = extract_pdf_text_for_general_checks(pdf_stream_for_general)
        pdf_stream_for_general.close()

        if full_text_for_general_checks:
            results["general_document_checks"] = {
                "metadata": check_metadata(full_text_for_general_checks),
                "disclosures": check_disclosures(full_text_for_general_checks),
                "figures_tables_overview": check_figures_and_tables_overview(full_text_for_general_checks),
                "references_overview": check_references_overview(full_text_for_general_checks),
                "structure_overview": check_structure_overview(full_text_for_general_checks),
                "figure_table_order": check_figure_table_order(full_text_for_general_checks),
            }
        else:
            results["analysis_errors"].append("Failed to extract text using PyMuPDF4LLM for general checks.")

        # Language checks on the word-level extraction (needed for highlighting).
        pdf_stream_for_lt = io.BytesIO(input_bytes_content)
        text_for_lt, word_coords_data = extract_word_data_and_text_for_lt(pdf_stream_for_lt)
        pdf_stream_for_lt.close()

        if not text_for_lt and not word_coords_data:
            results["analysis_errors"].append("Could not extract word data for language analysis and highlighting.")
        else:
            language_issues_result = check_language_issues(text_for_lt)
            if "error" in language_issues_result:
                results["analysis_errors"].append(f"Language check error: {language_issues_result['error']}")

            lt_issues_list = language_issues_result.get("issues", [])

            if lt_issues_list:
                pdf_stream_for_highlighting = io.BytesIO(input_bytes_content)
                updated_lt_issues_list, annotated_pdf_bytes = highlight_issues_in_pdf(
                    pdf_stream_for_highlighting,
                    word_coords_data,
                    lt_issues_list
                )
                pdf_stream_for_highlighting.close()
                results["language_issues"] = updated_lt_issues_list
            else:
                results["language_issues"] = lt_issues_list

        if not results["analysis_errors"]:
            del results["analysis_errors"]

        return results, annotated_pdf_bytes

    except Exception as e:
        error_msg = f"Critical error in analyze_pdf: {type(e).__name__} - {e}"
        print(error_msg)
        traceback.print_exc()

        current_errors = results.get("analysis_errors", [])
        current_errors.append(error_msg)
        results["analysis_errors"] = current_errors
        return results, None
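

# Minimal programmatic usage sketch (not wired into the Gradio app; the file names
# below are placeholders): run the full analysis on a PDF from disk and keep the
# annotated copy if any issues were highlighted.
def run_analysis_on_file(pdf_path: str, annotated_out_path: str = "annotated_output.pdf") -> Dict[str, Any]:
    report, annotated_bytes = analyze_pdf(pdf_path)
    if annotated_bytes:
        with open(annotated_out_path, "wb") as out_file:
            out_file.write(annotated_bytes)
        print(f"Annotated PDF written to {annotated_out_path}")
    return report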


def process_upload(file_bytes_from_gradio):
    """Gradio callback: run the analysis and return (JSON report, path to annotated PDF or None)."""
    if file_bytes_from_gradio is None:
        return json.dumps({"error_message": "No file uploaded"}, indent=2), None

    try:
        # gr.File(type="binary") delivers the upload as raw bytes.
        results, annotated_pdf_output_bytes = analyze_pdf(file_bytes_from_gradio)

        results_json = json.dumps(results, indent=2, ensure_ascii=False)

        if annotated_pdf_output_bytes:
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_annotated_pdf_file:
                tmp_annotated_pdf_file.write(annotated_pdf_output_bytes)
            return results_json, tmp_annotated_pdf_file.name
        else:
            return results_json, None

    except Exception as e:
        error_msg = f"Error processing file in Gradio interface: {type(e).__name__} - {e}"
        print(error_msg)
        return json.dumps({"error_message": error_msg}, indent=2), None


def create_interface():
    with gr.Blocks(title="PDF Analyzer", theme=gr.themes.Glass()) as interface:
        gr.Markdown("# PDF Document Analyzer")
        gr.Markdown(
            "Upload a PDF to check for common manuscript issues. "
            "Language checks use LanguageTool (EN-US). Ensure your LanguageTool setup is correct "
            "(e.g., local server on port 8081, or LT_PATH environment variable for local JAR)."
        )

        with gr.Row():
            file_input = gr.File(
                label="Upload PDF Document",
                file_types=[".pdf"],
                type="binary"
            )
            analyze_btn = gr.Button("Analyze PDF", variant="primary", scale=0)

        gr.Markdown("## Analysis Results")
        with gr.Tabs():
            with gr.TabItem("Detailed Report"):
                results_output = gr.JSON(label="JSON Report", scale=2)
            with gr.TabItem("Annotated PDF"):
                pdf_output_display = gr.File(label="Download Annotated PDF (if issues were highlighted)", interactive=False)

        analyze_btn.click(
            fn=process_upload,
            inputs=[file_input],
            outputs=[results_output, pdf_output_display]
        )
        gr.Markdown("---")
        gr.Markdown("Developed with PyMuPDF, LanguageTool, and Gradio. Alpha version.")
    return interface


if __name__ == "__main__":
    print("PDF Analyzer launching...")
    print("Ensure LanguageTool is accessible (e.g., server at http://localhost:8081 or LT_PATH set).")

    interface = create_interface()
    interface.launch(
        share=True,
    )