vedaMD

Sleeping

App Files Files Community

vedaMD / src /enhanced_pdf_processor.py

sniro23

Initial commit without binary files

19aaa42 about 1 month ago

raw

history blame contribute delete

6.96 kB

	import os
	import argparse
	from unstructured.partition.pdf import partition_pdf
	import logging
	from tqdm import tqdm
	import re

	# Configure the root logger once at import time: records at INFO and above
	# are written to pdf_processing.log, each prefixed with timestamp and level.
	logging.basicConfig(
	    level=logging.INFO,
	    format='%(asctime)s - %(levelname)s - %(message)s',
	    filename='pdf_processing.log',
	)

	# Matches citations such as
	# "Sri Lanka Journal of Obstetrics and Gynaecology, 2021; 43 (2): 101-110".
	# Whitespace is matched with \s+ so a citation wrapped across lines still matches.
	_SLJOG_CITATION_RE = re.compile(
	    r"Sri\s+Lanka\s+Journal\s+of\s+Obstetrics\s+and\s+Gynaecology,"
	    r"\s+\d{4};\s+\d+\s+\(.*?\):\s+\d+-\d+",
	    re.IGNORECASE,
	)
	# Only the leading elements are scanned for a journal citation.
	_CITATION_SCAN_ELEMENTS = 20
	# A first text element at least this long is assumed to be body text, not a title.
	_MAX_TITLE_LENGTH = 150


	def _extract_citation(elements, fallback):
	    """Return a citation found in *elements*, or *fallback* when none is usable."""
	    head_text = "\n".join(el.text for el in elements[:_CITATION_SCAN_ELEMENTS])
	    match = _SLJOG_CITATION_RE.search(head_text)
	    if match:
	        # Collapse any line breaks/runs of whitespace inside the match.
	        return " ".join(match.group(0).split())

	    # No journal citation: use the first non-empty text element if it is short.
	    first_text = next((el.text for el in elements if el.text.strip()), None)
	    if first_text and len(first_text) < _MAX_TITLE_LENGTH:
	        return first_text.strip()
	    return fallback


	def _yaml_quote(value):
	    """Return *value* as a single-line, double-quoted YAML scalar."""
	    single_line = " ".join(value.split())
	    escaped = single_line.replace("\\", "\\\\").replace('"', '\\"')
	    return f'"{escaped}"'


	def _element_to_markdown(element):
	    """Return the Markdown fragments (possibly none) for one partitioned element."""
	    type_name = str(type(element))
	    metadata = getattr(element, "metadata", None)

	    if "Table" in type_name:
	        table_html = getattr(metadata, "text_as_html", None)
	        # Tables without an HTML rendering are skipped.
	        return ["## Table", table_html] if table_html else []

	    if "Image" in type_name:
	        image_path = getattr(metadata, "image_path", None)
	        if not image_path:
	            return []
	        # Markdown links always use forward slashes, whatever the host OS.
	        image_filename = os.path.basename(image_path)
	        return [f"![{element.text}](images/{image_filename})"]

	    text = getattr(element, "text", None)
	    return [text] if text else []


	def process_pdf_with_unstructured(pdf_path, output_dir):
	    """
	    Processes a PDF file using unstructured.io, extracts content, and saves it as a Markdown file.

	    The output is written to ``<output_dir>/<pdf name>/<pdf name>.md`` and
	    extracted images go to ``<output_dir>/<pdf name>/images``. The Markdown
	    starts with a YAML front-matter block holding a ``citation`` value taken
	    from the document text when possible, otherwise derived from the filename.

	    Args:
	        pdf_path (str): The path to the input PDF file.
	        output_dir (str): The directory to save the output Markdown file and extracted images.

	    Returns:
	        bool: True when the Markdown file was written; False when the PDF
	        does not exist or any processing step raised an exception.
	    """
	    if not os.path.exists(pdf_path):
	        logging.error("PDF file not found at %s", pdf_path)
	        print(f"Error: PDF file not found at {pdf_path}")
	        return False

	    # Fallback citation: the filename with dashes/underscores turned into spaces.
	    pdf_basename = os.path.splitext(os.path.basename(pdf_path))[0]
	    cleaned_citation = re.sub(r'[\-_]', ' ', pdf_basename).replace('.pdf', '')

	    print(f"Processing {pdf_path} with unstructured.io...")
	    logging.info("Processing %s with unstructured.io...", pdf_path)

	    try:
	        # Each PDF gets its own output directory with an images/ subfolder.
	        pdf_output_dir = os.path.join(output_dir, pdf_basename)
	        image_output_path = os.path.join(pdf_output_dir, "images")
	        os.makedirs(image_output_path, exist_ok=True)

	        print(f"Extracting images to: {image_output_path}")

	        elements = partition_pdf(
	            filename=pdf_path,
	            strategy="hi_res",
	            extract_images_in_pdf=True,
	            infer_table_structure=True,
	            chunking_strategy="by_title",
	            max_characters=4096,
	            new_after_n_chars=3800,
	            combine_text_under_n_chars=2048,
	            image_output_dir_path=image_output_path,
	        )

	        # Citation extraction is best-effort: any failure keeps the filename fallback.
	        try:
	            cleaned_citation = _extract_citation(elements, cleaned_citation)
	        except Exception as citation_exc:
	            print(f"Could not automatically extract a detailed citation, falling back to filename. Reason: {citation_exc}")
	            logging.warning(
	                "Citation extraction failed for %s, using fallback. Error: %s",
	                pdf_path,
	                citation_exc,
	            )

	        # YAML front matter is emitted as one fragment so it stays contiguous.
	        markdown_content = [f"---\ncitation: {_yaml_quote(cleaned_citation)}\n---"]
	        for element in elements:
	            markdown_content.extend(_element_to_markdown(element))

	        output_md_path = os.path.join(pdf_output_dir, f"{pdf_basename}.md")
	        print(f"Saving Markdown output to: {output_md_path}")

	        with open(output_md_path, "w", encoding="utf-8") as f:
	            # Real blank lines between fragments (not the literal text "\n\n").
	            f.write("\n\n".join(markdown_content))
	            f.write("\n")

	        print(f"Successfully processed {pdf_path}")
	        logging.info("Successfully processed %s", pdf_path)
	        return True

	    except Exception as e:
	        # Top-level boundary for one file: report, keep the traceback in the
	        # log, and let the caller continue with the next PDF.
	        print(f"An error occurred while processing {pdf_path}: {e}")
	        logging.exception("An error occurred while processing %s: %s", pdf_path, e)
	        return False

	def process_directory(input_dir, output_dir):
	    """
	    Processes all PDF files in a given directory.

	    Only regular files directly inside ``input_dir`` whose name ends in
	    ``.pdf`` (case-insensitive) are processed, in sorted name order so runs
	    are reproducible. A summary of successes and failures is printed and
	    logged at the end.

	    Args:
	        input_dir (str): The path to the directory containing PDF files.
	        output_dir (str): The directory to save the output Markdown files.

	    Returns:
	        None
	    """
	    pdf_files = sorted(
	        name
	        for name in os.listdir(input_dir)
	        # Accept .PDF/.Pdf too, and ignore directories that merely end in .pdf.
	        if name.lower().endswith('.pdf')
	        and os.path.isfile(os.path.join(input_dir, name))
	    )
	    if not pdf_files:
	        print(f"No PDF files found in {input_dir}")
	        logging.warning("No PDF files found in %s", input_dir)
	        return

	    print(f"Found {len(pdf_files)} PDF files to process.")

	    success_count = 0
	    failure_count = 0

	    for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
	        pdf_path = os.path.join(input_dir, pdf_file)
	        # The per-file processor reports failure via its boolean return value.
	        if process_pdf_with_unstructured(pdf_path, output_dir):
	            success_count += 1
	        else:
	            failure_count += 1

	    print("\nProcessing complete.")
	    print(f"Successfully processed: {success_count} files")
	    print(f"Failed to process: {failure_count} files")
	    logging.info(
	        "Processing complete. Success: %s, Failed: %s",
	        success_count,
	        failure_count,
	    )


	def main():
	    """
	    Command-line entry point.

	    Parses ``input_path`` and ``--output_dir`` from the command line, creates
	    the output directory, then processes either every PDF in a directory or a
	    single PDF file. Any other input is reported as an error on stdout and in
	    the log.
	    """
	    parser = argparse.ArgumentParser(description="Process a PDF file or a directory of PDF files with unstructured.io to extract content as Markdown.")
	    parser.add_argument("input_path", type=str, help="The path to the input PDF file or directory.")
	    parser.add_argument("--output_dir", type=str, default="src/processed_markdown", help="The directory to save the output Markdown file.")

	    args = parser.parse_args()

	    # Ensure the main output directory exists
	    os.makedirs(args.output_dir, exist_ok=True)

	    if os.path.isdir(args.input_path):
	        process_directory(args.input_path, args.output_dir)
	    # Case-insensitive suffix check so "REPORT.PDF" is accepted as well.
	    elif os.path.isfile(args.input_path) and args.input_path.lower().endswith('.pdf'):
	        process_pdf_with_unstructured(args.input_path, args.output_dir)
	    else:
	        print("Error: Invalid input path. Please provide a valid PDF file or a directory.")
	        logging.error("Invalid input path: %s", args.input_path)


	# Run the command-line interface only when this file is executed as a
	# script; importing the module does not trigger any processing.
	if __name__ == "__main__":
	    main()