vedaMD / src /enhanced_pdf_processor.py
sniro23's picture
Initial commit without binary files
19aaa42
import os
import argparse
from unstructured.partition.pdf import partition_pdf
import logging
from tqdm import tqdm
import re
# Configure the root logger once at import time: records at INFO and above
# are appended to a log file (relative path, so it is resolved against the
# current working directory) using a "timestamp - level - message" layout.
_LOGGING_SETTINGS = {
    "filename": 'pdf_processing.log',
    "level": logging.INFO,
    "format": '%(asctime)s - %(levelname)s - %(message)s',
}
logging.basicConfig(**_LOGGING_SETTINGS)
def process_pdf_with_unstructured(pdf_path, output_dir):
    """
    Processes a PDF file using unstructured.io, extracts content, and saves it as a Markdown file.

    The Markdown file is written to ``<output_dir>/<pdf name>/<pdf name>.md`` and
    extracted images are requested under ``<output_dir>/<pdf name>/images``.
    The Markdown starts with a YAML front-matter block holding a ``citation``
    field, followed by the document elements separated by blank lines.

    Args:
        pdf_path (str): The path to the input PDF file.
        output_dir (str): The directory to save the output Markdown file and extracted images.

    Returns:
        bool: True when the Markdown file was written; False when the PDF does
        not exist or any processing step raised (the error is printed and
        logged rather than propagated).
    """
    if not os.path.exists(pdf_path):
        logging.error(f"PDF file not found at {pdf_path}")
        print(f"Error: PDF file not found at {pdf_path}")
        return False

    # Cleaned up filename for fallback citation: separators become spaces and
    # runs of whitespace are collapsed so the YAML value stays on one line.
    pdf_basename = os.path.splitext(os.path.basename(pdf_path))[0]
    fallback_citation = re.sub(r'[\-_]', ' ', pdf_basename).replace('.pdf', '')
    fallback_citation = " ".join(fallback_citation.split())

    print(f"Processing {pdf_path} with unstructured.io...")
    logging.info(f"Processing {pdf_path} with unstructured.io...")

    try:
        # Create a specific output directory for images from this PDF
        pdf_output_dir = os.path.join(output_dir, pdf_basename)
        os.makedirs(pdf_output_dir, exist_ok=True)
        image_output_path = os.path.join(pdf_output_dir, "images")
        os.makedirs(image_output_path, exist_ok=True)
        print(f"Extracting images to: {image_output_path}")

        elements = partition_pdf(
            filename=pdf_path,
            strategy="hi_res",
            extract_images_in_pdf=True,
            infer_table_structure=True,
            chunking_strategy="by_title",
            max_characters=4096,
            new_after_n_chars=3800,
            combine_text_under_n_chars=2048,
            image_output_dir_path=image_output_path,
        )

        cleaned_citation = _extract_citation(elements, fallback_citation, pdf_path)
        markdown_content = _build_markdown_parts(elements, cleaned_citation)

        # Construct the output Markdown path
        output_md_filename = f"{pdf_basename}.md"
        output_md_path = os.path.join(pdf_output_dir, output_md_filename)
        print(f"Saving Markdown output to: {output_md_path}")
        with open(output_md_path, "w", encoding="utf-8") as f:
            # Real blank lines between parts (the previous "\\n\\n" wrote the
            # literal characters backslash-n into the file).
            f.write("\n\n".join(markdown_content))

        print(f"Successfully processed {pdf_path}")
        logging.info(f"Successfully processed {pdf_path}")
        return True
    except Exception as e:
        # Top-level boundary for one document: report, keep the traceback in
        # the log, and signal failure to the caller instead of raising.
        print(f"An error occurred while processing {pdf_path}: {e}")
        logging.exception(f"An error occurred while processing {pdf_path}: {e}")
        return False


def _extract_citation(elements, fallback, pdf_path):
    """Return a citation found in the document text, or *fallback* if none is found."""
    try:
        # Only the first 20 elements are scanned. Whitespace (including line
        # breaks inside and between elements) is collapsed to single spaces so
        # a citation wrapped across lines still matches the pattern below.
        head_text = " ".join(
            " ".join((el.text or "") for el in elements[:20]).split()
        )
        # Look for "Sri Lanka Journal of Obstetrics and Gynaecology, YYYY; V (I): P-P"
        sljog_pattern = (
            r"Sri Lanka Journal of Obstetrics and Gynaecology, "
            r"\d{4}; \d+ \(.*?\): \d+-\d+"
        )
        match = re.search(sljog_pattern, head_text, re.IGNORECASE)
        if match:
            return match.group(0).strip()

        # Fallback to the first non-empty text element if no specific pattern is found
        first_title = next(
            (el.text for el in elements if el.text and el.text.strip()), None
        )
        if first_title and len(first_title) < 150:  # Assume titles are reasonably short
            return " ".join(first_title.split())
    except Exception as citation_exc:
        # Citation detection is best-effort; never fail the whole document for it.
        print(f"Could not automatically extract a detailed citation, falling back to filename. Reason: {citation_exc}")
        logging.warning(f"Citation extraction failed for {pdf_path}, using fallback. Error: {citation_exc}")
    return fallback


def _build_markdown_parts(elements, citation):
    """Return the list of Markdown fragments (front matter first) for *elements*."""
    # Escape backslashes and double quotes so the citation is a valid YAML
    # double-quoted scalar even when the title itself contains quotes.
    escaped_citation = citation.replace("\\", "\\\\").replace('"', '\\"')
    parts = ["---", f"citation: \"{escaped_citation}\"", "---"]

    for element in elements:
        # Element classes are matched by name so this module does not need to
        # import the concrete element types.
        kind = type(element).__name__
        metadata = getattr(element, "metadata", None)
        if "Table" in kind:
            # NOTE(review): tables without an HTML rendering are skipped, as before.
            table_html = getattr(metadata, "text_as_html", None)
            if table_html:
                parts.append("## Table")
                parts.append(table_html)
        elif "Image" in kind:
            image_path = getattr(metadata, "image_path", None)
            if image_path:
                image_filename = os.path.basename(image_path)
                # Markdown links use forward slashes regardless of the host OS.
                relative_image_path = f"images/{image_filename}"
                parts.append(f"![{element.text}]({relative_image_path})")
        else:
            parts.append(element.text or "")
    return parts
def process_directory(input_dir, output_dir):
    """
    Processes all PDF files in a given directory.

    Only regular files directly inside ``input_dir`` are considered (no
    recursion). The ``.pdf`` extension is matched case-insensitively and files
    are handled in sorted name order so repeated runs behave the same way.
    A success/failure summary is printed and logged at the end.

    Args:
        input_dir (str): The path to the directory containing PDF files.
        output_dir (str): The directory to save the output Markdown files.
    """
    # os.listdir returns names in arbitrary order, so sort for reproducibility.
    pdf_files = sorted(
        name
        for name in os.listdir(input_dir)
        if name.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(input_dir, name))
    )
    if not pdf_files:
        print(f"No PDF files found in {input_dir}")
        return

    print(f"Found {len(pdf_files)} PDF files to process.")
    success_count = 0
    failure_count = 0
    for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
        pdf_path = os.path.join(input_dir, pdf_file)
        # A falsy result means the file could not be converted; keep going so
        # one bad PDF does not stop the batch.
        if process_pdf_with_unstructured(pdf_path, output_dir):
            success_count += 1
        else:
            failure_count += 1

    print("\nProcessing complete.")
    print(f"Successfully processed: {success_count} files")
    print(f"Failed to process: {failure_count} files")
    logging.info(f"Processing complete. Success: {success_count}, Failed: {failure_count}")
def main():
    """
    Command-line entry point: convert one PDF, or every PDF in a directory, to Markdown.

    Reads ``input_path`` and the optional ``--output_dir`` from the command
    line, creates the output directory if needed, and dispatches to the
    directory or single-file processor.

    Returns:
        int: 0 when a directory was processed or the single PDF was converted;
        1 when the input path is invalid or the single PDF failed. Per-file
        failures in a directory run are reported in the printed summary and do
        not change the return value.
    """
    parser = argparse.ArgumentParser(description="Process a PDF file or a directory of PDF files with unstructured.io to extract content as Markdown.")
    parser.add_argument("input_path", type=str, help="The path to the input PDF file or directory.")
    parser.add_argument("--output_dir", type=str, default="src/processed_markdown", help="The directory to save the output Markdown file.")
    args = parser.parse_args()

    # Ensure the main output directory exists
    os.makedirs(args.output_dir, exist_ok=True)

    if os.path.isdir(args.input_path):
        process_directory(args.input_path, args.output_dir)
        return 0

    # Accept ".PDF" / ".Pdf" as well as ".pdf".
    if os.path.isfile(args.input_path) and args.input_path.lower().endswith('.pdf'):
        converted = process_pdf_with_unstructured(args.input_path, args.output_dir)
        return 0 if converted else 1

    print("Error: Invalid input path. Please provide a valid PDF file or a directory.")
    logging.error(f"Invalid input path: {args.input_path}")
    return 1


if __name__ == "__main__":
    import sys

    # Propagate the status so shell scripts and CI can detect failures.
    sys.exit(main())