# Page header captured together with this file (not part of the program):
#   vedaMD | Sleeping | File size: 6,958 Bytes | 19aaa42

import os
import argparse
from unstructured.partition.pdf import partition_pdf
import logging
from tqdm import tqdm
import re

# Configure the root logger once at import time: records at INFO and above
# are written to pdf_processing.log, each stamped with time and severity.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename='pdf_processing.log',
)

# Matches journal citations such as
# "Sri Lanka Journal of Obstetrics and Gynaecology, 2021; 43 (2): 101-110".
# Whitespace is matched loosely because text extracted from a PDF may wrap in
# the middle of the citation, and the page range may use an en dash.
_SLJOG_CITATION_RE = re.compile(
    r"Sri\s+Lanka\s+Journal\s+of\s+Obstetrics\s+and\s+Gynaecology,\s*"
    r"\d{4};\s*\d+\s*\(.*?\):\s*\d+\s*[-\u2013]\s*\d+",
    re.IGNORECASE,
)

# Only the first few elements are scanned for a citation line.
_CITATION_SCAN_ELEMENTS = 20

# A leading text element at least this long is assumed to be body text, not a title.
_MAX_TITLE_LENGTH = 150


def _element_text(element):
    """Return the element's text, or an empty string when it has none."""
    return getattr(element, "text", None) or ""


def _extract_citation(elements, fallback):
    """Pick the best available citation string for a parsed document.

    Preference order: a journal citation found near the start of the
    document, then the first non-empty text element if it is short enough to
    be a title, then ``fallback``. The result is always a single line.
    """
    head_text = "\n".join(
        _element_text(el) for el in elements[:_CITATION_SCAN_ELEMENTS]
    )
    match = _SLJOG_CITATION_RE.search(head_text)
    if match:
        # Collapse any line breaks/whitespace runs inside the matched citation.
        return " ".join(match.group(0).split())

    first_text = next(
        (text for text in map(_element_text, elements) if text.strip()), None
    )
    if first_text is not None:
        title = " ".join(first_text.split())
        if len(title) < _MAX_TITLE_LENGTH:
            return title
    return fallback


def _render_markdown(elements, citation):
    """Build the Markdown blocks (front matter first) for the parsed elements."""
    # Escape backslashes and quotes so the citation stays a valid YAML
    # double-quoted scalar.
    escaped = citation.replace("\\", "\\\\").replace('"', '\\"')
    blocks = ["---", f'citation: "{escaped}"', "---"]

    for element in elements:
        # Element classes come from the third-party library; they are
        # recognised by class name, as the original string-based check did.
        kind = type(element).__name__
        metadata = getattr(element, "metadata", None)
        text = _element_text(element)

        if "Table" in kind:
            table_html = getattr(metadata, "text_as_html", None)
            if table_html:
                blocks.append("## Table")
                blocks.append(table_html)
            elif text.strip():
                # No HTML rendering available: keep the table's plain text
                # rather than silently dropping its content.
                blocks.append(text)
        elif "Image" in kind:
            image_path = getattr(metadata, "image_path", None)
            if image_path:
                image_filename = os.path.basename(image_path)
                # Markdown links use forward slashes on every platform.
                blocks.append(f"![{text}](images/{image_filename})")
        elif text:
            blocks.append(text)

    return blocks


def process_pdf_with_unstructured(pdf_path, output_dir):
    """
    Processes a PDF file using unstructured.io, extracts content, and saves it as a Markdown file.

    The output is written to ``<output_dir>/<pdf name>/<pdf name>.md`` and
    extracted images are requested under ``<output_dir>/<pdf name>/images``.
    The Markdown starts with a YAML front matter block holding a citation,
    taken from the document text when possible and from the file name
    otherwise.

    Args:
        pdf_path (str): The path to the input PDF file.
        output_dir (str): The directory to save the output Markdown file and extracted images.

    Returns:
        bool: True when the Markdown file was written, False when the PDF is
        missing or any processing step raised an exception (the error is
        printed and logged, never propagated).
    """
    if not os.path.exists(pdf_path):
        logging.error(f"PDF file not found at {pdf_path}")
        print(f"Error: PDF file not found at {pdf_path}")
        return False

    # Fallback citation: the file name with dashes/underscores turned into spaces.
    pdf_basename = os.path.splitext(os.path.basename(pdf_path))[0]
    fallback_citation = (
        re.sub(r"[-_]", " ", pdf_basename).replace(".pdf", "").strip()
    )

    print(f"Processing {pdf_path} with unstructured.io...")
    logging.info(f"Processing {pdf_path} with unstructured.io...")

    try:
        # Each PDF gets its own output directory so images cannot collide.
        pdf_output_dir = os.path.join(output_dir, pdf_basename)
        image_output_path = os.path.join(pdf_output_dir, "images")
        os.makedirs(image_output_path, exist_ok=True)

        print(f"Extracting images to: {image_output_path}")

        # NOTE(review): newer unstructured releases appear to document
        # `extract_image_block_types` / `extract_image_block_output_dir` in
        # place of the two image options below; confirm against the
        # installed version before changing them.
        elements = partition_pdf(
            filename=pdf_path,
            strategy="hi_res",
            extract_images_in_pdf=True,
            infer_table_structure=True,
            chunking_strategy="by_title",
            max_characters=4096,
            new_after_n_chars=3800,
            combine_text_under_n_chars=2048,
            image_output_dir_path=image_output_path,
        )

        # Citation extraction is best-effort: any failure falls back to the
        # file-name-based citation instead of aborting the conversion.
        try:
            citation = _extract_citation(elements, fallback_citation)
        except Exception as citation_exc:
            citation = fallback_citation
            print(f"Could not automatically extract a detailed citation, falling back to filename. Reason: {citation_exc}")
            logging.warning(f"Citation extraction failed for {pdf_path}, using fallback. Error: {citation_exc}")

        markdown_content = _render_markdown(elements, citation)

        output_md_path = os.path.join(pdf_output_dir, f"{pdf_basename}.md")
        print(f"Saving Markdown output to: {output_md_path}")

        with open(output_md_path, "w", encoding="utf-8") as f:
            # Blank line between blocks so Markdown treats them as separate paragraphs.
            f.write("\n\n".join(markdown_content))

        print(f"Successfully processed {pdf_path}")
        logging.info(f"Successfully processed {pdf_path}")
        return True

    except Exception as e:
        print(f"An error occurred while processing {pdf_path}: {e}")
        # logging.exception records the traceback alongside the message.
        logging.exception(f"An error occurred while processing {pdf_path}: {e}")
        return False

def process_directory(input_dir, output_dir):
    """
    Processes all PDF files in a given directory.

    Only regular files directly inside ``input_dir`` whose names end in
    ``.pdf`` (case-insensitive) are processed, in sorted order so runs are
    reproducible. A summary of successes and failures is printed and logged.

    Args:
        input_dir (str): The path to the directory containing PDF files.
        output_dir (str): The directory to save the output Markdown files.

    Returns:
        None
    """
    pdf_files = sorted(
        name
        for name in os.listdir(input_dir)
        # Accept ".PDF" as well as ".pdf", and skip sub-directories that
        # merely happen to carry a .pdf suffix.
        if name.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(input_dir, name))
    )
    if not pdf_files:
        print(f"No PDF files found in {input_dir}")
        return

    print(f"Found {len(pdf_files)} PDF files to process.")

    success_count = 0
    failure_count = 0

    for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
        pdf_path = os.path.join(input_dir, pdf_file)
        # The per-file function reports failure via its return value and
        # never raises, so one bad PDF does not stop the batch.
        if process_pdf_with_unstructured(pdf_path, output_dir):
            success_count += 1
        else:
            failure_count += 1

    print("\nProcessing complete.")
    print(f"Successfully processed: {success_count} files")
    print(f"Failed to process: {failure_count} files")
    logging.info(f"Processing complete. Success: {success_count}, Failed: {failure_count}")


def main():
    """Command-line entry point: convert one PDF, or every PDF in a directory, to Markdown.

    Reads ``input_path`` and the optional ``--output_dir`` from the command
    line. The input is validated before anything is created on disk, so an
    invalid path only produces an error message and a log entry.
    """
    parser = argparse.ArgumentParser(description="Process a PDF file or a directory of PDF files with unstructured.io to extract content as Markdown.")
    parser.add_argument("input_path", type=str, help="The path to the input PDF file or directory.")
    parser.add_argument("--output_dir", type=str, default="src/processed_markdown", help="The directory to save the output Markdown file.")

    args = parser.parse_args()

    is_directory = os.path.isdir(args.input_path)
    # Extension check is case-insensitive so "REPORT.PDF" is accepted too.
    is_pdf_file = (
        os.path.isfile(args.input_path)
        and args.input_path.lower().endswith('.pdf')
    )

    if not (is_directory or is_pdf_file):
        print("Error: Invalid input path. Please provide a valid PDF file or a directory.")
        logging.error(f"Invalid input path: {args.input_path}")
        return

    # Ensure the main output directory exists (only once the input is known to be valid)
    os.makedirs(args.output_dir, exist_ok=True)

    if is_directory:
        process_directory(args.input_path, args.output_dir)
    else:
        process_pdf_with_unstructured(args.input_path, args.output_dir)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()