"""Convert PDF files to Markdown (plus extracted images) using unstructured.io."""

import argparse
import logging
import os
import re

from tqdm import tqdm
from unstructured.partition.pdf import partition_pdf

# Set up logging
logging.basicConfig(
    filename='pdf_processing.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Journal citation line, e.g.
# "Sri Lanka Journal of Obstetrics and Gynaecology, 2020; 42 (1): 10-15".
# Raw string with single backslashes so that \d, \( and \) are regex escapes.
_SLJOG_CITATION_RE = re.compile(
    r"Sri Lanka Journal of Obstetrics and Gynaecology, \d{4}; \d+ \(.*?\): \d+-\d+",
    re.IGNORECASE,
)

# Only the first few elements are scanned for a citation line.
_CITATION_SCAN_ELEMENTS = 20

# A leading text element is only accepted as a title when shorter than this.
_MAX_TITLE_LENGTH = 150


def _extract_citation(elements, fallback):
    """Return a citation string derived from the document's leading elements.

    Args:
        elements (list): Elements returned by ``partition_pdf``; each is
            expected to expose a ``text`` attribute.
        fallback (str): Value returned when neither a journal citation nor a
            short leading text element is found.

    Returns:
        str: The matched journal citation, else the first non-blank element
        text if it is shorter than ``_MAX_TITLE_LENGTH``, else ``fallback``.
    """
    head_text = "\n".join(el.text for el in elements[:_CITATION_SCAN_ELEMENTS])
    match = _SLJOG_CITATION_RE.search(head_text)
    if match:
        return match.group(0).replace("\n", " ").strip()

    # Fallback to the first non-empty text element; assume titles are short.
    first_text = next((el.text for el in elements if el.text.strip()), None)
    if first_text and len(first_text) < _MAX_TITLE_LENGTH:
        return first_text.strip()
    return fallback


def _yaml_double_quoted(value):
    """Return ``value`` as a single-line, double-quoted YAML scalar.

    Whitespace runs (including newlines) are collapsed to one space, and
    backslashes and double quotes are escaped so the text cannot terminate
    the scalar or the frontmatter early.
    """
    collapsed = " ".join(value.split())
    escaped = collapsed.replace("\\", "\\\\").replace('"', '\\"')
    return f'"{escaped}"'


def _render_markdown(elements, citation):
    """Build the Markdown document text for the partitioned elements.

    Args:
        elements (list): Elements returned by ``partition_pdf``.
        citation (str): Citation written into the YAML frontmatter.

    Returns:
        str: Frontmatter followed by one section per element, separated by
        blank lines.
    """
    parts = ["---", f"citation: {_yaml_double_quoted(citation)}", "---"]

    for element in elements:
        # Dispatch on the class name so that any element class whose name
        # contains "Table"/"Image" is handled without importing those classes.
        type_name = str(type(element))
        metadata = getattr(element, "metadata", None)

        if "Table" in type_name:
            table_html = getattr(metadata, "text_as_html", None)
            if table_html:
                parts.append("## Table")
                parts.append(table_html)
        elif "Image" in type_name:
            image_path = getattr(metadata, "image_path", None)
            if image_path:
                image_filename = os.path.basename(image_path)
                # Markdown links use forward slashes regardless of the OS.
                parts.append(f"![{element.text}](images/{image_filename})")
        else:
            parts.append(element.text)

    return "\n\n".join(parts)


def process_pdf_with_unstructured(pdf_path, output_dir):
    """
    Processes a PDF file using unstructured.io, extracts content, and saves it
    as a Markdown file.

    The output is written to ``<output_dir>/<pdf name>/<pdf name>.md`` and
    extracted images go to ``<output_dir>/<pdf name>/images``.

    Args:
        pdf_path (str): The path to the input PDF file.
        output_dir (str): The directory to save the output Markdown file and
            extracted images.

    Returns:
        bool: True when the Markdown file was written, False when the PDF is
        missing or any step of the processing raised an exception.
    """
    if not os.path.exists(pdf_path):
        logging.error(f"PDF file not found at {pdf_path}")
        print(f"Error: PDF file not found at {pdf_path}")
        return False

    # Cleaned up filename for fallback citation
    pdf_basename = os.path.splitext(os.path.basename(pdf_path))[0]
    # A simple regex to clean up common filename artifacts
    cleaned_citation = re.sub(r'[\-_]', ' ', pdf_basename).replace('.pdf', '')

    print(f"Processing {pdf_path} with unstructured.io...")
    logging.info(f"Processing {pdf_path} with unstructured.io...")

    try:
        # Create a specific output directory for images from this PDF
        pdf_output_dir = os.path.join(output_dir, pdf_basename)
        image_output_path = os.path.join(pdf_output_dir, "images")
        os.makedirs(image_output_path, exist_ok=True)  # also creates pdf_output_dir

        print(f"Extracting images to: {image_output_path}")

        elements = partition_pdf(
            filename=pdf_path,
            strategy="hi_res",
            extract_images_in_pdf=True,
            infer_table_structure=True,
            chunking_strategy="by_title",
            max_characters=4096,
            new_after_n_chars=3800,
            combine_text_under_n_chars=2048,
            image_output_dir_path=image_output_path,
        )

        # Attempt to extract a better citation from the document's text.
        # This is best-effort: any failure keeps the filename-based fallback.
        try:
            cleaned_citation = _extract_citation(elements, cleaned_citation)
        except Exception as citation_exc:
            print(f"Could not automatically extract a detailed citation, falling back to filename. Reason: {citation_exc}")
            logging.warning(f"Citation extraction failed for {pdf_path}, using fallback. Error: {citation_exc}")

        markdown_text = _render_markdown(elements, cleaned_citation)

        # Construct the output Markdown path
        output_md_path = os.path.join(pdf_output_dir, f"{pdf_basename}.md")
        print(f"Saving Markdown output to: {output_md_path}")

        with open(output_md_path, "w", encoding="utf-8") as f:
            f.write(markdown_text)

        print(f"Successfully processed {pdf_path}")
        logging.info(f"Successfully processed {pdf_path}")
        return True

    except Exception as e:
        # Top-level boundary for one document: report the failure and let the
        # caller continue with the remaining files. The traceback is logged.
        print(f"An error occurred while processing {pdf_path}: {e}")
        logging.exception(f"An error occurred while processing {pdf_path}: {e}")
        return False


def process_directory(input_dir, output_dir):
    """
    Processes all PDF files in a given directory.

    Files are selected by a case-insensitive ``.pdf`` extension and processed
    in sorted order; a summary of successes and failures is printed and logged.

    Args:
        input_dir (str): The path to the directory containing PDF files.
        output_dir (str): The directory to save the output Markdown files.
    """
    pdf_files = sorted(
        name for name in os.listdir(input_dir) if name.lower().endswith('.pdf')
    )

    if not pdf_files:
        print(f"No PDF files found in {input_dir}")
        return

    print(f"Found {len(pdf_files)} PDF files to process.")

    success_count = 0
    failure_count = 0

    for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
        pdf_path = os.path.join(input_dir, pdf_file)
        if process_pdf_with_unstructured(pdf_path, output_dir):
            success_count += 1
        else:
            failure_count += 1

    print("\nProcessing complete.")
    print(f"Successfully processed: {success_count} files")
    print(f"Failed to process: {failure_count} files")
    logging.info(f"Processing complete. Success: {success_count}, Failed: {failure_count}")


def main():
    """Parse command-line arguments and process a single PDF or a directory."""
    parser = argparse.ArgumentParser(
        description="Process a PDF file or a directory of PDF files with unstructured.io to extract content as Markdown."
    )
    parser.add_argument(
        "input_path",
        type=str,
        help="The path to the input PDF file or directory.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="src/processed_markdown",
        help="The directory to save the output Markdown file.",
    )

    args = parser.parse_args()

    # Ensure the main output directory exists
    os.makedirs(args.output_dir, exist_ok=True)

    if os.path.isdir(args.input_path):
        process_directory(args.input_path, args.output_dir)
    elif os.path.isfile(args.input_path) and args.input_path.lower().endswith('.pdf'):
        process_pdf_with_unstructured(args.input_path, args.output_dir)
    else:
        print("Error: Invalid input path. Please provide a valid PDF file or a directory.")
        logging.error(f"Invalid input path: {args.input_path}")


if __name__ == "__main__":
    main()