File size: 6,958 Bytes
19aaa42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import os
import argparse
from unstructured.partition.pdf import partition_pdf
import logging
from tqdm import tqdm
import re

# Configure the root logger once at import time: records at INFO and above
# are written to pdf_processing.log, one timestamped line per record.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename='pdf_processing.log',
)

def _element_text(element):
    """Return the element's text, or an empty string when it is missing or None."""
    return getattr(element, "text", "") or ""


def _extract_citation(elements, fallback):
    """
    Derive a citation string from the partitioned elements.

    Args:
        elements (list): Elements returned by the PDF partitioner.
        fallback (str): Citation to return when nothing suitable is found.

    Returns:
        str: The journal citation if one is found in the first 20 elements,
        otherwise the first non-empty element text when it is shorter than
        150 characters, otherwise ``fallback``.
    """
    texts = [_element_text(el) for el in elements]
    head_text = "\n".join(texts[:20])  # Only the leading elements are searched

    # Whitespace is matched with \s so a citation wrapped across lines still
    # matches; page ranges may use a hyphen or an en dash.
    sljog_pattern = (
        r"Sri Lanka Journal of Obstetrics and Gynaecology,\s+\d{4};\s+\d+\s+"
        r"\(.*?\):\s+\d+\s*[-\u2013]\s*\d+"
    )
    match = re.search(sljog_pattern, head_text, re.IGNORECASE)
    if match:
        # Collapse any line breaks inside the match into single spaces
        return " ".join(match.group(0).split())

    # Fallback: treat the first non-empty text as a title if it is short enough
    first_text = next((t.strip() for t in texts if t.strip()), None)
    if first_text and len(first_text) < 150:
        return first_text
    return fallback


def _yaml_quote(value):
    """Return ``value`` as a single-line, double-quoted YAML scalar."""
    single_line = " ".join(value.split())
    escaped = single_line.replace("\\", "\\\\").replace('"', '\\"')
    return f'"{escaped}"'


def _render_markdown(elements, citation):
    """
    Build the Markdown document (front matter plus body) for the elements.

    Args:
        elements (list): Elements returned by the PDF partitioner.
        citation (str): Citation to store in the YAML front matter.

    Returns:
        str: The Markdown text, with sections separated by blank lines.
    """
    parts = ["---", f"citation: {_yaml_quote(citation)}", "---"]

    for element in elements:
        kind = type(element).__name__
        text = _element_text(element)
        metadata = getattr(element, "metadata", None)

        if "Table" in kind:
            table_html = getattr(metadata, "text_as_html", None)
            if table_html:
                parts.append("## Table")
                parts.append(table_html)
            elif text.strip():
                # No HTML rendering available: keep the plain text rather
                # than silently dropping the table.
                parts.append(text)
        elif "Image" in kind:
            image_path = getattr(metadata, "image_path", None)
            if image_path:
                image_filename = os.path.basename(image_path)
                alt_text = " ".join(text.split())
                # Markdown links always use forward slashes, regardless of OS
                parts.append(f"![{alt_text}](images/{image_filename})")
        elif text.strip():
            parts.append(text)

    return "\n\n".join(parts)


def process_pdf_with_unstructured(pdf_path, output_dir):
    """
    Processes a PDF file using unstructured.io, extracts content, and saves it as a Markdown file.

    The output is written to ``<output_dir>/<pdf name>/<pdf name>.md`` and
    extracted images go to ``<output_dir>/<pdf name>/images``. The Markdown
    starts with a YAML front matter block holding a ``citation`` value.

    Args:
        pdf_path (str): The path to the input PDF file.
        output_dir (str): The directory to save the output Markdown file and extracted images.

    Returns:
        bool: True if the Markdown file was written, False if the PDF was
        not found or any error occurred during processing.
    """
    if not os.path.exists(pdf_path):
        logging.error(f"PDF file not found at {pdf_path}")
        print(f"Error: PDF file not found at {pdf_path}")
        return False

    # Cleaned up filename for fallback citation
    pdf_basename = os.path.splitext(os.path.basename(pdf_path))[0]
    fallback_citation = re.sub(r'[\-_]', ' ', pdf_basename).replace('.pdf', '')

    print(f"Processing {pdf_path} with unstructured.io...")
    logging.info(f"Processing {pdf_path} with unstructured.io...")

    try:
        # Create a specific output directory for this PDF and its images
        pdf_output_dir = os.path.join(output_dir, pdf_basename)
        image_output_path = os.path.join(pdf_output_dir, "images")
        os.makedirs(image_output_path, exist_ok=True)

        print(f"Extracting images to: {image_output_path}")

        # NOTE(review): newer unstructured releases appear to rename the
        # image-extraction keyword arguments - confirm against the pinned version.
        elements = partition_pdf(
            filename=pdf_path,
            strategy="hi_res",
            extract_images_in_pdf=True,
            infer_table_structure=True,
            chunking_strategy="by_title",
            max_characters=4096,
            new_after_n_chars=3800,
            combine_text_under_n_chars=2048,
            image_output_dir_path=image_output_path,
        )

        # Citation extraction is best-effort: any failure falls back to the
        # filename-derived citation instead of aborting the whole document.
        try:
            cleaned_citation = _extract_citation(elements, fallback_citation)
        except Exception as citation_exc:
            cleaned_citation = fallback_citation
            print(f"Could not automatically extract a detailed citation, falling back to filename. Reason: {citation_exc}")
            logging.warning(f"Citation extraction failed for {pdf_path}, using fallback. Error: {citation_exc}")

        markdown_text = _render_markdown(elements, cleaned_citation)

        # Construct the output Markdown path
        output_md_path = os.path.join(pdf_output_dir, f"{pdf_basename}.md")

        print(f"Saving Markdown output to: {output_md_path}")

        with open(output_md_path, "w", encoding="utf-8") as f:
            f.write(markdown_text)

        print(f"Successfully processed {pdf_path}")
        logging.info(f"Successfully processed {pdf_path}")
        return True

    except Exception as e:
        # Top-level boundary for one document: report, log with traceback,
        # and signal failure so batch processing can continue.
        print(f"An error occurred while processing {pdf_path}: {e}")
        logging.exception(f"An error occurred while processing {pdf_path}: {e}")
        return False

def process_directory(input_dir, output_dir):
    """
    Processes all PDF files in a given directory.

    Only regular files directly inside ``input_dir`` whose name ends in
    ``.pdf`` (case-insensitive) are processed, in sorted name order. A
    summary of successes and failures is printed and logged at the end.

    Args:
        input_dir (str): The path to the directory containing PDF files.
        output_dir (str): The directory to save the output Markdown files.
    """
    # Sort for a reproducible processing order; os.listdir order is arbitrary.
    pdf_files = sorted(
        name
        for name in os.listdir(input_dir)
        if name.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(input_dir, name))
    )
    if not pdf_files:
        print(f"No PDF files found in {input_dir}")
        return

    print(f"Found {len(pdf_files)} PDF files to process.")

    success_count = 0
    failure_count = 0

    for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
        pdf_path = os.path.join(input_dir, pdf_file)
        if process_pdf_with_unstructured(pdf_path, output_dir):
            success_count += 1
        else:
            failure_count += 1

    print("\nProcessing complete.")
    print(f"Successfully processed: {success_count} files")
    print(f"Failed to process: {failure_count} files")
    logging.info(f"Processing complete. Success: {success_count}, Failed: {failure_count}")


def main():
    """
    Command-line entry point.

    Parses ``input_path`` and ``--output_dir``, creates the output directory,
    then processes either every PDF in a directory or a single PDF file.
    The single-file extension check is case-insensitive. Any other input
    prints and logs an error.
    """
    parser = argparse.ArgumentParser(description="Process a PDF file or a directory of PDF files with unstructured.io to extract content as Markdown.")
    parser.add_argument("input_path", type=str, help="The path to the input PDF file or directory.")
    parser.add_argument("--output_dir", type=str, default="src/processed_markdown", help="The directory to save the output Markdown file.")

    args = parser.parse_args()

    # Ensure the main output directory exists
    os.makedirs(args.output_dir, exist_ok=True)

    if os.path.isdir(args.input_path):
        process_directory(args.input_path, args.output_dir)
    elif os.path.isfile(args.input_path) and args.input_path.lower().endswith('.pdf'):
        process_pdf_with_unstructured(args.input_path, args.output_dir)
    else:
        print("Error: Invalid input path. Please provide a valid PDF file or a directory.")
        logging.error(f"Invalid input path: {args.input_path}")


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()