# Web-page header captured along with this file; not part of the program.
# Spaces: Sleeping
# File size: 6,958 Bytes
# 19aaa42
import os
import argparse
from unstructured.partition.pdf import partition_pdf
import logging
from tqdm import tqdm
import re
# Configure the root logger: INFO and above are written to a log file
# (relative path, so it lands in the current working directory).
logging.basicConfig(
    level=logging.INFO,
    filename='pdf_processing.log',
    format='%(asctime)s - %(levelname)s - %(message)s',
)
def _extract_citation(elements, fallback):
    """
    Picks a citation string from the partitioned elements.

    Searches the text of the first 20 elements for a reference of the form
    "Sri Lanka Journal of Obstetrics and Gynaecology, <year>; <vol> (<issue>): <pages>".
    If none is found, the first non-blank element text is used when it is
    shorter than 150 characters; otherwise the fallback is returned.

    Args:
        elements (list): Elements returned by partition_pdf; only `.text` is read.
        fallback (str): Citation to use when nothing suitable is found.

    Returns:
        str: The chosen citation.
    """
    # Real newlines (not the two characters backslash + n) separate the elements.
    head_text = "\n".join(el.text for el in elements[:20])
    # Single backslashes inside the raw string so \d and \( are regex escapes.
    sljog_pattern = (
        r"Sri Lanka Journal of Obstetrics and Gynaecology, "
        r"\d{4}; \d+ \(.*?\): \d+-\d+"
    )
    match = re.search(sljog_pattern, head_text, re.IGNORECASE)
    if match:
        return " ".join(match.group(0).split())

    first_text = next((el.text for el in elements if el.text.strip()), None)
    if first_text and len(first_text) < 150:  # Assume titles are reasonably short
        return first_text.strip()
    return fallback


def _render_markdown(elements, citation):
    """
    Builds the Markdown document: YAML front matter followed by one section per element.

    Args:
        elements (list): Elements returned by partition_pdf.
        citation (str): Citation placed in the front matter.

    Returns:
        str: The full Markdown text, sections separated by blank lines.
    """
    # Keep the citation on one line and escape it for a double-quoted YAML scalar.
    safe_citation = " ".join(citation.split())
    safe_citation = safe_citation.replace("\\", "\\\\").replace('"', '\\"')
    parts = ["---", f'citation: "{safe_citation}"', "---"]

    for element in elements:
        type_name = str(type(element))
        metadata = getattr(element, "metadata", None)
        if "Table" in type_name:
            table_html = getattr(metadata, "text_as_html", None)
            if table_html:
                parts.append("## Table")
                parts.append(table_html)
            elif element.text:
                # No HTML rendering available: keep the table's plain text
                # rather than silently dropping its content.
                parts.append(element.text)
        elif "Image" in type_name:
            image_path = getattr(metadata, "image_path", None)
            if image_path:
                image_filename = os.path.basename(image_path)
                # Forward slash: Markdown links are URLs, not OS paths. The
                # images directory sits next to the Markdown file.
                parts.append(f"![{image_filename}](images/{image_filename})")
        else:
            parts.append(element.text)

    return "\n\n".join(parts)


def process_pdf_with_unstructured(pdf_path, output_dir):
    """
    Processes a PDF file using unstructured.io, extracts content, and saves it as a Markdown file.

    The output is written to `<output_dir>/<pdf name>/<pdf name>.md`, with extracted
    images requested into `<output_dir>/<pdf name>/images`. The Markdown starts with
    a YAML front matter block holding a citation, taken from the document text when
    possible and from the cleaned-up filename otherwise.

    Args:
        pdf_path (str): The path to the input PDF file.
        output_dir (str): The directory to save the output Markdown file and extracted images.

    Returns:
        bool: True if the Markdown file was written, False if the PDF was not
        found or any error occurred during processing (errors are logged, not raised).
    """
    if not os.path.exists(pdf_path):
        logging.error(f"PDF file not found at {pdf_path}")
        print(f"Error: PDF file not found at {pdf_path}")
        return False

    # Cleaned up filename for fallback citation
    pdf_basename = os.path.splitext(os.path.basename(pdf_path))[0]
    fallback_citation = re.sub(r'[\-_]', ' ', pdf_basename).replace('.pdf', '')

    print(f"Processing {pdf_path} with unstructured.io...")
    logging.info(f"Processing {pdf_path} with unstructured.io...")

    try:
        # Create a specific output directory for this PDF; makedirs on the
        # images path also creates the per-PDF parent directory.
        pdf_output_dir = os.path.join(output_dir, pdf_basename)
        image_output_path = os.path.join(pdf_output_dir, "images")
        os.makedirs(image_output_path, exist_ok=True)
        print(f"Extracting images to: {image_output_path}")

        elements = partition_pdf(
            filename=pdf_path,
            strategy="hi_res",
            extract_images_in_pdf=True,
            infer_table_structure=True,
            chunking_strategy="by_title",
            max_characters=4096,
            new_after_n_chars=3800,
            combine_text_under_n_chars=2048,
            image_output_dir_path=image_output_path
        )

        # Citation extraction is best-effort: any failure falls back to the filename.
        try:
            citation = _extract_citation(elements, fallback_citation)
        except Exception as citation_exc:
            citation = fallback_citation
            print(f"Could not automatically extract a detailed citation, falling back to filename. Reason: {citation_exc}")
            logging.warning(f"Citation extraction failed for {pdf_path}, using fallback. Error: {citation_exc}")

        # Construct the output Markdown path
        output_md_path = os.path.join(pdf_output_dir, f"{pdf_basename}.md")
        print(f"Saving Markdown output to: {output_md_path}")
        with open(output_md_path, "w", encoding="utf-8") as f:
            f.write(_render_markdown(elements, citation))

        print(f"Successfully processed {pdf_path}")
        logging.info(f"Successfully processed {pdf_path}")
        return True
    except Exception as e:
        # Top-level boundary for one PDF: report, log with traceback, and let
        # the caller continue with other files.
        print(f"An error occurred while processing {pdf_path}: {e}")
        logging.exception(f"An error occurred while processing {pdf_path}: {e}")
        return False
def process_directory(input_dir, output_dir):
    """
    Processes all PDF files in a given directory.

    Only regular files directly inside `input_dir` whose name ends in ".pdf"
    (case-insensitive) are processed, in sorted name order. Each file is handed
    to `process_pdf_with_unstructured`; success and failure counts are printed
    and logged at the end.

    Args:
        input_dir (str): The path to the directory containing PDF files.
        output_dir (str): The directory to save the output Markdown files.
    """
    # Case-insensitive match so "REPORT.PDF" is not skipped; isfile() keeps out
    # directories that merely have a .pdf-looking name; sorted() makes the
    # processing order reproducible (os.listdir order is arbitrary).
    pdf_files = sorted(
        name
        for name in os.listdir(input_dir)
        if name.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(input_dir, name))
    )
    if not pdf_files:
        print(f"No PDF files found in {input_dir}")
        return

    print(f"Found {len(pdf_files)} PDF files to process.")
    success_count = 0
    failure_count = 0
    for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
        pdf_path = os.path.join(input_dir, pdf_file)
        if process_pdf_with_unstructured(pdf_path, output_dir):
            success_count += 1
        else:
            failure_count += 1

    print("\nProcessing complete.")
    print(f"Successfully processed: {success_count} files")
    print(f"Failed to process: {failure_count} files")
    logging.info(f"Processing complete. Success: {success_count}, Failed: {failure_count}")
def main():
    """
    Command-line entry point.

    Parses `input_path` and the optional `--output_dir`, creates the output
    directory, then processes either every PDF in a directory or a single PDF
    file. Any other input is reported as an error on stdout and in the log.
    """
    parser = argparse.ArgumentParser(description="Process a PDF file or a directory of PDF files with unstructured.io to extract content as Markdown.")
    parser.add_argument("input_path", type=str, help="The path to the input PDF file or directory.")
    parser.add_argument("--output_dir", type=str, default="src/processed_markdown", help="The directory to save the output Markdown file.")
    args = parser.parse_args()

    # Ensure the main output directory exists
    os.makedirs(args.output_dir, exist_ok=True)

    if os.path.isdir(args.input_path):
        process_directory(args.input_path, args.output_dir)
    # Compare the extension case-insensitively so "paper.PDF" is accepted too.
    elif os.path.isfile(args.input_path) and args.input_path.lower().endswith('.pdf'):
        process_pdf_with_unstructured(args.input_path, args.output_dir)
    else:
        print("Error: Invalid input path. Please provide a valid PDF file or a directory.")
        logging.error(f"Invalid input path: {args.input_path}")
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()