import os
import shutil
import tempfile
import traceback
import zipfile
from pathlib import Path

import gradio as gr
from pdf2image import convert_from_path, pdfinfo_from_path


def zip_folder(folder_path, output_path):
    """Create a ZIP archive containing every file found under a folder.

    Args:
        folder_path: Directory that is walked recursively; each file is stored
            under its path relative to this directory.
        output_path: Destination path of the ZIP file.

    Returns:
        ``(True, "")`` on success, or ``(False, message)`` when any exception
        is raised while building the archive (the message is also printed).
    """
    try:
        archive_path = os.path.abspath(output_path)
        with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, _, files in os.walk(folder_path):
                for file in files:
                    file_path = os.path.join(root, file)
                    # If the archive lives inside the folder being zipped,
                    # os.walk would find it; never add the archive to itself.
                    if os.path.abspath(file_path) == archive_path:
                        continue
                    zipf.write(file_path, os.path.relpath(file_path, folder_path))
        return True, ""
    except Exception as e:
        error_msg = f"Error creating zip file: {str(e)}"
        print(error_msg)
        return False, error_msg


# Working directories live under the system temp dir.
# NOTE: they are module-level and cleared at the start of every extraction,
# so two extractions running at the same time would overwrite each other.
BASE_DIR = Path(tempfile.gettempdir()) / "pdf_extractor"
DIRECTORY = BASE_DIR / "image_reference"
DIRECTORY_OUTPUT = BASE_DIR / "output"
DIRECTORIES = [DIRECTORY, DIRECTORY_OUTPUT]

# Check and create directories
for directory in DIRECTORIES:
    directory.mkdir(parents=True, exist_ok=True)

ALLOWED_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.gif']


def get_image_files(directory):
    """Return the image files located directly inside a directory.

    Args:
        directory: Path (``str`` or ``Path``) of the directory to scan. The
            scan is not recursive.

    Returns:
        A list of path strings for regular files whose lower-cased suffix is
        in ``ALLOWED_EXTENSIONS``; an empty list if the path is missing or is
        not a directory. The order is whatever ``Path.glob`` yields.
    """
    directory = Path(directory)
    # is_dir() is already False for a non-existent path.
    if not directory.is_dir():
        return []

    return [
        str(file_path)
        for file_path in directory.glob('*')
        # Skip sub-directories that merely happen to be named like an image.
        if file_path.is_file() and file_path.suffix.lower() in ALLOWED_EXTENSIONS
    ]


def clear_directory(directory):
    """Delete everything inside a directory while keeping the directory itself.

    Args:
        directory: Path (``str`` or ``Path``) of the directory to empty.

    Returns:
        ``(True, "")`` when the directory is missing or was emptied, or
        ``(False, message)`` if removing an entry raised (the message is also
        printed).
    """
    directory = Path(directory)
    if not directory.exists():
        return True, ""

    try:
        for item in directory.iterdir():
            # Symlinks are unlinked rather than followed, so a link to a
            # directory never causes the link target to be deleted.
            if item.is_file() or item.is_symlink():
                item.unlink()
            elif item.is_dir():
                shutil.rmtree(item)
        return True, ""
    except Exception as e:
        error_msg = f"Failed to clear directory {directory}. Reason: {str(e)}"
        print(error_msg)
        return False, error_msg


def extract_photos_from_pdf(file_pdf):
    """Render every page of a PDF to a PNG and bundle the pages into a ZIP.

    Pages are written to ``DIRECTORY`` as ``<page number>.png`` and the
    archive to ``DIRECTORY_OUTPUT / "all_photos.zip"``. Both directories are
    emptied before the extraction starts.

    Args:
        file_pdf: The uploaded PDF, either a path (``str`` / ``os.PathLike``)
            or an object exposing the path as ``.name``; ``None`` when nothing
            was uploaded.

    Returns:
        A 3-tuple ``(gallery_items, zip_path, status)``:
        ``gallery_items`` is a list of ``(image_path, file_name)`` tuples in
        page order (empty on failure), ``zip_path`` is the archive path as a
        string or ``None`` if no archive is available, and ``status`` is a
        human-readable message. Errors are reported through ``status``
        rather than raised.
    """
    # Check if file is provided
    if file_pdf is None:
        return [], None, "Error: No file uploaded"

    # Clear directories for new extraction
    clear_success, clear_error = clear_directory(DIRECTORY)
    if not clear_success:
        return [], None, f"Error clearing directories: {clear_error}"

    clear_success, clear_error = clear_directory(DIRECTORY_OUTPUT)
    if not clear_success:
        return [], None, f"Error clearing output directory: {clear_error}"

    try:
        # The directories are created at import time only; recreate them in
        # case the temp tree was removed while the app kept running.
        for work_dir in DIRECTORIES:
            work_dir.mkdir(parents=True, exist_ok=True)

        # Accept both a plain path and a file-like wrapper exposing `.name`.
        # Path-likes are checked first because Path.name is only the basename.
        if isinstance(file_pdf, (str, os.PathLike)):
            pdf_path = os.fspath(file_pdf)
        else:
            pdf_path = file_pdf.name

        # Get PDF info
        try:
            info = pdfinfo_from_path(pdf_path)
            total_pages = info["Pages"]
        except Exception as e:
            error_details = traceback.format_exc()
            return [], None, f"Error reading PDF: {str(e)}\n\nDetails: {error_details}"

        # Convert in small batches so only a few rendered pages are held in
        # memory at once.
        batch_size = 10

        for start_page in range(1, total_pages + 1, batch_size):
            end_page = min(start_page + batch_size - 1, total_pages)

            try:
                images = convert_from_path(
                    pdf_path,
                    first_page=start_page,
                    last_page=end_page,
                    dpi=150  # Adjustable DPI for quality vs size
                )

                # Name each file after its 1-based page number.
                for idx, image in enumerate(images, start=start_page):
                    image_path = DIRECTORY / f"{idx}.png"
                    image.save(str(image_path), 'PNG')
            except Exception as e:
                error_details = traceback.format_exc()
                return [], None, f"Error converting PDF pages {start_page}-{end_page}: {str(e)}\n\nDetails: {error_details}"

        # Get list of extracted images and sort them numerically
        images_pdf_list = get_image_files(DIRECTORY)
        if not images_pdf_list:
            return [], None, "No images could be extracted from the PDF."

        image_names = [(path, os.path.basename(path)) for path in images_pdf_list]
        try:
            # Numeric sort so that "10.png" comes after "2.png".
            sorted_names = sorted(image_names, key=lambda x: int(Path(x[1]).stem))
        except ValueError as e:
            # Fallback to unsorted if a file name is not a plain integer
            sorted_names = image_names
            print(f"Error sorting images: {e}")

        # Create zip file of all images
        zip_path = DIRECTORY_OUTPUT / "all_photos.zip"
        zip_success, zip_error = zip_folder(DIRECTORY, zip_path)

        if zip_success:
            return (
                sorted_names,
                str(zip_path),
                f"Successfully extracted {len(images_pdf_list)} page{'s' if len(images_pdf_list) != 1 else ''} from PDF."
            )
        else:
            return (
                sorted_names,
                None,
                f"Images extracted but zip creation failed: {zip_error}"
            )

    except Exception as e:
        # Last-resort boundary: report the failure in the status box instead
        # of letting the exception escape the UI callback.
        error_details = traceback.format_exc()
        return [], None, f"Unexpected error: {str(e)}\n\nDetails: {error_details}"


# Create Gradio interface
# NOTE(review): the nesting of the layout containers below was reconstructed
# from the order of the `with` statements — confirm against the intended UI.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown(
                """
                # PDF Image Extractor
                Upload a PDF file to extract all pages as images.
                """
            )

    with gr.Tab("PDF Extractor"):
        with gr.Row():
            with gr.Column(scale=1):
                file_pdf = gr.File(
                    file_types=['.pdf'],
                    label="Upload PDF file"
                )

                with gr.Row():
                    btn = gr.Button("Extract Images", variant="primary")
                    clear_btn = gr.Button("Clear")

            with gr.Column():
                status = gr.Textbox(
                    label="Status",
                    value="Upload a PDF and click 'Extract Images'",
                    visible=True
                )
                gallery = gr.Gallery(
                    label="Extracted Pages",
                    show_label=True,
                    elem_id="gallery",
                    columns=3,
                    object_fit="contain",
                    height="auto"
                )
                download_btn = gr.File(
                    label="Download All Images (ZIP)",
                    visible=True
                )

        # Event handlers
        btn.click(
            fn=extract_photos_from_pdf,
            inputs=[file_pdf],
            outputs=[gallery, download_btn, status],
            api_name="extract"
        )

        def clear_outputs():
            """Reset the gallery, the download slot and the status message."""
            return [], None, "Cleared. Upload a PDF to begin."

        clear_btn.click(
            fn=clear_outputs,
            inputs=[],
            outputs=[gallery, download_btn, status]
        )

        # Example for demonstration (only offered when the sample file exists)
        example_path = "./examples/sample.pdf"
        if os.path.exists(example_path):
            gr.Examples(
                examples=[[example_path]],
                fn=extract_photos_from_pdf,
                inputs=[file_pdf],
                outputs=[gallery, download_btn, status],
                cache_examples=False
            )

if __name__ == "__main__":
    demo.launch()