import os
import gradio as gr
from pdf2image import convert_from_path, pdfinfo_from_path
import zipfile
import shutil
import tempfile
from pathlib import Path
import traceback

def zip_folder(folder_path, output_path):
    """Create a deflate-compressed zip archive of everything under a folder.

    Files are stored under their path relative to ``folder_path`` and are
    added in sorted order so the archive layout does not depend on the
    filesystem's enumeration order.

    Args:
        folder_path: Directory whose files are archived (walked recursively).
        output_path: Destination of the zip file (``str`` or path-like).

    Returns:
        ``(True, "")`` on success, or ``(False, error_message)`` if any
        exception was raised; the message is also printed.
    """
    try:
        # Resolved once so the archive can be recognised if the walk meets it.
        archive_real = os.path.realpath(output_path)
        with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, dirs, files in os.walk(folder_path):
                dirs.sort()  # in-place sort makes os.walk descend in a stable order
                for file in sorted(files):
                    file_path = os.path.join(root, file)
                    # The archive already exists on disk at this point; when it
                    # lives inside folder_path it must not be added to itself.
                    if os.path.realpath(file_path) == archive_real:
                        continue
                    zipf.write(file_path, os.path.relpath(file_path, folder_path))
        return True, ""
    except Exception as e:
        error_msg = f"Error creating zip file: {str(e)}"
        print(error_msg)
        return False, error_msg

# Working folders live under the system temp directory:
#   image_reference/ receives the rendered page images,
#   output/          receives the zip archive offered for download.
BASE_DIR = Path(tempfile.gettempdir()).joinpath("pdf_extractor")
DIRECTORY = BASE_DIR.joinpath("image_reference")
DIRECTORY_OUTPUT = BASE_DIR.joinpath("output")
DIRECTORIES = [DIRECTORY, DIRECTORY_OUTPUT]

# Make sure both folders exist as soon as the module is loaded.
for directory in DIRECTORIES:
    os.makedirs(directory, exist_ok=True)

# File suffixes (lower-case, with the dot) that count as images.
ALLOWED_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.gif']

def get_image_files(directory):
    """List the image files directly inside a directory.

    Only regular files (or symlinks to them) whose suffix, compared
    case-insensitively, is in ``ALLOWED_EXTENSIONS`` are returned;
    sub-directories are not searched, and a directory that merely has an
    image-like name is not reported as an image.

    Args:
        directory: Directory to scan (``str`` or path-like).

    Returns:
        A list of path strings in sorted order, or ``[]`` when ``directory``
        does not exist or is not a directory.
    """
    directory = Path(directory)
    if not directory.is_dir():  # also False when the path does not exist
        return []

    # Sorted so the result does not depend on filesystem enumeration order.
    return sorted(
        str(file_path)
        for file_path in directory.glob('*')
        if file_path.is_file() and file_path.suffix.lower() in ALLOWED_EXTENSIONS
    )

def clear_directory(directory):
    """Delete everything inside a directory while keeping the directory itself.

    Files and symlinks are unlinked; real sub-directories are removed
    recursively. A path that does not exist counts as already cleared.

    Returns:
        ``(True, "")`` on success, or ``(False, error_message)`` when any
        step raised; the message is also printed.
    """
    directory = Path(directory)
    if not directory.exists():
        return True, ""

    try:
        for entry in directory.iterdir():
            # Symlinks are tested before is_dir() so a link to a directory is
            # unlinked rather than having its target tree removed.
            if entry.is_file() or entry.is_symlink():
                entry.unlink()
                continue
            if entry.is_dir():
                shutil.rmtree(entry)
    except Exception as exc:
        failure = f"Failed to clear directory {directory}. Reason: {str(exc)}"
        print(failure)
        return False, failure

    return True, ""

def extract_photos_from_pdf(file_pdf):
    """Render every page of an uploaded PDF to a PNG and zip the results.

    Pages are written to ``DIRECTORY`` as ``<page number>.png`` and the zip
    archive to ``DIRECTORY_OUTPUT``. Both folders are emptied at the start of
    each call, so a new extraction discards the previous one's files.

    Args:
        file_pdf: The value delivered by the file input. May be ``None``
            (nothing uploaded), a filesystem path (``str`` or
            ``os.PathLike``), or an object exposing the path as ``.name``.

    Returns:
        A 3-tuple ``(gallery_items, zip_path, status_message)``:
        ``gallery_items`` is a list of ``(image_path, file_name)`` pairs in
        page order (empty on failure); ``zip_path`` is the archive path as a
        string, or ``None`` when no archive is available; ``status_message``
        describes the outcome. Errors are reported through the message
        rather than raised.
    """
    if file_pdf is None:
        return [], None, "Error: No file uploaded"

    # Start from empty working folders so pages left over from an earlier
    # extraction cannot end up in this run's gallery or archive.
    clear_success, clear_error = clear_directory(DIRECTORY)
    if not clear_success:
        return [], None, f"Error clearing directories: {clear_error}"

    clear_success, clear_error = clear_directory(DIRECTORY_OUTPUT)
    if not clear_success:
        return [], None, f"Error clearing output directory: {clear_error}"

    try:
        # The upload may arrive as a plain path or as a file-like wrapper
        # whose ``.name`` holds the path; accept both.
        if isinstance(file_pdf, (str, os.PathLike)):
            pdf_path = os.fspath(file_pdf)
        else:
            pdf_path = file_pdf.name

        try:
            info = pdfinfo_from_path(pdf_path)
            total_pages = info["Pages"]
        except Exception as e:
            error_details = traceback.format_exc()
            return [], None, f"Error reading PDF: {str(e)}\n\nDetails: {error_details}"

        # Convert a limited number of pages per call so only one batch of
        # rendered images is held in memory at a time.
        batch_size = 10

        for start_page in range(1, total_pages + 1, batch_size):
            end_page = min(start_page + batch_size - 1, total_pages)

            try:
                images = convert_from_path(
                    pdf_path,
                    first_page=start_page,
                    last_page=end_page,
                    dpi=150,  # Adjustable DPI for quality vs size
                )

                # Name each file after its 1-based page number.
                for idx, image in enumerate(images, start=start_page):
                    image_path = DIRECTORY / f"{idx}.png"
                    image.save(str(image_path), 'PNG')
            except Exception as e:
                error_details = traceback.format_exc()
                return [], None, f"Error converting PDF pages {start_page}-{end_page}: {str(e)}\n\nDetails: {error_details}"

        images_pdf_list = get_image_files(DIRECTORY)
        if not images_pdf_list:
            return [], None, "No images could be extracted from the PDF."

        # Pair each path with its file name and order the pairs by page
        # number (the numeric file stem).
        image_names = [(path, os.path.basename(path)) for path in images_pdf_list]
        try:
            sorted_names = sorted(image_names, key=lambda x: int(Path(x[1]).stem))
        except ValueError as e:
            # A non-numeric file name was found; keep the unsorted order.
            sorted_names = image_names
            print(f"Error sorting images: {e}")

        zip_path = DIRECTORY_OUTPUT / "all_photos.zip"
        zip_success, zip_error = zip_folder(DIRECTORY, zip_path)

        if not zip_success:
            # The gallery is still usable even though the download is not.
            return (
                sorted_names,
                None,
                f"Images extracted but zip creation failed: {zip_error}"
            )

        page_count = len(images_pdf_list)
        return (
            sorted_names,
            str(zip_path),
            f"Successfully extracted {page_count} page{'s' if page_count != 1 else ''} from PDF."
        )

    except Exception as e:
        error_details = traceback.format_exc()
        return [], None, f"Unexpected error: {str(e)}\n\nDetails: {error_details}"

# Build the Gradio UI: a header, an upload/controls area, and the results
# (status text, page gallery, zip download). Components are laid out in the
# order and nesting in which they are created below.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Page header.
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("""
            # PDF Image Extractor
            Upload a PDF file to extract all pages as images.
            """)
    
    with gr.Tab("PDF Extractor"):
        # Input area: PDF upload plus the two action buttons.
        with gr.Row():
            with gr.Column(scale=1):
                file_pdf = gr.File(
                    file_types=['.pdf'], 
                    label="Upload PDF file"
                )
                
                with gr.Row():
                    btn = gr.Button("Extract Images", variant="primary")
                    clear_btn = gr.Button("Clear")
        
        # Results area: these three components are the outputs of both
        # button handlers wired up below.
        with gr.Column():
            status = gr.Textbox(
                label="Status", 
                value="Upload a PDF and click 'Extract Images'",
                visible=True
            )
            gallery = gr.Gallery(
                label="Extracted Pages",
                show_label=True,
                elem_id="gallery",
                columns=3,
                object_fit="contain",
                height="auto"
            )
            # A File component used to hand the generated zip to the user.
            download_btn = gr.File(
                label="Download All Images (ZIP)",
                visible=True
            )
    
    # Event handlers
    # The output order must match the (gallery items, zip path, status
    # message) tuple returned by extract_photos_from_pdf.
    btn.click(
        fn=extract_photos_from_pdf,
        inputs=[file_pdf],
        outputs=[gallery, download_btn, status],
        api_name="extract"
    )
    
    def clear_outputs():
        """Return reset values for the gallery, the download file and the status box."""
        return [], None, "Cleared. Upload a PDF to begin."
    
    # Resets only the three output components; the uploaded file is not an
    # output here, and this handler does not touch the working directories.
    clear_btn.click(
        fn=clear_outputs,
        inputs=[],
        outputs=[gallery, download_btn, status]
    )

    # Example for demonstration
    # Registered only when the sample PDF exists; the path is relative to the
    # process's current working directory.
    example_path = "./examples/sample.pdf"
    if os.path.exists(example_path):
        gr.Examples(
            examples=[[example_path]],
            fn=extract_photos_from_pdf,
            inputs=[file_pdf],
            outputs=[gallery, download_btn, status],
            cache_examples=False
        )

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()