import gradio as gr import tempfile from io import BytesIO from PIL import Image from ebooklib import epub from olmocr.model import process_pdf # your forked olmocr model def process_pdf_to_epub(pdf_file, title, author): # Run the OCR + LLM pipeline from olmocr print("Starting PDF processing...") page_results = process_pdf(pdf_file.name) # Create the EPUB book book = epub.EpubBook() book.set_identifier("id123456") book.set_title(title if title else "Untitled Document") book.set_language("en") if author: book.add_author(author) # Try to use the first page as cover try: with Image.open(pdf_file.name) as img: img.convert("RGB").save("cover.jpg", "JPEG") with open("cover.jpg", "rb") as f: cover_data = f.read() book.set_cover("cover.jpg", cover_data) except Exception as e: print("Could not generate cover:", e) # Add chapters from pages chapters = [] for i, page in enumerate(page_results): text = page.get("decoded", {}).get("natural_text", "") if not text.strip(): continue chapter = epub.EpubHtml(title=f"Page {i+1}", file_name=f"page_{i+1}.xhtml", lang="en") chapter.content = f"
{text.replace('\n', '
')}