# Spaces: Running
import html
import os
import shutil
import tempfile
import uuid
from pathlib import Path

import gradio as gr
from ebooklib import epub
from olmocr import process_pdf  # your forked olmocr model
def process_pdf_to_epub(pdf_path, title="Untitled", author="Unknown"):
    """Convert a PDF into an EPUB with one chapter per page of extracted text.

    Args:
        pdf_path: Path to the PDF, or an uploaded-file object exposing the
            path as ``.name`` (some Gradio versions pass such a wrapper).
        title: Title stored in the EPUB metadata.
        author: Author stored in the EPUB metadata.

    Returns:
        Filesystem path of the generated ``.epub`` file under ``/tmp``.

    Raises:
        ValueError: If no PDF was supplied or no page yielded any text.
    """
    if pdf_path is None:
        raise ValueError("No PDF file provided.")

    # Plain paths pass through untouched; anything else is treated as an
    # upload wrapper. (Path objects also have ``.name``, but there it is only
    # the final component, so they must not go through the getattr branch.)
    if isinstance(pdf_path, (str, os.PathLike)):
        source_path = pdf_path
    else:
        source_path = getattr(pdf_path, "name", pdf_path)
    print(f"Processing PDF: {source_path}")

    book = epub.EpubBook()
    # A fresh UUID per book: a fixed identifier makes every generated EPUB
    # look like the same publication to reading apps.
    book.set_identifier(f"urn:uuid:{uuid.uuid4()}")
    book.set_title(title)
    book.set_language("en")
    book.add_author(author)

    chapters = []
    output_dir = tempfile.mkdtemp()
    try:
        # NOTE(review): assumes process_pdf yields one dict per page shaped
        # like {"decoded_content": {"natural_text": str}} -- confirm against
        # the olmocr fork.
        results = process_pdf(source_path, output_dir)
        for i, result in enumerate(results):
            page_no = i + 1
            # `or {}` / `or ""` also cover keys that are present but None.
            decoded = result.get("decoded_content") or {}
            text = decoded.get("natural_text") or ""
            if not text:
                continue
            # Escape before inserting markup so stray '<' or '&' in the
            # extracted text cannot break the XHTML chapter.
            safe_text = html.escape(text, quote=False).replace("\n", "<br/>")
            chapter = epub.EpubHtml(
                title=f"Page {page_no}",
                file_name=f"page_{page_no}.xhtml",
                lang="en",
            )
            chapter.content = f"<h1>Page {page_no}</h1><p>{safe_text}</p>"
            book.add_item(chapter)
            chapters.append(chapter)
            print(f"Processed page {page_no}")
    finally:
        # The scratch directory is only needed while pages are being read.
        shutil.rmtree(output_dir, ignore_errors=True)

    if not chapters:
        raise ValueError("No content extracted from PDF.")

    book.toc = tuple(chapters)
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    book.spine = ["nav"] + chapters

    # Reserve a unique file name, then close the handle before ebooklib
    # writes to that path. delete=False keeps the file for the caller.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".epub", dir="/tmp") as tmp:
        epub_path = tmp.name
    epub.write_epub(epub_path, book)
    print(f"EPUB written to {epub_path}")
    return epub_path
# Gradio UI: a PDF upload plus title/author text fields in, the EPUB file out.
title_input = gr.Textbox(label="EPUB Title", value="Untitled")
author_input = gr.Textbox(label="Author", value="Unknown")
file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
output_file = gr.File(label="Download EPUB")

# Input order must match the positional parameters of process_pdf_to_epub
# (pdf_path, title, author).
iface = gr.Interface(
    fn=process_pdf_to_epub,
    inputs=[file_input, title_input, author_input],
    outputs=output_file,
    title="PDF to EPUB Converter with olmOCR",
    # The converter never sets a cover image, so the description must not
    # promise one.
    description="Upload a PDF to convert it into an EPUB.",
)
if __name__ == "__main__":
    # Launch settings gathered in one place so they are easy to review.
    launch_options = {
        "server_name": "0.0.0.0",  # bind on all network interfaces
        "server_port": 7860,
        "share": True,
        "debug": True,
        # The generated EPUB is written under /tmp, so that directory is
        # listed among the paths Gradio may serve files from.
        "allowed_paths": ["/tmp"],
    }
    iface.launch(**launch_options)