Spaces:

leonarb
/

olmocr-demo

Running

App Files Files Community

olmocr-demo / app.py

leonarb

Reset all fixes

6a0411c verified 3 months ago

raw

history blame

2.15 kB

	import html
	import os
	import tempfile
	import uuid
	from pathlib import Path

	import gradio as gr
	from ebooklib import epub
	from olmocr import process_pdf # your forked olmocr model

	def process_pdf_to_epub(pdf_path, title="Untitled", author="Unknown"):
	    """Convert a PDF into an EPUB with one chapter per page that yielded text.

	    Args:
	        pdf_path: Location of the PDF as a ``str`` or ``os.PathLike``, or an
	            upload object that exposes the path as ``.name`` (some Gradio
	            versions hand the callback a temp-file wrapper, not a string).
	        title: Book title; a blank value falls back to ``"Untitled"``.
	        author: Book author; a blank value falls back to ``"Unknown"``.

	    Returns:
	        Path of the generated ``.epub`` file, created under ``/tmp``.

	    Raises:
	        ValueError: If no PDF was supplied, or if no page produced any text.
	    """
	    if pdf_path is None:
	        raise ValueError("No PDF file provided.")
	    if not isinstance(pdf_path, (str, os.PathLike)):
	        # Upload wrappers carry the on-disk location in ``.name``.
	        pdf_path = getattr(pdf_path, "name", pdf_path)

	    print(f"Processing PDF: {pdf_path}")

	    book = epub.EpubBook()
	    # A fresh identifier per conversion keeps reading apps from treating
	    # every generated book as the same publication.
	    book.set_identifier(f"urn:uuid:{uuid.uuid4()}")
	    book.set_title(title or "Untitled")
	    book.set_language("en")
	    book.add_author(author or "Unknown")

	    chapters = []

	    # The OCR scratch directory is only needed while the page text is read,
	    # so it is removed as soon as the loop finishes (or fails).
	    with tempfile.TemporaryDirectory() as output_dir:
	        results = process_pdf(pdf_path, output_dir)
	        for page_number, result in enumerate(results or [], start=1):
	            # Tolerate a missing or null "decoded_content" / "natural_text".
	            decoded = result.get("decoded_content") or {}
	            text = decoded.get("natural_text") or ""
	            if not text:
	                continue

	            chapter = epub.EpubHtml(
	                title=f"Page {page_number}",
	                file_name=f"page_{page_number}.xhtml",
	                lang="en",
	            )
	            # Escape markup characters first so OCR text such as "a < b" or
	            # "R&D" cannot break the XHTML; only then turn newlines into breaks.
	            safe_text = html.escape(text, quote=False).replace("\n", "<br/>")
	            chapter.content = f"<h1>Page {page_number}</h1><p>{safe_text}</p>"
	            book.add_item(chapter)
	            chapters.append(chapter)

	            print(f"Processed page {page_number}")

	    if not chapters:
	        raise ValueError("No content extracted from PDF.")

	    book.toc = tuple(chapters)
	    book.add_item(epub.EpubNcx())
	    book.add_item(epub.EpubNav())
	    book.spine = ["nav"] + chapters

	    # Reserve a unique file name, then close the handle before ebooklib
	    # opens the same path for writing.
	    with tempfile.NamedTemporaryFile(delete=False, suffix=".epub", dir="/tmp") as tmp:
	        epub_path = tmp.name
	    epub.write_epub(epub_path, book)
	    print(f"EPUB written to {epub_path}")
	    return epub_path

	# Gradio UI: a PDF upload plus title/author text fields go in, and the
	# path returned by process_pdf_to_epub is offered back as a download.
	title_input = gr.Textbox(label="EPUB Title", value="Untitled")
	author_input = gr.Textbox(label="Author", value="Unknown")
	file_input = gr.File(label="Upload PDF", file_types=[".pdf"])

	output_file = gr.File(label="Download EPUB")

	# The input order must match the callback signature: (pdf_path, title, author).
	iface = gr.Interface(
	    fn=process_pdf_to_epub,
	    inputs=[file_input, title_input, author_input],
	    outputs=output_file,
	    title="PDF to EPUB Converter with olmOCR",
	    # The converter does not generate a cover, so the description only
	    # promises what the callback actually produces.
	    description="Upload a PDF to convert it into an EPUB. Each page with extracted text becomes a chapter.",
	)

	if __name__ == "__main__":
	    # Launch settings gathered in one mapping; /tmp is listed in
	    # allowed_paths because that is where the generated EPUB files are written.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": True,
	        "debug": True,
	        "allowed_paths": ["/tmp"],
	    }
	    iface.launch(**launch_options)