# mineru2 / runpod_handler.py
# marcosremar2 — commit 4112422:
# Add RunPod serverless configuration with GitHub integration
import runpod
import tempfile
import os
import sys
import json
import base64
from pathlib import Path
from loguru import logger
# Add current directory to path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# Import MinerU converter
from pdf_converter_mineru import PdfConverter
# Process-wide converter instance; created lazily by initialize_converter()
# so the models are only loaded once per worker process.
CONVERTER = None


def _detect_device():
    """Return the device name ("cuda" or "cpu") to put in the converter config.

    The ``MINERU_DEVICE`` environment variable, when set to a non-empty value,
    is returned verbatim. Otherwise the presence of any ``/dev/nvidia<N>``
    device node selects "cuda"; with none present the result is "cpu".
    """
    override = os.environ.get("MINERU_DEVICE")
    if override:
        return override
    # Look for any numbered NVIDIA device node rather than only /dev/nvidia0:
    # the GPU exposed to this process is not guaranteed to be index 0.
    has_gpu_node = any(Path("/dev").glob("nvidia[0-9]*"))
    return "cuda" if has_gpu_node else "cpu"


def initialize_converter():
    """Create the module-level PDF converter on first use.

    Subsequent calls are no-ops that reuse the existing instance. The model
    directory is read from ``MINERU_MODEL_PATH`` (default ``/app/models``).

    Returns:
        The shared converter instance stored in ``CONVERTER``.
    """
    global CONVERTER
    if CONVERTER is not None:
        return CONVERTER

    logger.info("Initializing MinerU converter...")
    config = {
        "model_dir": os.environ.get("MINERU_MODEL_PATH", "/app/models"),
        "output_dir": "/tmp/mineru_output",
        "device": _detect_device(),
        "parse_method": "auto",
        "debug": False,
    }
    # Assign only after construction succeeds, so a failed initialization
    # leaves CONVERTER as None and the next call retries.
    CONVERTER = PdfConverter(config)
    logger.info("MinerU converter initialized successfully")
    return CONVERTER
def _read_converted_text(output_dir):
    """Return the text of the first converted document found under *output_dir*.

    Markdown files anywhere in the tree are preferred; if there are none, the
    first ``.txt`` file inside a ``txt`` directory is used. When neither
    exists a placeholder heading is returned instead.
    """
    root = Path(output_dir)
    for pattern in ("**/*.md", "**/txt/*.txt"):
        first_match = next(root.glob(pattern), None)
        if first_match is not None:
            return first_match.read_text(encoding="utf-8")
    return "# Conversion completed but no markdown found"


def handler(job):
    """
    RunPod serverless handler for PDF to Markdown conversion.

    Args:
        job: Mapping with an ``"input"`` mapping that carries
            ``"pdf_base64"`` (base64-encoded PDF bytes, required) and
            ``"filename"`` (optional, defaults to ``"document.pdf"``).

    Returns:
        On success, a dict with ``"markdown"``, ``"filename"``,
        ``"status": "success"`` and a rough ``"pages"`` count. On failure, a
        dict with ``"error"`` and ``"status": "failed"`` (plus ``"filename"``
        when the conversion step itself failed). Exceptions are never
        propagated to the caller.
    """
    pdf_path = None
    try:
        # Validate and decode the request before touching the converter, so a
        # malformed job is rejected without paying the model-loading cost.
        job_input = job.get("input") or {}
        pdf_base64 = job_input.get("pdf_base64")
        filename = job_input.get("filename", "document.pdf")
        if not pdf_base64:
            return {"error": "No PDF data provided", "status": "failed"}

        pdf_data = base64.b64decode(pdf_base64)

        # Initialize converter on first run
        initialize_converter()

        # The converter is given a filesystem path, so the bytes are spooled to
        # a named temp file that outlives the `with` block (delete=False).
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
            pdf_path = tmp_file.name
            tmp_file.write(pdf_data)

        logger.info(f"Processing PDF: {filename} ({len(pdf_data)} bytes)")

        try:
            output_dir = CONVERTER.convert_single_pdf(pdf_path)
            markdown_content = _read_converted_text(output_dir)
        except Exception as conv_error:
            logger.exception(f"Conversion error: {conv_error}")
            return {
                "error": f"Conversion failed: {conv_error}",
                "filename": filename,
                "status": "failed",
            }

        return {
            "markdown": markdown_content,
            "filename": filename,
            "status": "success",
            # Rough page count: number of "\n---\n" separators plus one.
            "pages": markdown_content.count("\n---\n") + 1,
        }
    except Exception as e:
        logger.exception(f"Handler error: {e}")
        return {
            "error": str(e),
            "status": "failed",
        }
    finally:
        # Remove the temp PDF on every exit path, not just on success, so
        # failed conversions do not accumulate files in the temp directory.
        if pdf_path is not None:
            try:
                os.unlink(pdf_path)
            except OSError as cleanup_error:
                logger.warning(
                    f"Could not remove temporary file {pdf_path}: {cleanup_error}"
                )
# RunPod serverless entrypoint: register `handler` under the "handler" key
# of the config passed to the RunPod serverless worker.
worker_config = {"handler": handler}
runpod.serverless.start(worker_config)