# Spaces: Runtime error
import fitz # PyMuPDF | |
import requests | |
from io import BytesIO | |
from concurrent.futures import ThreadPoolExecutor | |
def _extract_text(page):
    """Return the stripped text of a single PDF page, or None if it is empty.

    Args:
        page: Object exposing a ``get_text()`` method (callers in this file
            pass the pages obtained by iterating a ``fitz`` document).

    Returns:
        The page text with leading/trailing whitespace removed, or ``None``
        when ``get_text()`` returns a falsy value or only whitespace.
    """
    # Strip exactly once; `or ""` covers a falsy (e.g. None) result.
    text = (page.get_text() or "").strip()
    return text or None
def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1,
                                     timeout=30):
    """Download a PDF from a URL and return its text, per page or per chunk.

    Args:
        url: Address of the PDF to download.
        max_workers: Number of worker threads used for text extraction.
        chunk_size: Number of consecutive pages merged into one output
            string. Values of 1 or less return one string per page.
        timeout: Seconds passed to ``requests.get`` as its timeout so a
            stalled server cannot block the caller indefinitely.

    Returns:
        A list of non-empty strings. With ``chunk_size > 1`` each string is
        the space-joined text of up to ``chunk_size`` consecutive pages
        (pages without text are skipped inside a chunk, and chunks that end
        up empty are dropped); otherwise one string per page that has text.

    Raises:
        requests.exceptions.RequestException: If the download fails, times
            out, or the server answers with an HTTP error status.
    """
    res = requests.get(url, timeout=timeout)
    # Fail on HTTP errors here instead of handing an error page to the
    # PDF parser.
    res.raise_for_status()

    # Pages are only usable while their document is open, so all extraction
    # happens inside this context.
    with fitz.open(stream=BytesIO(res.content), filetype="pdf") as doc:
        pages = list(doc)
        # NOTE(review): the PyMuPDF documentation states the library does not
        # support multithreaded use -- confirm this pool is safe for the
        # deployed version, or switch to sequential/per-process extraction.
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # executor.map keeps page order; entries are None for empty pages.
            texts = list(executor.map(_extract_text, pages))

    if chunk_size > 1:
        # Group by page position first (empty pages still occupy a slot),
        # then drop the empty pages inside each group.
        joined = (
            ' '.join(t for t in texts[i:i + chunk_size] if t)
            for i in range(0, len(texts), chunk_size)
        )
        return [chunk for chunk in joined if chunk]

    # Default: one entry per page that actually contains text.
    return [t for t in texts if t]
def parse_pdf_from_file_multithreaded(file_path, max_workers=2, chunk_size=1):
    """Parse a local PDF file and return its text, per page or per chunk.

    Args:
        file_path: Path of the PDF file, passed straight to ``fitz.open``.
        max_workers: Number of worker threads used for text extraction.
        chunk_size: Number of consecutive pages merged into one output
            string. Values of 1 or less return one string per page.

    Returns:
        A list of non-empty strings. With ``chunk_size > 1`` each string is
        the space-joined text of up to ``chunk_size`` consecutive pages
        (pages without text are skipped inside a chunk, and chunks that end
        up empty are dropped); otherwise one string per page that has text.
    """
    # Pages are only usable while their document is open, so all extraction
    # happens inside this context.
    with fitz.open(file_path) as doc:
        pages = list(doc)
        # NOTE(review): the PyMuPDF documentation states the library does not
        # support multithreaded use -- confirm this pool is safe for the
        # deployed version, or switch to sequential/per-process extraction.
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # executor.map keeps page order; entries are None for empty pages.
            texts = list(executor.map(_extract_text, pages))

    if chunk_size > 1:
        # Group by page position first (empty pages still occupy a slot),
        # then drop the empty pages inside each group.
        joined = (
            ' '.join(t for t in texts[i:i + chunk_size] if t)
            for i in range(0, len(texts), chunk_size)
        )
        return [chunk for chunk in joined if chunk]

    # Default: one entry per page that actually contains text.
    return [t for t in texts if t]