# Page header captured together with this file (not part of the program):
#   Spaces status: Runtime error
#   File size: 1,901 Bytes
#   Revision id (as shown): 192b91e
import fitz # PyMuPDF
import requests
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor
def _extract_text(page):
    """Return the stripped text of *page*, or ``None`` if it has no text.

    Args:
        page: Object exposing a ``get_text()`` method (callers in this file
            pass pages obtained from a ``fitz`` document).

    Returns:
        The result of ``page.get_text()`` with leading and trailing
        whitespace removed, or ``None`` when that result is ``None``, empty,
        or whitespace only.
    """
    # ``or ""`` covers a falsy result so ``strip`` runs exactly once.
    text = (page.get_text() or "").strip()
    return text or None
def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1, timeout=30):
    """Download a PDF from *url* and return its text, optionally chunked.

    Args:
        url: Address of the PDF to download.
        max_workers: Accepted for backward compatibility and ignored; pages
            are extracted one at a time (see the note in the body).
        chunk_size: Number of consecutive pages joined (with a single space)
            into one returned string. Values of 1 or less return one string
            per page.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        A list of non-empty strings in page order. Pages (or chunks) that
        contain no text are omitted.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: For other download failures, including
            timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx here rather than handing an error page to the PDF parser.
    response.raise_for_status()

    with fitz.open(stream=BytesIO(response.content), filetype="pdf") as doc:
        # Pages are read while the document is still open, and sequentially:
        # the PyMuPDF documentation states the library does not support being
        # used from multiple threads.
        texts = [_extract_text(page) for page in doc]

    if chunk_size > 1:
        chunks = []
        # Group by page position first; empty pages are dropped inside a group.
        for start in range(0, len(texts), chunk_size):
            chunk = " ".join(t for t in texts[start:start + chunk_size] if t)
            if chunk:
                chunks.append(chunk)
        return chunks

    # Default: one entry per page that has text.
    return [text for text in texts if text]
def parse_pdf_from_file_multithreaded(file_path, max_workers=2, chunk_size=1):
    """Read a local PDF and return its text, optionally chunked by page.

    Args:
        file_path: Path of the PDF, passed to ``fitz.open``.
        max_workers: Accepted for backward compatibility and ignored; pages
            are extracted one at a time (see the note in the body).
        chunk_size: Number of consecutive pages joined (with a single space)
            into one returned string. Values of 1 or less return one string
            per page.

    Returns:
        A list of non-empty strings in page order. Pages (or chunks) that
        contain no text are omitted.
    """
    with fitz.open(file_path) as doc:
        # Pages are read while the document is still open, and sequentially:
        # the PyMuPDF documentation states the library does not support being
        # used from multiple threads.
        texts = [_extract_text(page) for page in doc]

    if chunk_size > 1:
        chunks = []
        # Group by page position first; empty pages are dropped inside a group.
        for start in range(0, len(texts), chunk_size):
            chunk = " ".join(t for t in texts[start:start + chunk_size] if t)
            if chunk:
                chunks.append(chunk)
        return chunks

    # Default: one entry per page that has text.
    return [text for text in texts if text]
|