Rivalcoder
Add First basic Version
192b91e
import fitz # PyMuPDF
import requests
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor
def _extract_text(page):
    """
    Return the text of a single page with surrounding whitespace removed.

    ``page`` only needs a ``get_text()`` method. ``None`` is returned when
    that call yields nothing or only whitespace, so callers can drop blank
    pages with a simple truthiness check.
    """
    raw = page.get_text()
    if not raw:
        return None
    cleaned = raw.strip()
    # An all-whitespace page collapses to "" here; report it as None.
    return cleaned or None
def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1, timeout=30):
    """
    Download a PDF from a URL and return its text per page or per page group.

    Args:
        url: Address the PDF is fetched from with an HTTP GET.
        max_workers: Size of the thread pool used to run ``_extract_text``
            over the pages.
        chunk_size: When greater than 1, the non-empty texts of every
            ``chunk_size`` consecutive pages are joined with a single space
            into one entry. Any other value yields one entry per page.
        timeout: Value forwarded to ``requests.get(timeout=...)``. It bounds
            how long a connect or read may stall, not the total download
            time. Pass ``None`` to wait indefinitely (the previous behavior).

    Returns:
        A list of non-empty strings. Pages (or chunks) without any text are
        omitted, so the list can be shorter than the page count.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: The download failed or timed out.
    """
    res = requests.get(url, timeout=timeout)
    # Fail on HTTP errors here instead of handing an error page to the PDF
    # parser, which would report a misleading "broken document" error.
    res.raise_for_status()

    with fitz.open(stream=BytesIO(res.content), filetype="pdf") as doc:
        pages = list(doc)
        # Step 1: Parallel text extraction. This must finish while the
        # document is still open, because the pages belong to it.
        # NOTE(review): PyMuPDF's documentation states that the library does
        # not support multithreaded use -- confirm that sharing one
        # document's pages across worker threads is safe for the installed
        # version, or set max_workers=1.
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            texts = list(executor.map(_extract_text, pages))

    # Step 2: Optional chunking
    if chunk_size > 1:
        chunks = []
        for i in range(0, len(texts), chunk_size):
            chunk = ' '.join(t for t in texts[i:i + chunk_size] if t)
            if chunk:
                chunks.append(chunk)
        return chunks

    # Default: return one chunk per page
    return [t for t in texts if t]
def parse_pdf_from_file_multithreaded(file_path, max_workers=2, chunk_size=1):
    """
    Open a local PDF and return its text per page or per group of pages.

    ``max_workers`` sets the size of the thread pool that runs
    ``_extract_text`` over the pages. When ``chunk_size`` is greater than 1,
    the non-empty texts of every ``chunk_size`` consecutive pages are joined
    with a single space into one entry; otherwise each page is its own
    entry. Entries without any text are left out of the returned list.
    """
    with fitz.open(file_path) as pdf:
        # Load every page up front so the workers only do text extraction.
        loaded_pages = list(pdf)
        # NOTE(review): PyMuPDF's documentation states that the library does
        # not support multithreaded use -- confirm this pool is safe for the
        # installed version.
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            page_texts = list(pool.map(_extract_text, loaded_pages))

    if chunk_size > 1:
        # Slice the per-page texts into consecutive windows of chunk_size
        # pages, drop blank pages inside each window, and keep only the
        # windows that still contain text.
        windows = (
            page_texts[start:start + chunk_size]
            for start in range(0, len(page_texts), chunk_size)
        )
        joined = (' '.join(filter(None, window)) for window in windows)
        return [piece for piece in joined if piece]

    # One entry per page that produced text.
    return [text for text in page_texts if text]