from concurrent.futures import ThreadPoolExecutor
from io import BytesIO

import fitz  # PyMuPDF
import requests


def _extract_text(page):
    """Return the stripped text of *page*, or None when it has no text."""
    text = page.get_text()
    if not text:
        return None
    # A whitespace-only page strips to "", which is reported as None.
    return text.strip() or None


def _chunk_texts(texts, chunk_size):
    """Group per-page texts into chunks of ``chunk_size`` consecutive pages.

    Windows are taken over page positions *before* empty pages are dropped,
    so a chunk always corresponds to a fixed range of pages. Empty pages
    (None) are skipped when joining, and chunks that end up empty are
    omitted. When ``chunk_size`` is 1 or less, one entry per non-empty
    page is returned.
    """
    if chunk_size <= 1:
        return [text for text in texts if text]

    chunks = []
    for start in range(0, len(texts), chunk_size):
        window = texts[start:start + chunk_size]
        chunk = ' '.join(text for text in window if text)
        if chunk:
            chunks.append(chunk)
    return chunks


def _extract_and_chunk(doc, max_workers, chunk_size):
    """Extract text from every page of an open document and chunk it.

    Must be called while ``doc`` is still open.
    """
    pages = list(doc)
    # NOTE(review): PyMuPDF's documentation states that it does not support
    # multithreaded use; confirm this is safe with the installed version
    # before relying on max_workers > 1.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map preserves input order, so texts[i] belongs to page i.
        texts = list(executor.map(_extract_text, pages))
    return _chunk_texts(texts, chunk_size)


def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1,
                                     timeout=30):
    """Download a PDF from ``url`` and return its text as a list of chunks.

    Args:
        url: Address of the PDF to download.
        max_workers: Size of the thread pool used for page extraction.
        chunk_size: Number of consecutive pages joined (with a single space)
            into each returned chunk. Values of 1 or less return one entry
            per non-empty page.
        timeout: Timeout in seconds passed to ``requests.get``, so that a
            stalled server cannot block the call forever.

    Returns:
        A list of non-empty text strings.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: For other download failures, including
            timeouts.
    """
    res = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx here instead of feeding an error page to the PDF parser.
    res.raise_for_status()
    with fitz.open(stream=BytesIO(res.content), filetype="pdf") as doc:
        return _extract_and_chunk(doc, max_workers, chunk_size)


def parse_pdf_from_file_multithreaded(file_path, max_workers=2, chunk_size=1):
    """Parse a local PDF file and return its text as a list of chunks.

    Args:
        file_path: Path of the PDF file to open.
        max_workers: Size of the thread pool used for page extraction.
        chunk_size: Number of consecutive pages joined (with a single space)
            into each returned chunk. Values of 1 or less return one entry
            per non-empty page.

    Returns:
        A list of non-empty text strings.
    """
    with fitz.open(file_path) as doc:
        return _extract_and_chunk(doc, max_workers, chunk_size)