Spaces:

ArthyP
/

technical-rag-assistant

Running

technical-rag-assistant / src /batch_document_processor.py

Arthur Passuello

Debug

163ed2c 2 months ago

21.6 kB

	#!/usr/bin/env python3
	"""
	Batch Document Processing for Scaled Knowledge Base

	This module handles efficient processing of multiple documents (20-50+)
	to significantly expand the RAG system's knowledge coverage.
	"""

	import sys
	from pathlib import Path
	import json
	import time
	import logging
	from typing import List, Dict, Any, Optional, Tuple
	from dataclasses import dataclass
	from datetime import datetime
	import threading
	from concurrent.futures import ThreadPoolExecutor, as_completed
	import hashlib

	# Put two ancestors of this file at the front of the import search path,
	# presumably so the `src.`-prefixed imports that follow can be resolved.
	_two_levels_up = Path(__file__).parent.parent
	project_root = _two_levels_up.parent  # three levels above this file
	sys.path.insert(0, str(project_root))
	sys.path.insert(0, str(_two_levels_up))  # inserted last, so it is searched first

	from src.rag_with_generation import RAGWithGeneration
	from src.shared_utils.document_processing.hybrid_parser import HybridParser

	logger = logging.getLogger(__name__)


	@dataclass
	class DocumentProcessingResult:
	    """
	    Outcome record for one document handled by the batch processor.

	    Attributes:
	        filename: Base name of the processed file.
	        success: False when processing raised; skipped duplicates are
	            reported with True.
	        chunks_created: Number of chunks attributed to this document
	            (0 for failures and skipped duplicates).
	        processing_time: Elapsed seconds measured with ``time.time()``.
	        file_size_mb: File size in bytes divided by 1024 * 1024.
	        error_message: Failure text, or a note such as "Duplicate skipped";
	            None when there is nothing to report.
	        document_metadata: Per-document metadata dict recorded by the
	            processor, if any.
	    """

	    # Required fields (positional order is part of the constructor contract).
	    filename: str
	    success: bool
	    chunks_created: int
	    processing_time: float
	    file_size_mb: float

	    # Optional details.
	    error_message: Optional[str] = None
	    document_metadata: Optional[Dict[str, Any]] = None


	@dataclass
	class BatchProcessingStats:
	    """
	    Aggregate figures describing one batch run.

	    Attributes:
	        total_documents: Number of per-document results in the batch.
	        successful_documents: Results whose ``success`` flag is True.
	        failed_documents: Results whose ``success`` flag is False.
	        total_chunks: Chunks created, summed over successful results.
	        total_processing_time: Duration of the batch in seconds.
	        total_size_mb: File sizes in MB, summed over all results.
	        avg_chunks_per_doc: ``total_chunks`` per successful document.
	        avg_processing_time_per_doc: Mean per-document seconds over
	            successful documents.
	        documents_per_minute: Results per minute of
	            ``total_processing_time``.
	    """

	    # Counts
	    total_documents: int
	    successful_documents: int
	    failed_documents: int
	    total_chunks: int

	    # Totals
	    total_processing_time: float
	    total_size_mb: float

	    # Derived rates and averages
	    avg_chunks_per_doc: float
	    avg_processing_time_per_doc: float
	    documents_per_minute: float


	class BatchDocumentProcessor:
	    """
	    Process many documents into a shared RAG system and collect statistics.

	    Each document is handed to ``rag_system.index_document``. On top of that
	    this class provides duplicate detection (MD5 of the file contents),
	    per-document timing, optional thread-pool parallelism and a
	    JSON-serialisable processing report.

	    Notes:
	        * ``memory_limit_mb`` is stored and echoed in saved reports but is
	          not enforced anywhere in this class.
	        * A lock protects this object's own bookkeeping only. Calls into
	          ``rag_system`` are not serialised.
	          NOTE(review): confirm ``index_document`` is safe to call from
	          several threads before relying on ``parallel=True``.
	    """

	    # Chunks that must accumulate (via _add_chunks_to_rag_system) since the
	    # previous intermediate rebuild before another rebuild is triggered.
	    _REBUILD_INTERVAL = 1000

	    # Read size used while hashing, so large files are never held in memory
	    # in one piece.
	    _HASH_READ_SIZE = 1024 * 1024

	    def __init__(
	        self,
	        rag_system: Optional["RAGWithGeneration"] = None,
	        max_workers: int = 4,
	        memory_limit_mb: int = 2048,
	        chunk_batch_size: int = 100
	    ):
	        """
	        Initialize batch processor.

	        Args:
	            rag_system: Existing RAG system or None to create new
	            max_workers: Maximum parallel document processing threads
	            memory_limit_mb: Memory limit for processing (recorded only)
	            chunk_batch_size: Batch size for chunk indexing
	        """
	        # Compare against None explicitly: a caller-supplied system that
	        # happens to be falsy (e.g. defines __len__ and is still empty) must
	        # not be silently replaced by a brand-new one.
	        self.rag_system = rag_system if rag_system is not None else RAGWithGeneration()
	        self.max_workers = max_workers
	        self.memory_limit_mb = memory_limit_mb
	        self.chunk_batch_size = chunk_batch_size

	        # Processing statistics
	        self.processing_results: List[DocumentProcessingResult] = []
	        self.total_chunks_processed = 0
	        self._chunks_at_last_rebuild = 0

	        # Document tracking
	        self.processed_documents: Dict[str, str] = {}  # filename -> content_hash
	        self.document_metadata: Dict[str, Dict[str, Any]] = {}

	        # Guards the tracking dicts/counters above when documents are
	        # processed from worker threads.
	        self._state_lock = threading.Lock()
	        # Hashes of documents currently being indexed, so identical files
	        # submitted in the same parallel batch are detected as duplicates.
	        self._inflight_hashes = set()

	        logger.info(f"BatchDocumentProcessor initialized with {max_workers} workers")

	    def calculate_document_hash(self, file_path: Path) -> str:
	        """
	        Calculate hash of document content for duplicate detection.

	        Args:
	            file_path: File to hash.

	        Returns:
	            Hex MD5 digest of the file contents. If the file cannot be
	            hashed, a warning is logged and ``str(file_path)`` is returned
	            instead, so this method never raises for unreadable files.
	        """
	        try:
	            digest = hashlib.md5()
	            with open(file_path, 'rb') as f:
	                # Stream the file instead of reading it whole.
	                while True:
	                    block = f.read(self._HASH_READ_SIZE)
	                    if not block:
	                        break
	                    digest.update(block)
	            return digest.hexdigest()
	        except Exception as e:
	            logger.warning(f"Failed to calculate hash for {file_path}: {e}")
	            return str(file_path)

	    def process_single_document(
	        self,
	        file_path: Path,
	        skip_duplicates: bool = True
	    ) -> "DocumentProcessingResult":
	        """
	        Process a single document with error handling and statistics.

	        Args:
	            file_path: Path to document (a plain string is accepted too)
	            skip_duplicates: Skip if document already processed

	        Returns:
	            DocumentProcessingResult with processing statistics. Failures are
	            reported through ``success=False`` / ``error_message`` rather
	            than raised. A skipped duplicate is reported with
	            ``success=True``, zero chunks and the message
	            "Duplicate skipped".
	        """
	        start_time = time.time()
	        file_path = Path(file_path)
	        filename = file_path.name

	        # Bound up front so the error path and the metadata below never
	        # depend on how far the try block got.
	        file_size_mb = 0.0
	        content_hash: Optional[str] = None
	        hash_reserved = False

	        try:
	            # Get file size
	            file_size_mb = file_path.stat().st_size / (1024 * 1024)

	            # Check for duplicates
	            if skip_duplicates:
	                content_hash = self.calculate_document_hash(file_path)
	                with self._state_lock:
	                    is_duplicate = (
	                        content_hash in self._inflight_hashes
	                        or content_hash in self.processed_documents.values()
	                    )
	                    if not is_duplicate:
	                        # Reserve the hash while indexing is in progress so
	                        # a concurrent identical file is skipped.
	                        self._inflight_hashes.add(content_hash)
	                        hash_reserved = True

	                if is_duplicate:
	                    logger.info(f"Skipping duplicate document: {filename}")
	                    return DocumentProcessingResult(
	                        filename=filename,
	                        success=True,
	                        chunks_created=0,
	                        processing_time=time.time() - start_time,
	                        file_size_mb=file_size_mb,
	                        error_message="Duplicate skipped"
	                    )

	            # Process document
	            logger.info(f"Processing document: {filename} ({file_size_mb:.2f} MB)")

	            # Use the RAG system's index_document method which handles the hybrid parser correctly
	            original_chunk_count = len(self.rag_system.chunks)
	            chunk_count = self.rag_system.index_document(file_path)

	            if chunk_count == 0:
	                raise ValueError("No chunks extracted from document")

	            # Chunks are already added by rag_system.index_document(); the
	            # number created is taken from the growth of the shared list.
	            # NOTE(review): with several threads indexing at once this growth
	            # may include chunks from other documents.
	            chunks_created = max(0, len(self.rag_system.chunks) - original_chunk_count)

	            if chunks_created > 0:
	                metadata = {
	                    'file_path': str(file_path),
	                    'chunks_count': chunks_created,
	                    'file_size_mb': file_size_mb,
	                    'processing_time': time.time() - start_time,
	                    'content_hash': content_hash if skip_duplicates else None,
	                    'processed_at': datetime.now().isoformat()
	                }
	                with self._state_lock:
	                    self.document_metadata[filename] = metadata
	                    self.total_chunks_processed += chunks_created
	                    if skip_duplicates:
	                        self.processed_documents[filename] = content_hash

	            processing_time = time.time() - start_time

	            logger.info(f"Successfully processed {filename}: {chunks_created} chunks in {processing_time:.2f}s")

	            return DocumentProcessingResult(
	                filename=filename,
	                success=True,
	                chunks_created=chunks_created,
	                processing_time=processing_time,
	                file_size_mb=file_size_mb,
	                document_metadata=self.document_metadata.get(filename)
	            )

	        except Exception as e:
	            processing_time = time.time() - start_time
	            error_msg = str(e)

	            logger.error(f"Failed to process {filename}: {error_msg}")

	            return DocumentProcessingResult(
	                filename=filename,
	                success=False,
	                chunks_created=0,
	                processing_time=processing_time,
	                file_size_mb=file_size_mb,
	                error_message=error_msg
	            )

	        finally:
	            # Release the in-flight reservation. On success the hash is
	            # already recorded in processed_documents; on failure the
	            # document may be retried later.
	            if hash_reserved:
	                with self._state_lock:
	                    self._inflight_hashes.discard(content_hash)

	    def _add_chunks_to_rag_system(self, chunks: List[Dict[str, Any]], source_filename: str):
	        """
	        Add chunks to RAG system in batches for memory efficiency.

	        Each chunk dict is tagged in place with
	        ``metadata['batch_source'] = source_filename`` before being appended
	        to ``rag_system.chunks``.

	        Args:
	            chunks: Chunk dicts to append.
	            source_filename: Name recorded as the chunks' batch source.

	        Raises:
	            Exception: Any error is logged and re-raised unchanged.
	        """
	        try:
	            # A batch size below 1 would make range() fail (0) or silently
	            # add nothing (negative); fall back to one chunk per batch.
	            batch_size = max(1, self.chunk_batch_size)

	            # Process chunks in batches
	            for start in range(0, len(chunks), batch_size):
	                batch = chunks[start:start + batch_size]

	                # Add source information to each chunk
	                for chunk in batch:
	                    chunk.setdefault('metadata', {})['batch_source'] = source_filename

	                # Add batch to RAG system
	                self.rag_system.chunks.extend(batch)
	                self.total_chunks_processed += len(batch)

	                logger.debug(f"Added batch of {len(batch)} chunks from {source_filename}")

	            # Rebuild indices after adding all chunks from document
	            self._rebuild_indices_if_needed()

	        except Exception as e:
	            logger.error(f"Failed to add chunks from {source_filename} to RAG system: {e}")
	            raise

	    def _rebuild_indices_if_needed(self):
	        """
	        Rebuild search indices periodically for efficiency.

	        A rebuild happens once at least ``_REBUILD_INTERVAL`` chunks have
	        been counted since the previous intermediate rebuild. Rebuild errors
	        are logged as warnings and not raised.
	        """
	        chunks_since_rebuild = (
	            self.total_chunks_processed - getattr(self, '_chunks_at_last_rebuild', 0)
	        )
	        # Compare against the amount added since the last rebuild; testing
	        # for an exact multiple of the interval would almost never fire
	        # because documents contribute arbitrary chunk counts.
	        if chunks_since_rebuild < self._REBUILD_INTERVAL:
	            return

	        logger.info(f"Rebuilding indices after {self.total_chunks_processed} chunks")
	        try:
	            # Check for the builder methods themselves, matching
	            # _rebuild_final_indices.
	            if hasattr(self.rag_system, '_build_dense_index'):
	                self.rag_system._build_dense_index()

	            if hasattr(self.rag_system, '_build_sparse_index'):
	                self.rag_system._build_sparse_index()

	            self._chunks_at_last_rebuild = self.total_chunks_processed
	            logger.info("Indices rebuilt successfully")
	        except Exception as e:
	            logger.warning(f"Failed to rebuild indices: {e}")

	    def process_document_collection(
	        self,
	        document_paths: List[Path],
	        parallel: bool = True,
	        skip_duplicates: bool = True,
	        progress_callback: Optional[callable] = None
	    ) -> "BatchProcessingStats":
	        """
	        Process a collection of documents efficiently.

	        Previous results in ``self.processing_results`` are discarded, every
	        document is processed, the final indices are rebuilt and aggregate
	        statistics are returned.

	        Args:
	            document_paths: List of document file paths
	            parallel: Use a thread pool when more than one path is given
	            skip_duplicates: Skip duplicate documents
	            progress_callback: Optional ``callback(completed, total)`` invoked
	                after every document, whether it succeeded or failed. An
	                exception raised by the callback propagates to the caller.

	        Returns:
	            BatchProcessingStats with comprehensive metrics
	        """
	        start_time = time.time()
	        self.processing_results = []
	        total = len(document_paths)

	        logger.info(f"Starting batch processing of {total} documents")
	        logger.info(f"Parallel processing: {parallel}, Max workers: {self.max_workers}")

	        if parallel and total > 1:
	            # Parallel processing
	            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
	                # Submit all tasks
	                future_to_path = {
	                    executor.submit(self.process_single_document, path, skip_duplicates): path
	                    for path in document_paths
	                }

	                # Collect results as they complete
	                for future in as_completed(future_to_path):
	                    path = future_to_path[future]
	                    try:
	                        result = future.result()
	                    except Exception as e:
	                        logger.error(f"Failed to process {path}: {e}")
	                        result = DocumentProcessingResult(
	                            filename=Path(path).name,
	                            success=False,
	                            chunks_created=0,
	                            processing_time=0.0,
	                            file_size_mb=0.0,
	                            error_message=str(e)
	                        )

	                    # Exactly one result per document. The callback runs
	                    # outside the try above so that a failing callback is
	                    # not mistaken for a failed document.
	                    self.processing_results.append(result)
	                    if progress_callback:
	                        progress_callback(len(self.processing_results), total)
	        else:
	            # Sequential processing
	            for completed, path in enumerate(document_paths, start=1):
	                self.processing_results.append(
	                    self.process_single_document(path, skip_duplicates)
	                )

	                if progress_callback:
	                    progress_callback(completed, total)

	        # Final index rebuild
	        logger.info("Performing final index rebuild...")
	        self._rebuild_final_indices()

	        # Calculate statistics (wall-clock duration of the whole batch)
	        total_processing_time = time.time() - start_time
	        stats = self._calculate_batch_stats(total_processing_time)

	        logger.info(f"Batch processing completed: {stats.successful_documents}/{stats.total_documents} documents, "
	                    f"{stats.total_chunks} chunks in {stats.total_processing_time:.2f}s")

	        return stats

	    def _rebuild_final_indices(self):
	        """
	        Rebuild all indices after batch processing is complete.

	        Calls whichever of ``_build_dense_index``, ``_build_sparse_index``
	        and ``_build_vocabulary_index`` the RAG system provides. Errors are
	        logged and not raised.
	        """
	        try:
	            logger.info("Rebuilding final search indices...")

	            # Rebuild dense embeddings index
	            if hasattr(self.rag_system, '_build_dense_index'):
	                self.rag_system._build_dense_index()

	            # Rebuild sparse BM25 index
	            if hasattr(self.rag_system, '_build_sparse_index'):
	                self.rag_system._build_sparse_index()

	            # Rebuild vocabulary index
	            if hasattr(self.rag_system, '_build_vocabulary_index'):
	                self.rag_system._build_vocabulary_index()

	            logger.info(f"Final indices rebuilt for {len(self.rag_system.chunks)} total chunks")

	        except Exception as e:
	            logger.error(f"Failed to rebuild final indices: {e}")

	    def _calculate_batch_stats(self, total_processing_time: float) -> "BatchProcessingStats":
	        """
	        Calculate comprehensive batch processing statistics.

	        Args:
	            total_processing_time: Duration in seconds used for the
	                throughput figure.

	        Returns:
	            BatchProcessingStats built from ``self.processing_results``.
	            Averages are taken over successful documents and are 0 when
	            there are none; throughput is 0 when the duration is not
	            positive.
	        """
	        successful_results = [r for r in self.processing_results if r.success]
	        failed_results = [r for r in self.processing_results if not r.success]

	        total_chunks = sum(r.chunks_created for r in successful_results)
	        total_size_mb = sum(r.file_size_mb for r in self.processing_results)

	        if successful_results:
	            avg_chunks_per_doc = total_chunks / len(successful_results)
	            avg_processing_time = (
	                sum(r.processing_time for r in successful_results) / len(successful_results)
	            )
	        else:
	            avg_chunks_per_doc = 0
	            avg_processing_time = 0

	        if total_processing_time > 0:
	            documents_per_minute = (len(self.processing_results) / total_processing_time) * 60
	        else:
	            documents_per_minute = 0

	        return BatchProcessingStats(
	            total_documents=len(self.processing_results),
	            successful_documents=len(successful_results),
	            failed_documents=len(failed_results),
	            total_chunks=total_chunks,
	            total_processing_time=total_processing_time,
	            total_size_mb=total_size_mb,
	            avg_chunks_per_doc=avg_chunks_per_doc,
	            avg_processing_time_per_doc=avg_processing_time,
	            documents_per_minute=documents_per_minute
	        )

	    def get_processing_report(self) -> Dict[str, Any]:
	        """
	        Generate comprehensive processing report.

	        Returns:
	            ``{"error": ...}`` when no results are available; otherwise a
	            dict with "summary", "efficiency", "document_details",
	            "failed_documents" and "system_status" sections. The total time
	            used here is the sum of per-document processing times, not the
	            wall-clock duration of the batch.
	        """
	        if not self.processing_results:
	            return {"error": "No processing results available"}

	        stats = self._calculate_batch_stats(
	            sum(r.processing_time for r in self.processing_results)
	        )

	        # Detailed results by document
	        document_details = [
	            {
	                "filename": result.filename,
	                "success": result.success,
	                "chunks_created": result.chunks_created,
	                "processing_time": f"{result.processing_time:.2f}s",
	                "file_size_mb": f"{result.file_size_mb:.2f}MB",
	                "error": result.error_message,
	                "chunks_per_mb": result.chunks_created / result.file_size_mb if result.file_size_mb > 0 else 0
	            }
	            for result in self.processing_results
	        ]

	        # Processing efficiency metrics
	        successful_results = [r for r in self.processing_results if r.success]
	        processing_rates = [r.chunks_created / r.processing_time for r in successful_results if r.processing_time > 0]

	        # The summed time can be zero (for example when every result was
	        # recorded with processing_time 0.0), so guard the divisions below.
	        total_time = stats.total_processing_time
	        chunks_per_second = stats.total_chunks / total_time if total_time > 0 else 0.0
	        mb_per_second = stats.total_size_mb / total_time if total_time > 0 else 0.0

	        return {
	            "summary": {
	                "total_documents": stats.total_documents,
	                "successful_documents": stats.successful_documents,
	                "failed_documents": stats.failed_documents,
	                "success_rate": f"{(stats.successful_documents / stats.total_documents * 100):.1f}%",
	                "total_chunks": stats.total_chunks,
	                "total_size_processed": f"{stats.total_size_mb:.2f}MB",
	                "total_processing_time": f"{stats.total_processing_time:.2f}s",
	                "documents_per_minute": f"{stats.documents_per_minute:.1f}",
	                "avg_chunks_per_document": f"{stats.avg_chunks_per_doc:.1f}",
	                "avg_processing_time_per_document": f"{stats.avg_processing_time_per_doc:.2f}s"
	            },
	            "efficiency": {
	                "chunks_per_second": f"{chunks_per_second:.1f}",
	                "mb_per_second": f"{mb_per_second:.2f}",
	                "avg_chunks_per_second_per_doc": f"{sum(processing_rates) / len(processing_rates):.1f}" if processing_rates else "0"
	            },
	            "document_details": document_details,
	            "failed_documents": [
	                {"filename": r.filename, "error": r.error_message}
	                for r in self.processing_results if not r.success
	            ],
	            "system_status": {
	                "total_chunks_in_system": len(self.rag_system.chunks),
	                "unique_sources": len(self.document_metadata),
	                "memory_usage_estimate": f"{len(self.rag_system.chunks) * 2:.0f}MB"  # Rough estimate
	            }
	        }

	    def save_processing_report(self, output_path: str):
	        """
	        Save processing report to JSON file.

	        The report from ``get_processing_report`` is extended with a
	        "generated_at" timestamp and the processor configuration.

	        Args:
	            output_path: Destination file; overwritten if it exists.
	        """
	        report = self.get_processing_report()
	        report["generated_at"] = datetime.now().isoformat()
	        report["processor_config"] = {
	            "max_workers": self.max_workers,
	            "memory_limit_mb": self.memory_limit_mb,
	            "chunk_batch_size": self.chunk_batch_size
	        }

	        # Explicit encoding so the output does not depend on the platform
	        # default.
	        with open(output_path, 'w', encoding='utf-8') as f:
	            json.dump(report, f, indent=2)

	        logger.info(f"Processing report saved to {output_path}")


	def process_documents_from_directory(
	    directory_path: str,
	    file_patterns: Optional[List[str]] = None,
	    max_workers: int = 4,
	    output_report: Optional[str] = None
	) -> "Tuple[RAGWithGeneration, BatchProcessingStats]":
	    """
	    Convenience function to process all documents in a directory.

	    Args:
	        directory_path: Path to directory containing documents
	        file_patterns: Glob patterns to match, e.g. ["*.pdf", "*.txt"].
	            Defaults to ["*.pdf"]. A single pattern string is accepted too.
	        max_workers: Number of parallel workers
	        output_report: Optional path to save processing report

	    Returns:
	        Tuple of (RAG system, processing statistics)

	    Raises:
	        ValueError: If no regular file in the directory matches any pattern.
	    """
	    # Resolve the patterns without a mutable default argument. A bare string
	    # is wrapped so it is not iterated character by character.
	    if file_patterns is None:
	        patterns = ["*.pdf"]
	    elif isinstance(file_patterns, str):
	        patterns = [file_patterns]
	    else:
	        patterns = list(file_patterns)

	    directory = Path(directory_path)

	    # Find all matching documents. The set removes files matched by more
	    # than one pattern, directories are ignored, and sorting makes the
	    # processing order deterministic.
	    document_paths = sorted({
	        candidate
	        for pattern in patterns
	        for candidate in directory.glob(pattern)
	        if candidate.is_file()
	    })

	    if not document_paths:
	        raise ValueError(f"No documents found in {directory_path} matching {patterns}")

	    logger.info(f"Found {len(document_paths)} documents to process")

	    # Initialize processor
	    processor = BatchDocumentProcessor(max_workers=max_workers)

	    # Process documents
	    def progress_callback(completed, total):
	        print(f"Progress: {completed}/{total} documents processed ({completed/total*100:.1f}%)")

	    stats = processor.process_document_collection(
	        document_paths,
	        parallel=True,
	        progress_callback=progress_callback
	    )

	    # Save report if requested
	    if output_report:
	        processor.save_processing_report(output_report)

	    return processor.rag_system, stats


	if __name__ == "__main__":
	    # Manual smoke test: batch-process the PDFs under data/test, print the
	    # resulting statistics, then run one sample query against the system.
	    print("🚀 Testing Batch Document Processing")
	    print("=" * 50)

	    test_dir = Path("data/test")
	    if not test_dir.exists():
	        print(f"❌ Test directory not found: {test_dir}")
	    else:
	        print(f"Processing documents from: {test_dir}")

	        try:
	            # Two workers only: conservative setting for a test run.
	            rag_system, stats = process_documents_from_directory(
	                str(test_dir),
	                file_patterns=["*.pdf"],
	                max_workers=2,
	                output_report="batch_processing_report.json",
	            )

	            print("\n✅ Batch processing completed!")
	            print(f" 📊 Documents: {stats.successful_documents}/{stats.total_documents}")
	            print(f" 📚 Total chunks: {stats.total_chunks}")
	            print(f" ⏱️ Processing time: {stats.total_processing_time:.2f}s")
	            print(f" 🚀 Documents/minute: {stats.documents_per_minute:.1f}")

	            # Query the freshly expanded knowledge base.
	            print("\n🔍 Testing query on expanded knowledge base...")
	            result = rag_system.query_with_answer(
	                question="What is RISC-V and what are its main principles?",
	                top_k=5,
	                use_hybrid=True,
	            )

	            print(f" Confidence: {result['confidence']:.1%}")
	            citations = result['citations']
	            print(f" Citations: {len(citations)}")
	            sources = {citation['source'] for citation in citations}
	            print(f" Sources: {sources}")

	        except Exception as e:
	            print(f"❌ Error during batch processing: {e}")