import sys
import time
from pathlib import Path

import pytest

# Make the shared utilities importable when the tests run from the repo root.
project_root = Path(__file__).parent.parent
sys.path.append(str(project_root))
sys.path.append(str(project_root.parent))

from src.shared_utils.document_processing.pdf_parser import extract_text_with_metadata
from src.shared_utils.document_processing.chunker import chunk_technical_text
from src.shared_utils.embeddings.generator import generate_embeddings

PDF_PATH = Path("project-1-technical-rag/data/test/riscv-base-instructions.pdf")


def test_full_pipeline():
    """Test the complete document processing pipeline."""
    # Step 1: Extract text from the source PDF.
    doc_data = extract_text_with_metadata(PDF_PATH)
    assert len(doc_data["text"]) > 1000

    # Step 2: Chunk the extracted text with overlap.
    chunks = chunk_technical_text(doc_data["text"], chunk_size=512, overlap=50)
    assert len(chunks) > 10  # A full document should produce many chunks.

    # Step 3: Generate one embedding per chunk.
    chunk_texts = [chunk["text"] for chunk in chunks]
    embeddings = generate_embeddings(chunk_texts)
    assert embeddings.shape[0] == len(chunks)
    assert embeddings.shape[1] == 384  # Expected embedding dimensionality.

    print(f"Pipeline processed {len(chunks)} chunks in total")


def test_pipeline_performance():
    """Test end-to-end pipeline latency on a capped number of chunks."""
    start = time.perf_counter()

    # Full pipeline: parse, chunk, then embed the first 100 chunks.
    doc_data = extract_text_with_metadata(PDF_PATH)
    chunks = chunk_technical_text(doc_data["text"])
    chunk_texts = [chunk["text"] for chunk in chunks[:100]]  # Limit for test speed.
    embeddings = generate_embeddings(chunk_texts)
    assert embeddings.shape[0] == len(chunk_texts)

    duration = time.perf_counter() - start
    print(f"Processed {len(chunk_texts)} chunks in {duration:.2f}s")
    assert duration < 10.0  # The capped pipeline should complete quickly.
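

# The two tests above each parse the same PDF from scratch. Below is a minimal
# sketch (an addition, not part of the original suite) of how a module-scoped
# pytest fixture could share that work. The fixture name `parsed_doc` and the
# test that consumes it are illustrative; the skip guard assumes the test PDF
# may be absent in some checkouts.
@pytest.fixture(scope="module")
def parsed_doc():
    """Parse the test PDF once per module and share the result."""
    if not PDF_PATH.exists():
        pytest.skip("test PDF not available at PDF_PATH")
    return extract_text_with_metadata(PDF_PATH)


def test_chunks_carry_text(parsed_doc):
    """Illustrative consumer of the fixture: every chunk exposes a 'text' key,
    matching how the pipeline tests above index into chunk dicts."""
    chunks = chunk_technical_text(parsed_doc["text"], chunk_size=512, overlap=50)
    assert chunks, "chunker should produce at least one chunk"
    assert all("text" in chunk for chunk in chunks)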