# Spaces: Running
import pytest | |
import numpy as np | |
import sys | |
from pathlib import Path | |
# Make the project importable: register the directory two levels above this
# file, and that directory's parent, at the end of the module search path.
project_root = Path(__file__).parent.parent
sys.path.extend(str(entry) for entry in (project_root, project_root.parent))
from src.shared_utils.document_processing.pdf_parser import extract_text_with_metadata | |
from src.shared_utils.document_processing.chunker import chunk_technical_text | |
from src.shared_utils.embeddings.generator import generate_embeddings | |
def _run_full_pipeline():
    """Run extraction, chunking and embedding on the RISC-V test PDF.

    Each stage is checked with an assertion before the next one runs.

    Returns:
        A dict with two keys: "chunks" (the value returned by
        chunk_technical_text) and "embeddings" (the value returned by
        generate_embeddings for the chunk texts).

    Raises:
        AssertionError: If any stage produces less output than expected or
            the embedding matrix has an unexpected shape.
    """
    # NOTE: relative path, so it resolves against the current working
    # directory of the test run.
    pdf_path = Path("project-1-technical-rag/data/test/riscv-base-instructions.pdf")

    # Step 1: Extract text
    doc_data = extract_text_with_metadata(pdf_path)
    text_length = len(doc_data["text"])
    assert text_length > 1000, f"extracted text too short: {text_length} characters"

    # Step 2: Chunk text
    chunks = chunk_technical_text(doc_data["text"], chunk_size=512, overlap=50)
    assert len(chunks) > 10, f"expected more than 10 chunks, got {len(chunks)}"

    # Step 3: Generate embeddings (one row per chunk, 384 columns expected)
    chunk_texts = [chunk["text"] for chunk in chunks]
    embeddings = generate_embeddings(chunk_texts)
    assert embeddings.shape[0] == len(chunks), (
        f"expected {len(chunks)} embedding rows, got {embeddings.shape[0]}"
    )
    assert embeddings.shape[1] == 384, (
        f"expected embedding width 384, got {embeddings.shape[1]}"
    )

    return {"chunks": chunks, "embeddings": embeddings}


def test_full_pipeline():
    """Test complete document processing pipeline.

    Delegates to _run_full_pipeline and deliberately returns None: pytest
    flags test functions that return a value. Code that needs the chunks and
    embeddings should call _run_full_pipeline directly.
    """
    result = _run_full_pipeline()
    print(f"Pipeline processed {len(result['chunks'])} chunks in total")
def test_pipeline_performance():
    """Test end-to-end performance.

    Times extraction, chunking (with the chunker's default settings) and
    embedding of at most the first 100 chunks, then requires the whole run to
    finish in under 10 seconds of wall-clock time.

    Raises:
        AssertionError: If no chunks were produced or the run took 10 seconds
            or longer.
    """
    import time

    # NOTE: relative path, so it resolves against the current working
    # directory of the test run.
    pdf_path = Path("project-1-technical-rag/data/test/riscv-base-instructions.pdf")

    start = time.perf_counter()

    # Full pipeline
    doc_data = extract_text_with_metadata(pdf_path)
    chunks = chunk_technical_text(doc_data["text"])
    chunk_texts = [chunk["text"] for chunk in chunks[:100]]  # Limit for test
    # The embeddings themselves are not inspected here; only the time matters.
    generate_embeddings(chunk_texts)

    duration = time.perf_counter() - start

    # Timing an empty workload would make the threshold check meaningless.
    assert chunk_texts, "pipeline produced no chunks to embed"

    print(f"Processed {len(chunk_texts)} chunks in {duration:.2f}s")
    assert duration < 10.0, f"pipeline took {duration:.2f}s, expected under 10.0s"