"""Integration tests for the document processing pipeline (extract -> chunk -> embed)."""
import pytest
import numpy as np
import sys
from pathlib import Path
# Add project paths so `src.shared_utils.*` resolves when running pytest from
# either the repo root or this tests/ directory.
# NOTE(review): mutating sys.path at import time is a common test-suite hack;
# a conftest.py or an installed package would be cleaner — confirm before changing.
project_root = Path(__file__).parent.parent
sys.path.append(str(project_root))
sys.path.append(str(project_root.parent))
from src.shared_utils.document_processing.pdf_parser import extract_text_with_metadata
from src.shared_utils.document_processing.chunker import chunk_technical_text
from src.shared_utils.embeddings.generator import generate_embeddings
def test_full_pipeline():
    """Test the complete document processing pipeline end to end.

    Exercises extract -> chunk -> embed against a real PDF fixture and
    sanity-checks the output of each stage:
      1. PDF extraction yields a non-trivial amount of text.
      2. Chunking produces many chunks at the configured size/overlap.
      3. Embedding returns one vector per chunk with the expected width.
    """
    pdf_path = Path("project-1-technical-rag/data/test/riscv-base-instructions.pdf")

    # Step 1: Extract text
    doc_data = extract_text_with_metadata(pdf_path)
    assert len(doc_data["text"]) > 1000

    # Step 2: Chunk text
    chunks = chunk_technical_text(doc_data["text"], chunk_size=512, overlap=50)
    assert len(chunks) > 10  # Should produce many chunks

    # Step 3: Generate embeddings
    chunk_texts = [chunk["text"] for chunk in chunks]
    embeddings = generate_embeddings(chunk_texts)
    assert embeddings.shape[0] == len(chunks)
    # 384 presumably matches the embedding model's output dim (e.g. MiniLM) —
    # verify against generate_embeddings' configured model.
    assert embeddings.shape[1] == 384
    print(f"Pipeline processed {len(chunks)} chunks in total")
    # Fix: a pytest test must not return a value (PytestReturnNotNoneWarning,
    # an error under strict modes). The previous `return {...}` was unused by
    # pytest; share pipeline artifacts via fixtures instead if needed.
def test_pipeline_performance():
    """Test end-to-end performance.

    Runs extract -> chunk -> embed (embedding a capped sample of chunks)
    and asserts the whole pipeline finishes within a time budget.
    """
    import time

    pdf_path = Path("project-1-technical-rag/data/test/riscv-base-instructions.pdf")
    started_at = time.perf_counter()

    # Full pipeline
    parsed = extract_text_with_metadata(pdf_path)
    all_chunks = chunk_technical_text(parsed["text"])
    chunk_texts = [c["text"] for c in all_chunks[:100]]  # Limit for test
    _embeddings = generate_embeddings(chunk_texts)

    duration = time.perf_counter() - started_at
    print(f"Processed {len(chunk_texts)} chunks in {duration:.2f}s")
    assert duration < 10.0  # Should complete quickly