File size: 1,872 Bytes
9f5e57c
 
 
 
 
 
 
 
 
 
163ed2c
 
 
9f5e57c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import pytest
import numpy as np
import sys
from pathlib import Path

# Make both the project root and its parent importable so the
# `src.shared_utils` package resolves regardless of pytest's rootdir.
project_root = Path(__file__).parent.parent
sys.path.append(str(project_root))
sys.path.append(str(project_root.parent))

from src.shared_utils.document_processing.pdf_parser import extract_text_with_metadata
from src.shared_utils.document_processing.chunker import chunk_technical_text
from src.shared_utils.embeddings.generator import generate_embeddings


def test_full_pipeline():
    """Test the complete document processing pipeline end to end.

    Runs extract -> chunk -> embed on a known test PDF and checks that
    each stage produces plausibly-sized output. Requires the RISC-V test
    PDF to exist at the hard-coded relative path (run from repo root).
    """
    pdf_path = Path("project-1-technical-rag/data/test/riscv-base-instructions.pdf")

    # Step 1: Extract text — a real spec PDF should yield well over 1k chars.
    doc_data = extract_text_with_metadata(pdf_path)
    assert len(doc_data["text"]) > 1000

    # Step 2: Chunk text
    chunks = chunk_technical_text(doc_data["text"], chunk_size=512, overlap=50)
    assert len(chunks) > 10  # Should produce many chunks

    # Step 3: Generate embeddings — one row per chunk, 384-dim vectors
    # (the expected output width of the configured embedding model).
    chunk_texts = [chunk["text"] for chunk in chunks]
    embeddings = generate_embeddings(chunk_texts)

    assert embeddings.shape[0] == len(chunks)
    assert embeddings.shape[1] == 384

    print(f"Pipeline processed {len(chunks)} chunks in total")
    # NOTE: intentionally no `return` — pytest warns on (and newer versions
    # reject) test functions that return a non-None value.


def test_pipeline_performance():
    """Test end-to-end performance.

    Times the full extract -> chunk -> embed pipeline on the test PDF
    and asserts it finishes within the 10-second budget. The embedding
    stage is capped at the first 100 chunks to keep the test quick.
    """
    import time

    source_pdf = Path("project-1-technical-rag/data/test/riscv-base-instructions.pdf")

    started = time.perf_counter()

    # Same stages as the functional test, measured as a single span.
    parsed = extract_text_with_metadata(source_pdf)
    all_chunks = chunk_technical_text(parsed["text"])
    chunk_texts = [piece["text"] for piece in all_chunks[:100]]  # Limit for test
    embeddings = generate_embeddings(chunk_texts)

    duration = time.perf_counter() - started

    print(f"Processed {len(chunk_texts)} chunks in {duration:.2f}s")
    # Budget chosen loosely; failures here indicate a real regression.
    assert duration < 10.0