File size: 1,473 Bytes
e4d5155
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
"""
Base classes for context chunking components.
"""

from abc import ABC, abstractmethod
from typing import List, Dict, Any, Optional

class Chunk:
    """A piece of text together with its identifiers and metadata.

    Attributes:
        content: The text content of the chunk.
        chunk_id: Identifier for the chunk (uniqueness is not enforced here).
        document_id: ID of the source document, or None when not given.
        metadata: Metadata dict for the chunk. A dict supplied by the caller
            is stored by reference (not copied); a fresh empty dict is
            created only when the caller passes None.
        embedding: Always initialized to None by the constructor; presumably
            populated later by code outside this class (TODO confirm).
    """

    def __init__(
        self,
        content: str,
        chunk_id: str,
        document_id: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ):
        """
        Initialize a chunk.

        Args:
            content: The text content of the chunk
            chunk_id: Identifier for the chunk
            document_id: Optional ID of the source document
            metadata: Optional metadata for the chunk; None means
                "start with an empty dict"
        """
        self.content = content
        self.chunk_id = chunk_id
        self.document_id = document_id
        # Test against None explicitly: `metadata or {}` would discard an
        # empty dict passed by the caller and break the aliasing that
        # non-empty dicts already get.
        self.metadata: Dict[str, Any] = {} if metadata is None else metadata
        self.embedding: Optional[Any] = None

    def __repr__(self) -> str:
        """Return a short debug representation built from the identifiers."""
        return (
            f"{type(self).__name__}("
            f"chunk_id={self.chunk_id!r}, document_id={self.document_id!r})"
        )

class BaseChunker(ABC):
    """Abstract interface implemented by every content chunker."""

    @abstractmethod
    def chunk(
        self,
        content: str,
        metadata: Optional[Dict[str, Any]] = None,
        document_id: Optional[str] = None,
    ) -> List[Chunk]:
        """
        Break ``content`` up into a list of chunks.

        This method is abstract: subclasses have to override it before they
        can be instantiated. The base implementation has no logic of its own
        and returns None if invoked through ``super()``.

        Args:
            content: The content that should be split
            metadata: Metadata to attach to the chunks, if any
            document_id: ID of the originating document to attach to the
                chunks, if any

        Returns:
            chunks: The resulting list of Chunk objects
        """
        ...