File size: 8,846 Bytes
5e1a30c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
"""
Weaviate backend configuration schema.

This module provides configuration classes for the Weaviate backend
adapter, including connection settings, schema definitions, and
search parameters.
"""

from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional


@dataclass
class WeaviateConnectionConfig:
    """Connection settings for reaching a Weaviate instance.

    The values are validated as soon as an instance is created; an
    invalid combination raises ``ValueError`` from ``__post_init__``.
    """

    # Endpoint of the Weaviate server; must not be empty.
    url: str = "http://localhost:8080"
    # Optional API key; None means no key is configured.
    api_key: Optional[str] = None
    # Timeout; must be > 0 (presumably seconds -- confirm with the adapter).
    timeout: int = 30
    # Startup period; must be >= 0 (unit not stated here -- confirm with the adapter).
    startup_period: int = 5
    # Extra headers; every instance gets its own fresh dict.
    additional_headers: Dict[str, str] = field(default_factory=dict)

    def __post_init__(self):
        """Reject invalid connection settings.

        Raises:
            ValueError: if ``url`` is empty, ``timeout`` is not positive,
                or ``startup_period`` is negative.
        """
        # Each rule is a (predicate, message) pair. Predicates are wrapped
        # in lambdas so they run lazily and in order: the first violated
        # rule decides which message is raised.
        rules = (
            (lambda: not self.url, "Weaviate URL cannot be empty"),
            (lambda: self.timeout <= 0, "Timeout must be positive"),
            (lambda: self.startup_period < 0, "Startup period cannot be negative"),
        )
        for is_violated, message in rules:
            if is_violated():
                raise ValueError(message)

@dataclass
class WeaviateSchemaConfig:
    """Schema settings (class name, description, index options, properties).

    The values are validated as soon as an instance is created; see
    ``__post_init__`` for the rules. ``vector_index_config`` and the
    individual property entries are stored as given and are not validated
    by this class.
    """

    # Name of the Weaviate class; must start with an ASCII letter and may
    # contain only ASCII letters, digits and underscores.
    class_name: str = "TechnicalDocument"
    # Free-text description of the class.
    description: str = "Technical documentation chunks with embeddings"
    # Vector index options; a fresh dict is built for every instance.
    vector_index_config: Dict[str, Any] = field(default_factory=lambda: {
        "distance": "cosine",
        "ef": 64,
        "efConstruction": 128,
        "maxConnections": 64
    })
    # Property definitions; a fresh list is built for every instance and
    # it must not be empty.
    properties: List[Dict[str, Any]] = field(default_factory=lambda: [
        {
            "name": "content",
            "dataType": ["text"],
            "description": "The main text content of the document chunk"
        },
        {
            "name": "source_file",
            "dataType": ["text"],
            "description": "Original source file path"
        },
        {
            "name": "chunk_index",
            "dataType": ["int"],
            "description": "Index of this chunk within the source document"
        },
        {
            "name": "page_number",
            "dataType": ["int"],
            "description": "Page number in the original document"
        },
        {
            "name": "chunk_size",
            "dataType": ["int"],
            "description": "Size of the chunk in characters"
        },
        {
            "name": "created_at",
            "dataType": ["date"],
            "description": "When this chunk was processed"
        }
    ])

    def __post_init__(self):
        """Reject invalid schema settings.

        Raises:
            ValueError: if ``class_name`` is empty, if it does not start
                with an ASCII letter or contains anything other than ASCII
                letters, digits and underscores, or if ``properties`` is
                empty.
        """
        name = self.class_name
        if not name:
            raise ValueError("Class name cannot be empty")
        # A bare ``str.isalnum()`` is the wrong rule for Weaviate class
        # names: it refuses underscores (which Weaviate allows) and lets
        # through non-ASCII alphanumerics and digit-first names (which
        # Weaviate refuses). Underscores are stripped before the
        # alphanumeric test so they are permitted anywhere but first.
        is_valid_name = (
            name.isascii()
            and name[0].isalpha()
            and name.replace("_", "").isalnum()
        )
        if not is_valid_name:
            # Message keeps its original prefix so callers that match on
            # it continue to work.
            raise ValueError(
                "Class name must be alphanumeric (ASCII letters, digits and "
                "underscores, starting with a letter)"
            )
        if not self.properties:
            raise ValueError("Properties list cannot be empty")

@dataclass
class WeaviateSearchConfig:
    """Search settings used for Weaviate queries.

    Numeric ranges are validated on construction (see ``__post_init__``).
    ``hybrid_search_enabled``, ``fusion_type`` and ``autocut`` are stored
    as given and are not validated by this class.
    """

    hybrid_search_enabled: bool = True
    # Balance between vector and keyword search (0=keyword, 1=vector);
    # must lie in [0, 1].
    alpha: float = 0.7
    # "rankedFusion" or "relativeScoreFusion".
    fusion_type: str = "rankedFusion"
    # Must be > 0.
    limit: int = 100
    # Must be >= 0.
    offset: int = 0
    # Enable autocut.
    autocut: int = 1
    # Must lie in [0, 1].
    certainty_threshold: float = 0.7
    # Optional; when set it must be >= 0.
    distance_threshold: Optional[float] = None

    def __post_init__(self):
        """Reject invalid search settings.

        Raises:
            ValueError: if ``alpha`` or ``certainty_threshold`` is outside
                [0, 1], ``limit`` is not positive, ``offset`` is negative,
                or ``distance_threshold`` is set and negative.
        """
        def violations():
            # Generator: each check only runs once the previous ones have
            # passed, so the first message produced is the one raised.
            if not 0 <= self.alpha <= 1:
                yield "Alpha must be between 0 and 1"
            if self.limit <= 0:
                yield "Limit must be positive"
            if self.offset < 0:
                yield "Offset cannot be negative"
            if not 0 <= self.certainty_threshold <= 1:
                yield "Certainty threshold must be between 0 and 1"
            max_distance = self.distance_threshold
            if max_distance is not None and max_distance < 0:
                yield "Distance threshold cannot be negative"

        first_problem = next(violations(), None)
        if first_problem is not None:
            raise ValueError(first_problem)

@dataclass
class WeaviateBatchConfig:
    """Batch-operation settings for Weaviate.

    Sizes, worker count and retry counts are validated on construction
    (see ``__post_init__``). ``callback_period`` and
    ``dynamic_batch_size`` are stored as given and are not validated by
    this class.
    """

    # Must be > 0.
    batch_size: int = 100
    # Must be > 0.
    num_workers: int = 1
    # Must be >= 0.
    connection_error_retries: int = 3
    # Must be >= 0.
    timeout_retries: int = 3
    callback_period: int = 1000
    dynamic_batch_size: bool = True
    # Must be > 0.
    min_batch_size: int = 10
    # Must be >= min_batch_size.
    max_batch_size: int = 1000

    def __post_init__(self):
        """Reject invalid batch settings.

        Raises:
            ValueError: if ``batch_size``, ``num_workers`` or
                ``min_batch_size`` is not positive, if either retry count
                is negative, or if ``max_batch_size`` is smaller than
                ``min_batch_size``.
        """
        # One chain, one raise site: the first branch that matches picks
        # the message; falling through to ``else`` means all checks passed.
        if self.batch_size <= 0:
            problem = "Batch size must be positive"
        elif self.num_workers <= 0:
            problem = "Number of workers must be positive"
        elif self.connection_error_retries < 0:
            problem = "Connection error retries cannot be negative"
        elif self.timeout_retries < 0:
            problem = "Timeout retries cannot be negative"
        elif self.min_batch_size <= 0:
            problem = "Min batch size must be positive"
        elif self.max_batch_size < self.min_batch_size:
            problem = "Max batch size must be >= min batch size"
        else:
            return
        raise ValueError(problem)

@dataclass
class WeaviateBackendConfig:
    """Complete configuration for the Weaviate backend.

    Groups the four section configs (connection, schema, search, batch)
    with a handful of backend-level settings, and converts to and from a
    plain nested dictionary via ``to_dict`` / ``from_dict``.
    """

    # Section configs; each defaults to a freshly constructed instance.
    connection: WeaviateConnectionConfig = field(default_factory=WeaviateConnectionConfig)
    schema: WeaviateSchemaConfig = field(default_factory=WeaviateSchemaConfig)
    search: WeaviateSearchConfig = field(default_factory=WeaviateSearchConfig)
    batch: WeaviateBatchConfig = field(default_factory=WeaviateBatchConfig)

    # Backend-specific settings
    auto_create_schema: bool = True
    enable_backup: bool = True
    # Must be > 0.
    backup_interval_hours: int = 24
    # Must be >= 0.
    max_retries: int = 3
    # Must be >= 0.
    retry_delay_seconds: float = 1.0

    def __post_init__(self):
        """Reject invalid backend-level settings.

        Raises:
            ValueError: if ``max_retries`` or ``retry_delay_seconds`` is
                negative, or ``backup_interval_hours`` is not positive.
        """
        if self.max_retries < 0:
            raise ValueError("Max retries cannot be negative")
        if self.retry_delay_seconds < 0:
            raise ValueError("Retry delay cannot be negative")
        if self.backup_interval_hours <= 0:
            raise ValueError("Backup interval must be positive")

    @classmethod
    def from_dict(cls, config_dict: Dict[str, Any]) -> 'WeaviateBackendConfig':
        """Create a configuration from a (possibly partial) dictionary.

        Args:
            config_dict: Mapping with optional ``connection``, ``schema``,
                ``search`` and ``batch`` entries plus any backend-level
                settings. A section may be a mapping of that section's
                fields, an already-built section config object, or
                missing/``None`` (in which case defaults are used).

        Returns:
            A new, validated configuration instance.

        Raises:
            ValueError: if any section or backend-level value is invalid.
            TypeError: if a key does not correspond to a field of the
                class it is passed to.
        """
        section_types = {
            'connection': WeaviateConnectionConfig,
            'schema': WeaviateSchemaConfig,
            'search': WeaviateSearchConfig,
            'batch': WeaviateBatchConfig,
        }

        init_kwargs: Dict[str, Any] = {}
        for name, section_type in section_types.items():
            raw_section = config_dict.get(name)
            if raw_section is None:
                # Missing section, or one present but empty (e.g. an empty
                # YAML section loads as None): fall back to the defaults
                # instead of failing on ``**None``.
                init_kwargs[name] = section_type()
            elif isinstance(raw_section, section_type):
                # Already a config object: use it directly.
                init_kwargs[name] = raw_section
            else:
                init_kwargs[name] = section_type(**raw_section)

        # Everything that is not a section is a backend-level setting and
        # is handed to the constructor unchanged.
        for key, value in config_dict.items():
            if key not in section_types:
                init_kwargs[key] = value

        return cls(**init_kwargs)

    def to_dict(self) -> Dict[str, Any]:
        """Convert the configuration to a nested plain dictionary.

        The result has the keys ``connection``, ``schema``, ``search`` and
        ``batch`` (each a dict of that section's fields) followed by the
        backend-level settings.

        Returns:
            An independent snapshot: nested dicts and lists are copies, so
            modifying the returned structure does not alter this
            configuration.
        """
        # ``asdict`` recurses into the nested section dataclasses and
        # copies their dict/list values; building the dict by hand handed
        # out the live ``additional_headers``, ``vector_index_config`` and
        # ``properties`` objects.
        return asdict(self)