gabrielaltay committed on
Commit
9b6148b
·
1 Parent(s): eeef8f5
src/legisqa_local/config/settings.py CHANGED
@@ -3,6 +3,8 @@
3
  import os
4
  import logging
5
  import streamlit as st
 
 
6
 
7
  logger = logging.getLogger(__name__)
8
 
@@ -12,6 +14,10 @@ STREAMLIT_CONFIG = {
12
  "page_title": "LegisQA"
13
  }
14
 
 
 
 
 
15
  def get_secret(key: str, default=None):
16
  """Get secret from Streamlit secrets or environment variables"""
17
  try:
@@ -33,253 +39,26 @@ def setup_environment():
33
  def get_chroma_config():
34
  """Get ChromaDB configuration from environment variables"""
35
  return {
36
- "persist_directory": os.getenv("CHROMA_PERSIST_DIRECTORY", "./chromadb"),
37
  "collection_name": os.getenv("CHROMA_COLLECTION_NAME", "usc")
38
  }
39
 
40
- def setup_chromadb():
41
- """Setup ChromaDB - use persistent storage (/data) or download from HF Dataset if needed"""
42
- # Use a file-based lock to prevent duplicate runs (more reliable than session state in HF Spaces)
43
- lock_file = "/tmp/chromadb_setup.lock"
44
-
45
- # Check if setup is already in progress or complete
46
- if os.path.exists(lock_file):
47
- logger.info("ChromaDB setup already in progress or complete (lock file exists)")
48
-
49
- # Even if setup is complete, ensure environment variable is correctly set
50
- # Check if ChromaDB exists at /data/chromadb (HF Spaces persistent storage)
51
- if os.path.exists("/data/chromadb") and os.listdir("/data/chromadb"):
52
- logger.info("πŸ”§ Ensuring CHROMA_PERSIST_DIRECTORY points to /data/chromadb")
53
- os.environ["CHROMA_PERSIST_DIRECTORY"] = "/data/chromadb"
54
-
55
- chroma_config = get_chroma_config()
56
- return chroma_config["persist_directory"]
57
-
58
- # Create lock file
59
- try:
60
- with open(lock_file, 'w') as f:
61
- f.write("ChromaDB setup in progress")
62
- logger.info("Created ChromaDB setup lock file")
63
- except Exception as e:
64
- logger.warning(f"Could not create lock file: {e}")
65
- # Continue anyway
66
-
67
- logger.info("=== ChromaDB Setup Starting ===")
68
-
69
- chroma_config = get_chroma_config()
70
- chroma_path = chroma_config["persist_directory"]
71
- logger.info(f"Initial ChromaDB path: {chroma_path}")
72
-
73
- # Check for force refresh flag
74
- force_refresh = os.getenv("CHROMA_FORCE_REFRESH", "").lower() in ("true", "1", "yes")
75
- if force_refresh:
76
- logger.info("πŸ”„ CHROMA_FORCE_REFRESH enabled - will clear and re-download ChromaDB")
77
-
78
- # For HF Spaces with persistent storage, prefer /data directory
79
- persistent_chroma_path = "/data/chromadb"
80
-
81
- # Check if we're in HF Spaces with persistent storage
82
- if os.path.exists("/data"):
83
- logger.info("πŸš€ HF Spaces persistent storage detected at /data")
84
-
85
- # Always clear /data completely to free up maximum space
86
- logger.info("🧹 Clearing entire /data directory to free up space...")
87
- import shutil
88
- try:
89
- # Check initial disk space
90
- statvfs = os.statvfs("/data")
91
- free_space_gb = (statvfs.f_bavail * statvfs.f_frsize) / (1024 * 1024 * 1024)
92
- logger.info(f"πŸ’Ύ Initial free space: {free_space_gb:.2f} GB")
93
-
94
- for item in os.listdir("/data"):
95
- item_path = os.path.join("/data", item)
96
- if os.path.isdir(item_path):
97
- shutil.rmtree(item_path)
98
- logger.info(f" Removed directory: {item}")
99
- else:
100
- os.remove(item_path)
101
- logger.info(f" Removed file: {item}")
102
-
103
- # Check free space after cleanup
104
- statvfs = os.statvfs("/data")
105
- free_space_gb = (statvfs.f_bavail * statvfs.f_frsize) / (1024 * 1024 * 1024)
106
- logger.info(f"βœ… /data directory cleared successfully")
107
- logger.info(f"πŸ’Ύ Free space after cleanup: {free_space_gb:.2f} GB")
108
- except Exception as e:
109
- logger.error(f"❌ Error clearing /data directory: {e}")
110
- logger.info("Continuing with download anyway...")
111
-
112
- # ChromaDB will always need to be downloaded fresh now
113
-
114
- # ChromaDB not found in persistent storage, try to download from HF Dataset
115
- logger.info("ChromaDB not found in persistent storage, checking HF Dataset configuration...")
116
-
117
- dataset_repo = os.getenv("CHROMA_DATASET_REPO", "hyperdemocracy/usc-chroma-vecs-v1-chunks-v1-s8192-o512-sentence-transformers-static-retrieval-mrl-en-v1")
118
- logger.info(f"HF Dataset repo: {dataset_repo}")
119
-
120
- if dataset_repo:
121
- logger.info(f"πŸ“₯ Downloading ChromaDB from HF Dataset to persistent storage...")
122
- logger.info(f" Source: {dataset_repo}")
123
- logger.info(f" Target: {persistent_chroma_path}")
124
-
125
- success = download_chromadb_from_hf_dataset(dataset_repo, persistent_chroma_path)
126
- if success:
127
- # Update environment variable to point to persistent storage
128
- os.environ["CHROMA_PERSIST_DIRECTORY"] = persistent_chroma_path
129
- logger.info(f"βœ… ChromaDB download successful! Updated path to: {persistent_chroma_path}")
130
- # Inspect the downloaded ChromaDB
131
- inspect_chromadb(persistent_chroma_path)
132
- return persistent_chroma_path
133
- else:
134
- logger.error("❌ ChromaDB download from HF Dataset failed!")
135
- else:
136
- logger.error("❌ No HF Dataset configuration found (CHROMA_DATASET_REPO)")
137
- logger.info("Available environment variables:")
138
- for key, value in os.environ.items():
139
- if "CHROMA" in key:
140
- logger.info(f" {key}={value}")
141
- else:
142
- logger.info("No /data directory found (not in HF Spaces with persistent storage)")
143
-
144
- # Fallback: check if ChromaDB exists at configured path (local development)
145
- logger.info(f"Checking fallback path: {chroma_path}")
146
- if os.path.exists(chroma_path):
147
- if os.listdir(chroma_path):
148
- logger.info(f"βœ… ChromaDB found at {chroma_path}")
149
- # Inspect the fallback ChromaDB
150
- inspect_chromadb(chroma_path)
151
- return chroma_path
152
- else:
153
- logger.warning(f"ChromaDB directory exists but is empty: {chroma_path}")
154
- else:
155
- logger.warning(f"ChromaDB directory does not exist: {chroma_path}")
156
-
157
- logger.warning(f"⚠️ Using default ChromaDB path: {chroma_path}")
158
- logger.info("=== ChromaDB Setup Complete ===")
159
- return chroma_path
160
 
161
- def download_chromadb_from_hf_dataset(dataset_repo: str, local_path: str) -> bool:
162
- """Download ChromaDB from HuggingFace Dataset"""
163
- logger.info(f"Starting HF Dataset download: {dataset_repo} -> {local_path}")
164
-
165
- try:
166
- from huggingface_hub import snapshot_download
167
- import os
168
-
169
- # Ensure target directory exists
170
- logger.info(f"Creating target directory: {local_path}")
171
- os.makedirs(local_path, exist_ok=True)
172
-
173
- # Download the dataset using snapshot_download
174
- logger.info(f"Downloading dataset: {dataset_repo}")
175
- logger.info("This may take several minutes for large datasets...")
176
-
177
- # Download to a unique temporary location first to avoid conflicts
178
- import uuid
179
- temp_download_path = f"{local_path}_temp_{uuid.uuid4().hex[:8]}"
180
-
181
- # Use /tmp for cache (we cleared /data completely)
182
- cache_dir = "/tmp/hf_chromadb_cache"
183
-
184
- downloaded_path = snapshot_download(
185
- repo_id=dataset_repo,
186
- repo_type="dataset",
187
- local_dir=temp_download_path,
188
- cache_dir=cache_dir
189
- # Note: resume_download and local_dir_use_symlinks are now handled automatically
190
- )
191
-
192
- logger.info(f"βœ… ChromaDB download from HF Dataset complete!")
193
- logger.info(f"Downloaded to: {downloaded_path}")
194
-
195
- # The HF dataset contains a 'chromadb' subdirectory with the actual ChromaDB files
196
- chromadb_subdir = os.path.join(temp_download_path, "chromadb")
197
-
198
- if os.path.exists(chromadb_subdir):
199
- logger.info(f"πŸ“ Found ChromaDB subdirectory: {chromadb_subdir}")
200
-
201
- try:
202
- # Move the ChromaDB files from the subdirectory to the target location
203
- import shutil
204
- if os.path.exists(local_path):
205
- logger.info(f"Removing existing target directory: {local_path}")
206
- shutil.rmtree(local_path)
207
-
208
- logger.info(f"Moving ChromaDB from {chromadb_subdir} to {local_path}")
209
- shutil.move(chromadb_subdir, local_path)
210
-
211
- # Clean up the temporary download directory
212
- logger.info(f"Cleaning up temporary directory: {temp_download_path}")
213
- shutil.rmtree(temp_download_path)
214
-
215
- # Clean up HF cache to save disk space
216
- if os.path.exists(cache_dir):
217
- logger.info(f"Cleaning up HF cache directory: {cache_dir}")
218
- try:
219
- shutil.rmtree(cache_dir)
220
- logger.info("βœ… HF cache cleaned up successfully")
221
- except Exception as e:
222
- logger.warning(f"Could not clean up HF cache: {e}")
223
-
224
- logger.info(f"βœ… ChromaDB files moved to: {local_path}")
225
- except Exception as e:
226
- logger.error(f"❌ Error moving ChromaDB files: {e}")
227
- # Clean up temporary directory on error
228
- try:
229
- shutil.rmtree(temp_download_path)
230
- except:
231
- pass
232
- return False
233
- else:
234
- logger.error(f"❌ ChromaDB subdirectory not found in downloaded data: {chromadb_subdir}")
235
- # List what we actually downloaded for debugging
236
- if os.path.exists(temp_download_path):
237
- logger.info(f"Contents of {temp_download_path}:")
238
- for item in os.listdir(temp_download_path):
239
- logger.info(f" {item}")
240
- # Clean up temporary directory
241
- try:
242
- import shutil
243
- shutil.rmtree(temp_download_path)
244
- except:
245
- pass
246
- return False
247
-
248
- # Verify the final ChromaDB structure
249
- if os.path.exists(local_path) and os.listdir(local_path):
250
- file_count = sum(len(files) for _, _, files in os.walk(local_path))
251
- total_size = sum(
252
- os.path.getsize(os.path.join(dirpath, filename))
253
- for dirpath, _, filenames in os.walk(local_path)
254
- for filename in filenames
255
- ) / (1024 * 1024 * 1024) # Convert to GB
256
-
257
- logger.info(f"πŸ“Š ChromaDB verification:")
258
- logger.info(f" Files: {file_count}")
259
- logger.info(f" Total size: {total_size:.2f} GB")
260
-
261
- # Check for key ChromaDB files
262
- sqlite_file = os.path.join(local_path, "chroma.sqlite3")
263
- if os.path.exists(sqlite_file):
264
- sqlite_size = os.path.getsize(sqlite_file) / (1024 * 1024 * 1024)
265
- logger.info(f" SQLite database: {sqlite_size:.2f} GB")
266
- return True
267
- else:
268
- logger.error("❌ chroma.sqlite3 not found in ChromaDB directory")
269
- return False
270
- else:
271
- logger.error("❌ ChromaDB directory is empty after processing")
272
- return False
273
-
274
- except ImportError:
275
- logger.error("❌ huggingface_hub not available. Please install: pip install huggingface_hub")
276
- return False
277
- except Exception as e:
278
- logger.error(f"❌ Error downloading from HF Dataset: {e}")
279
- logger.error(f"Exception type: {type(e).__name__}")
280
- return False
281
 
282
- def inspect_chromadb(chroma_path: str):
 
283
  """Inspect ChromaDB collection to verify it's working correctly"""
284
  logger.info("πŸ” === ChromaDB Collection Inspection ===")
285
 
@@ -291,11 +70,6 @@ def inspect_chromadb(chroma_path: str):
291
  collection_name = chroma_config["collection_name"]
292
 
293
  logger.info(f"πŸ“‹ Collection name: {collection_name}")
294
- logger.info(f"πŸ“ ChromaDB path: {chroma_path}")
295
-
296
- # Create ChromaDB client
297
- client = PersistentClient(path=chroma_path)
298
- logger.info("βœ… ChromaDB client created successfully")
299
 
300
  # List all collections
301
  collections = client.list_collections()
@@ -354,6 +128,4 @@ def inspect_chromadb(chroma_path: str):
354
 
355
  logger.info("πŸ” === ChromaDB Inspection Complete ===")
356
 
357
- # Embedding model configuration
358
- EMBEDDING_MODEL = "sentence-transformers/static-retrieval-mrl-en-v1"
359
- EMBEDDING_DEVICE = "cpu"
 
3
  import os
4
  import logging
5
  import streamlit as st
6
+ import chromadb
7
+ from chromadb.config import Settings, APIVersion
8
 
9
  logger = logging.getLogger(__name__)
10
 
 
14
  "page_title": "LegisQA"
15
  }
16
 
17
+ # Embedding model configuration
18
+ EMBEDDING_MODEL = "sentence-transformers/static-retrieval-mrl-en-v1"
19
+ EMBEDDING_DEVICE = "cpu"
20
+
21
  def get_secret(key: str, default=None):
22
  """Get secret from Streamlit secrets or environment variables"""
23
  try:
 
39
  def get_chroma_config():
40
  """Get ChromaDB configuration from environment variables"""
41
  return {
 
42
  "collection_name": os.getenv("CHROMA_COLLECTION_NAME", "usc")
43
  }
44
 
45
+ def create_chroma_client():
46
+ """Create and return a ChromaDB client"""
47
+ host = get_secret("CHROMA_PROXY_BASE")
48
+ token = get_secret("CHROMA_AUTH_TOKEN")
49
+ if not host or not token:
50
+ raise ValueError("Set CHROMA_PROXY_BASE and CHROMA_AUTH_TOKEN")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
+ headers = {"Authorization": f"Bearer {token}"}
53
+ settings = Settings(
54
+ chroma_server_api_default_path=APIVersion.V2, anonymized_telemetry=False
55
+ )
56
+ client = chromadb.HttpClient(host=host, headers=headers, settings=settings)
57
+
58
+ return client
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
+
61
+ def inspect_chromadb(client: chromadb.HttpClient):
62
  """Inspect ChromaDB collection to verify it's working correctly"""
63
  logger.info("πŸ” === ChromaDB Collection Inspection ===")
64
 
 
70
  collection_name = chroma_config["collection_name"]
71
 
72
  logger.info(f"πŸ“‹ Collection name: {collection_name}")
 
 
 
 
 
73
 
74
  # List all collections
75
  collections = client.list_collections()
 
128
 
129
  logger.info("πŸ” === ChromaDB Inspection Complete ===")
130
 
131
+
 
 
src/legisqa_local/core/embeddings.py CHANGED
@@ -1,14 +1,12 @@
1
  """Embedding functionality for LegisQA"""
2
 
3
- from langchain_huggingface import HuggingFaceEmbeddings
4
  from legisqa_local.config.settings import EMBEDDING_MODEL, EMBEDDING_DEVICE
5
 
6
 
7
  def load_embeddings():
8
  """Load and return the embedding function"""
9
- model_kwargs = {"device": EMBEDDING_DEVICE}
10
- emb_fn = HuggingFaceEmbeddings(
11
- model_name=EMBEDDING_MODEL,
12
- model_kwargs=model_kwargs,
13
  )
14
  return emb_fn
 
1
  """Embedding functionality for LegisQA"""
2
 
3
+ from chromadb.utils import embedding_functions
4
  from legisqa_local.config.settings import EMBEDDING_MODEL, EMBEDDING_DEVICE
5
 
6
 
7
  def load_embeddings():
8
  """Load and return the embedding function"""
9
+ emb_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
10
+ model_name=EMBEDDING_MODEL
 
 
11
  )
12
  return emb_fn
src/legisqa_local/core/vectorstore.py CHANGED
@@ -5,7 +5,7 @@ import os
5
  import streamlit as st
6
  from langchain_chroma import Chroma
7
  from legisqa_local.core.embeddings import load_embeddings
8
- from legisqa_local.config.settings import get_chroma_config
9
 
10
  logger = logging.getLogger(__name__)
11
 
@@ -16,10 +16,12 @@ def load_vectorstore():
16
  config = get_chroma_config()
17
  emb_fn = load_embeddings()
18
 
 
19
  vectorstore = Chroma(
20
- persist_directory=config["persist_directory"],
 
21
  collection_name=config["collection_name"],
22
- embedding_function=emb_fn,
23
  )
24
 
25
  logger.info("βœ… Vectorstore loaded successfully")
 
5
  import streamlit as st
6
  from langchain_chroma import Chroma
7
  from legisqa_local.core.embeddings import load_embeddings
8
+ from legisqa_local.config.settings import get_chroma_config, create_chroma_client
9
 
10
  logger = logging.getLogger(__name__)
11
 
 
16
  config = get_chroma_config()
17
  emb_fn = load_embeddings()
18
 
19
+ client = create_chroma_client()
20
  vectorstore = Chroma(
21
+ client=client,
22
+ # persist_directory=config["persist_directory"],
23
  collection_name=config["collection_name"],
24
+ # embedding_function=emb_fn,
25
  )
26
 
27
  logger.info("βœ… Vectorstore loaded successfully")