gabrielaltay committed on
Commit
6d6af96
·
1 Parent(s): 8b9c625
Files changed (1) hide show
  1. src/legisqa_local/config/settings.py +56 -12
src/legisqa_local/config/settings.py CHANGED
@@ -39,6 +39,13 @@ def get_chroma_config():
39
 
40
  def setup_chromadb():
41
  """Setup ChromaDB - use persistent storage (/data) or download from HF Dataset if needed"""
 
 
 
 
 
 
 
42
  logger.info("=== ChromaDB Setup Starting ===")
43
 
44
  chroma_config = get_chroma_config()
@@ -76,6 +83,9 @@ def setup_chromadb():
76
  logger.info(f"Updated CHROMA_PERSIST_DIRECTORY to: {persistent_chroma_path}")
77
  # Inspect the existing ChromaDB
78
  inspect_chromadb(persistent_chroma_path)
 
 
 
79
  return persistent_chroma_path
80
 
81
  # ChromaDB not found in persistent storage, try to download from HF Dataset
@@ -96,6 +106,9 @@ def setup_chromadb():
96
  logger.info(f"✅ ChromaDB download successful! Updated path to: {persistent_chroma_path}")
97
  # Inspect the downloaded ChromaDB
98
  inspect_chromadb(persistent_chroma_path)
 
 
 
99
  return persistent_chroma_path
100
  else:
101
  logger.error("❌ ChromaDB download from HF Dataset failed!")
@@ -115,6 +128,9 @@ def setup_chromadb():
115
  logger.info(f"✅ ChromaDB found at {chroma_path}")
116
  # Inspect the fallback ChromaDB
117
  inspect_chromadb(chroma_path)
 
 
 
118
  return chroma_path
119
  else:
120
  logger.warning(f"ChromaDB directory exists but is empty: {chroma_path}")
@@ -123,6 +139,9 @@ def setup_chromadb():
123
 
124
  logger.warning(f"⚠️ Using default ChromaDB path: {chroma_path}")
125
  logger.info("=== ChromaDB Setup Complete ===")
 
 
 
126
  return chroma_path
127
 
128
  def download_chromadb_from_hf_dataset(dataset_repo: str, local_path: str) -> bool:
@@ -141,8 +160,9 @@ def download_chromadb_from_hf_dataset(dataset_repo: str, local_path: str) -> boo
141
  logger.info(f"Downloading dataset: {dataset_repo}")
142
  logger.info("This may take several minutes for large datasets...")
143
 
144
- # Download to a temporary location first
145
- temp_download_path = f"{local_path}_temp"
 
146
 
147
  downloaded_path = snapshot_download(
148
  repo_id=dataset_repo,
@@ -161,18 +181,42 @@ def download_chromadb_from_hf_dataset(dataset_repo: str, local_path: str) -> boo
161
  if os.path.exists(chromadb_subdir):
162
  logger.info(f"📁 Found ChromaDB subdirectory: {chromadb_subdir}")
163
 
164
- # Move the ChromaDB files from the subdirectory to the target location
165
- import shutil
166
- if os.path.exists(local_path):
167
- shutil.rmtree(local_path)
168
- shutil.move(chromadb_subdir, local_path)
169
-
170
- # Clean up the temporary download directory
171
- shutil.rmtree(temp_download_path)
172
-
173
- logger.info(f"✅ ChromaDB files moved to: {local_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  else:
175
  logger.error(f"❌ ChromaDB subdirectory not found in downloaded data: {chromadb_subdir}")
 
 
 
 
 
 
 
 
 
 
 
176
  return False
177
 
178
  # Verify the final ChromaDB structure
 
39
 
40
  def setup_chromadb():
41
  """Setup ChromaDB - use persistent storage (/data) or download from HF Dataset if needed"""
42
+ # Use session state to prevent duplicate runs in Streamlit
43
+ import streamlit as st
44
+ if hasattr(st, 'session_state') and getattr(st.session_state, 'chromadb_setup_complete', False):
45
+ logger.info("ChromaDB setup already completed in this session")
46
+ chroma_config = get_chroma_config()
47
+ return chroma_config["persist_directory"]
48
+
49
  logger.info("=== ChromaDB Setup Starting ===")
50
 
51
  chroma_config = get_chroma_config()
 
83
  logger.info(f"Updated CHROMA_PERSIST_DIRECTORY to: {persistent_chroma_path}")
84
  # Inspect the existing ChromaDB
85
  inspect_chromadb(persistent_chroma_path)
86
+ # Mark setup as complete
87
+ if hasattr(st, 'session_state'):
88
+ st.session_state.chromadb_setup_complete = True
89
  return persistent_chroma_path
90
 
91
  # ChromaDB not found in persistent storage, try to download from HF Dataset
 
106
  logger.info(f"✅ ChromaDB download successful! Updated path to: {persistent_chroma_path}")
107
  # Inspect the downloaded ChromaDB
108
  inspect_chromadb(persistent_chroma_path)
109
+ # Mark setup as complete
110
+ if hasattr(st, 'session_state'):
111
+ st.session_state.chromadb_setup_complete = True
112
  return persistent_chroma_path
113
  else:
114
  logger.error("❌ ChromaDB download from HF Dataset failed!")
 
128
  logger.info(f"✅ ChromaDB found at {chroma_path}")
129
  # Inspect the fallback ChromaDB
130
  inspect_chromadb(chroma_path)
131
+ # Mark setup as complete
132
+ if hasattr(st, 'session_state'):
133
+ st.session_state.chromadb_setup_complete = True
134
  return chroma_path
135
  else:
136
  logger.warning(f"ChromaDB directory exists but is empty: {chroma_path}")
 
139
 
140
  logger.warning(f"⚠️ Using default ChromaDB path: {chroma_path}")
141
  logger.info("=== ChromaDB Setup Complete ===")
142
+ # Mark setup as complete even if no ChromaDB found (prevents infinite retries)
143
+ if hasattr(st, 'session_state'):
144
+ st.session_state.chromadb_setup_complete = True
145
  return chroma_path
146
 
147
  def download_chromadb_from_hf_dataset(dataset_repo: str, local_path: str) -> bool:
 
160
  logger.info(f"Downloading dataset: {dataset_repo}")
161
  logger.info("This may take several minutes for large datasets...")
162
 
163
+ # Download to a unique temporary location first to avoid conflicts
164
+ import uuid
165
+ temp_download_path = f"{local_path}_temp_{uuid.uuid4().hex[:8]}"
166
 
167
  downloaded_path = snapshot_download(
168
  repo_id=dataset_repo,
 
181
  if os.path.exists(chromadb_subdir):
182
  logger.info(f"📁 Found ChromaDB subdirectory: {chromadb_subdir}")
183
 
184
+ try:
185
+ # Move the ChromaDB files from the subdirectory to the target location
186
+ import shutil
187
+ if os.path.exists(local_path):
188
+ logger.info(f"Removing existing target directory: {local_path}")
189
+ shutil.rmtree(local_path)
190
+
191
+ logger.info(f"Moving ChromaDB from {chromadb_subdir} to {local_path}")
192
+ shutil.move(chromadb_subdir, local_path)
193
+
194
+ # Clean up the temporary download directory
195
+ logger.info(f"Cleaning up temporary directory: {temp_download_path}")
196
+ shutil.rmtree(temp_download_path)
197
+
198
+ logger.info(f"✅ ChromaDB files moved to: {local_path}")
199
+ except Exception as e:
200
+ logger.error(f"❌ Error moving ChromaDB files: {e}")
201
+ # Clean up temporary directory on error
202
+ try:
203
+ shutil.rmtree(temp_download_path)
204
+ except:
205
+ pass
206
+ return False
207
  else:
208
  logger.error(f"❌ ChromaDB subdirectory not found in downloaded data: {chromadb_subdir}")
209
+ # List what we actually downloaded for debugging
210
+ if os.path.exists(temp_download_path):
211
+ logger.info(f"Contents of {temp_download_path}:")
212
+ for item in os.listdir(temp_download_path):
213
+ logger.info(f" {item}")
214
+ # Clean up temporary directory
215
+ try:
216
+ import shutil
217
+ shutil.rmtree(temp_download_path)
218
+ except:
219
+ pass
220
  return False
221
 
222
  # Verify the final ChromaDB structure