Spaces:
Sleeping
Sleeping
Commit
·
6d6af96
1
Parent(s):
8b9c625
update
Browse files
src/legisqa_local/config/settings.py
CHANGED
|
@@ -39,6 +39,13 @@ def get_chroma_config():
|
|
| 39 |
|
| 40 |
def setup_chromadb():
|
| 41 |
"""Setup ChromaDB - use persistent storage (/data) or download from HF Dataset if needed"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
logger.info("=== ChromaDB Setup Starting ===")
|
| 43 |
|
| 44 |
chroma_config = get_chroma_config()
|
|
@@ -76,6 +83,9 @@ def setup_chromadb():
|
|
| 76 |
logger.info(f"Updated CHROMA_PERSIST_DIRECTORY to: {persistent_chroma_path}")
|
| 77 |
# Inspect the existing ChromaDB
|
| 78 |
inspect_chromadb(persistent_chroma_path)
|
|
|
|
|
|
|
|
|
|
| 79 |
return persistent_chroma_path
|
| 80 |
|
| 81 |
# ChromaDB not found in persistent storage, try to download from HF Dataset
|
|
@@ -96,6 +106,9 @@ def setup_chromadb():
|
|
| 96 |
logger.info(f"✅ ChromaDB download successful! Updated path to: {persistent_chroma_path}")
|
| 97 |
# Inspect the downloaded ChromaDB
|
| 98 |
inspect_chromadb(persistent_chroma_path)
|
|
|
|
|
|
|
|
|
|
| 99 |
return persistent_chroma_path
|
| 100 |
else:
|
| 101 |
logger.error("❌ ChromaDB download from HF Dataset failed!")
|
|
@@ -115,6 +128,9 @@ def setup_chromadb():
|
|
| 115 |
logger.info(f"✅ ChromaDB found at {chroma_path}")
|
| 116 |
# Inspect the fallback ChromaDB
|
| 117 |
inspect_chromadb(chroma_path)
|
|
|
|
|
|
|
|
|
|
| 118 |
return chroma_path
|
| 119 |
else:
|
| 120 |
logger.warning(f"ChromaDB directory exists but is empty: {chroma_path}")
|
|
@@ -123,6 +139,9 @@ def setup_chromadb():
|
|
| 123 |
|
| 124 |
logger.warning(f"⚠️ Using default ChromaDB path: {chroma_path}")
|
| 125 |
logger.info("=== ChromaDB Setup Complete ===")
|
|
|
|
|
|
|
|
|
|
| 126 |
return chroma_path
|
| 127 |
|
| 128 |
def download_chromadb_from_hf_dataset(dataset_repo: str, local_path: str) -> bool:
|
|
@@ -141,8 +160,9 @@ def download_chromadb_from_hf_dataset(dataset_repo: str, local_path: str) -> boo
|
|
| 141 |
logger.info(f"Downloading dataset: {dataset_repo}")
|
| 142 |
logger.info("This may take several minutes for large datasets...")
|
| 143 |
|
| 144 |
-
# Download to a temporary location first
|
| 145 |
-
|
|
|
|
| 146 |
|
| 147 |
downloaded_path = snapshot_download(
|
| 148 |
repo_id=dataset_repo,
|
|
@@ -161,18 +181,42 @@ def download_chromadb_from_hf_dataset(dataset_repo: str, local_path: str) -> boo
|
|
| 161 |
if os.path.exists(chromadb_subdir):
|
| 162 |
logger.info(f"π Found ChromaDB subdirectory: {chromadb_subdir}")
|
| 163 |
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
else:
|
| 175 |
logger.error(f"❌ ChromaDB subdirectory not found in downloaded data: {chromadb_subdir}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
return False
|
| 177 |
|
| 178 |
# Verify the final ChromaDB structure
|
|
|
|
| 39 |
|
| 40 |
def setup_chromadb():
|
| 41 |
"""Setup ChromaDB - use persistent storage (/data) or download from HF Dataset if needed"""
|
| 42 |
+
# Use session state to prevent duplicate runs in Streamlit
|
| 43 |
+
import streamlit as st
|
| 44 |
+
if hasattr(st, 'session_state') and getattr(st.session_state, 'chromadb_setup_complete', False):
|
| 45 |
+
logger.info("ChromaDB setup already completed in this session")
|
| 46 |
+
chroma_config = get_chroma_config()
|
| 47 |
+
return chroma_config["persist_directory"]
|
| 48 |
+
|
| 49 |
logger.info("=== ChromaDB Setup Starting ===")
|
| 50 |
|
| 51 |
chroma_config = get_chroma_config()
|
|
|
|
| 83 |
logger.info(f"Updated CHROMA_PERSIST_DIRECTORY to: {persistent_chroma_path}")
|
| 84 |
# Inspect the existing ChromaDB
|
| 85 |
inspect_chromadb(persistent_chroma_path)
|
| 86 |
+
# Mark setup as complete
|
| 87 |
+
if hasattr(st, 'session_state'):
|
| 88 |
+
st.session_state.chromadb_setup_complete = True
|
| 89 |
return persistent_chroma_path
|
| 90 |
|
| 91 |
# ChromaDB not found in persistent storage, try to download from HF Dataset
|
|
|
|
| 106 |
logger.info(f"✅ ChromaDB download successful! Updated path to: {persistent_chroma_path}")
|
| 107 |
# Inspect the downloaded ChromaDB
|
| 108 |
inspect_chromadb(persistent_chroma_path)
|
| 109 |
+
# Mark setup as complete
|
| 110 |
+
if hasattr(st, 'session_state'):
|
| 111 |
+
st.session_state.chromadb_setup_complete = True
|
| 112 |
return persistent_chroma_path
|
| 113 |
else:
|
| 114 |
logger.error("❌ ChromaDB download from HF Dataset failed!")
|
|
|
|
| 128 |
logger.info(f"✅ ChromaDB found at {chroma_path}")
|
| 129 |
# Inspect the fallback ChromaDB
|
| 130 |
inspect_chromadb(chroma_path)
|
| 131 |
+
# Mark setup as complete
|
| 132 |
+
if hasattr(st, 'session_state'):
|
| 133 |
+
st.session_state.chromadb_setup_complete = True
|
| 134 |
return chroma_path
|
| 135 |
else:
|
| 136 |
logger.warning(f"ChromaDB directory exists but is empty: {chroma_path}")
|
|
|
|
| 139 |
|
| 140 |
logger.warning(f"⚠️ Using default ChromaDB path: {chroma_path}")
|
| 141 |
logger.info("=== ChromaDB Setup Complete ===")
|
| 142 |
+
# Mark setup as complete even if no ChromaDB found (prevents infinite retries)
|
| 143 |
+
if hasattr(st, 'session_state'):
|
| 144 |
+
st.session_state.chromadb_setup_complete = True
|
| 145 |
return chroma_path
|
| 146 |
|
| 147 |
def download_chromadb_from_hf_dataset(dataset_repo: str, local_path: str) -> bool:
|
|
|
|
| 160 |
logger.info(f"Downloading dataset: {dataset_repo}")
|
| 161 |
logger.info("This may take several minutes for large datasets...")
|
| 162 |
|
| 163 |
+
# Download to a unique temporary location first to avoid conflicts
|
| 164 |
+
import uuid
|
| 165 |
+
temp_download_path = f"{local_path}_temp_{uuid.uuid4().hex[:8]}"
|
| 166 |
|
| 167 |
downloaded_path = snapshot_download(
|
| 168 |
repo_id=dataset_repo,
|
|
|
|
| 181 |
if os.path.exists(chromadb_subdir):
|
| 182 |
logger.info(f"π Found ChromaDB subdirectory: {chromadb_subdir}")
|
| 183 |
|
| 184 |
+
try:
|
| 185 |
+
# Move the ChromaDB files from the subdirectory to the target location
|
| 186 |
+
import shutil
|
| 187 |
+
if os.path.exists(local_path):
|
| 188 |
+
logger.info(f"Removing existing target directory: {local_path}")
|
| 189 |
+
shutil.rmtree(local_path)
|
| 190 |
+
|
| 191 |
+
logger.info(f"Moving ChromaDB from {chromadb_subdir} to {local_path}")
|
| 192 |
+
shutil.move(chromadb_subdir, local_path)
|
| 193 |
+
|
| 194 |
+
# Clean up the temporary download directory
|
| 195 |
+
logger.info(f"Cleaning up temporary directory: {temp_download_path}")
|
| 196 |
+
shutil.rmtree(temp_download_path)
|
| 197 |
+
|
| 198 |
+
logger.info(f"✅ ChromaDB files moved to: {local_path}")
|
| 199 |
+
except Exception as e:
|
| 200 |
+
logger.error(f"❌ Error moving ChromaDB files: {e}")
|
| 201 |
+
# Clean up temporary directory on error
|
| 202 |
+
try:
|
| 203 |
+
shutil.rmtree(temp_download_path)
|
| 204 |
+
except:
|
| 205 |
+
pass
|
| 206 |
+
return False
|
| 207 |
else:
|
| 208 |
logger.error(f"❌ ChromaDB subdirectory not found in downloaded data: {chromadb_subdir}")
|
| 209 |
+
# List what we actually downloaded for debugging
|
| 210 |
+
if os.path.exists(temp_download_path):
|
| 211 |
+
logger.info(f"Contents of {temp_download_path}:")
|
| 212 |
+
for item in os.listdir(temp_download_path):
|
| 213 |
+
logger.info(f" {item}")
|
| 214 |
+
# Clean up temporary directory
|
| 215 |
+
try:
|
| 216 |
+
import shutil
|
| 217 |
+
shutil.rmtree(temp_download_path)
|
| 218 |
+
except:
|
| 219 |
+
pass
|
| 220 |
return False
|
| 221 |
|
| 222 |
# Verify the final ChromaDB structure
|