Spaces:
Sleeping
Sleeping
Commit
·
024288c
1
Parent(s):
6d6af96
update
Browse files- Dockerfile +8 -3
- src/legisqa_local/config/settings.py +55 -36
Dockerfile
CHANGED
|
@@ -24,12 +24,17 @@ ENV HOME=/home/user \
|
|
| 24 |
# Set the working directory to the user's home directory
|
| 25 |
WORKDIR $HOME/app
|
| 26 |
|
| 27 |
-
# Copy
|
| 28 |
COPY --chown=user pyproject.toml uv.lock ./
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
COPY --chown=user src/ ./src/
|
| 30 |
|
| 31 |
-
# Install
|
| 32 |
-
RUN uv
|
| 33 |
|
| 34 |
# Expose port (default 8501, can be overridden)
|
| 35 |
EXPOSE 8501
|
|
|
|
| 24 |
# Set the working directory to the user's home directory
|
| 25 |
WORKDIR $HOME/app
|
| 26 |
|
| 27 |
+
# Copy dependency files first for better caching
|
| 28 |
COPY --chown=user pyproject.toml uv.lock ./
|
| 29 |
+
|
| 30 |
+
# Install dependencies (this layer will be cached when only code changes)
|
| 31 |
+
RUN uv sync --frozen
|
| 32 |
+
|
| 33 |
+
# Copy source code
|
| 34 |
COPY --chown=user src/ ./src/
|
| 35 |
|
| 36 |
+
# Install the local package (this layer will rebuild when code changes)
|
| 37 |
+
RUN uv pip install -e .
|
| 38 |
|
| 39 |
# Expose port (default 8501, can be overridden)
|
| 40 |
EXPOSE 8501
|
src/legisqa_local/config/settings.py
CHANGED
|
@@ -39,13 +39,24 @@ def get_chroma_config():
|
|
| 39 |
|
| 40 |
def setup_chromadb():
|
| 41 |
"""Setup ChromaDB - use persistent storage (/data) or download from HF Dataset if needed"""
|
| 42 |
-
# Use
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
|
|
|
|
|
|
| 46 |
chroma_config = get_chroma_config()
|
| 47 |
return chroma_config["persist_directory"]
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
logger.info("=== ChromaDB Setup Starting ===")
|
| 50 |
|
| 51 |
chroma_config = get_chroma_config()
|
|
@@ -64,29 +75,34 @@ def setup_chromadb():
|
|
| 64 |
if os.path.exists("/data"):
|
| 65 |
logger.info("π HF Spaces persistent storage detected at /data")
|
| 66 |
|
| 67 |
-
#
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
-
#
|
| 79 |
-
if os.path.exists(persistent_chroma_path) and os.listdir(persistent_chroma_path):
|
| 80 |
-
logger.info(f"✅ ChromaDB found in persistent storage: {persistent_chroma_path}")
|
| 81 |
-
# Update environment variable to point to persistent storage
|
| 82 |
-
os.environ["CHROMA_PERSIST_DIRECTORY"] = persistent_chroma_path
|
| 83 |
-
logger.info(f"Updated CHROMA_PERSIST_DIRECTORY to: {persistent_chroma_path}")
|
| 84 |
-
# Inspect the existing ChromaDB
|
| 85 |
-
inspect_chromadb(persistent_chroma_path)
|
| 86 |
-
# Mark setup as complete
|
| 87 |
-
if hasattr(st, 'session_state'):
|
| 88 |
-
st.session_state.chromadb_setup_complete = True
|
| 89 |
-
return persistent_chroma_path
|
| 90 |
|
| 91 |
# ChromaDB not found in persistent storage, try to download from HF Dataset
|
| 92 |
logger.info("ChromaDB not found in persistent storage, checking HF Dataset configuration...")
|
|
@@ -106,9 +122,6 @@ def setup_chromadb():
|
|
| 106 |
logger.info(f"✅ ChromaDB download successful! Updated path to: {persistent_chroma_path}")
|
| 107 |
# Inspect the downloaded ChromaDB
|
| 108 |
inspect_chromadb(persistent_chroma_path)
|
| 109 |
-
# Mark setup as complete
|
| 110 |
-
if hasattr(st, 'session_state'):
|
| 111 |
-
st.session_state.chromadb_setup_complete = True
|
| 112 |
return persistent_chroma_path
|
| 113 |
else:
|
| 114 |
logger.error("❌ ChromaDB download from HF Dataset failed!")
|
|
@@ -128,9 +141,6 @@ def setup_chromadb():
|
|
| 128 |
logger.info(f"✅ ChromaDB found at {chroma_path}")
|
| 129 |
# Inspect the fallback ChromaDB
|
| 130 |
inspect_chromadb(chroma_path)
|
| 131 |
-
# Mark setup as complete
|
| 132 |
-
if hasattr(st, 'session_state'):
|
| 133 |
-
st.session_state.chromadb_setup_complete = True
|
| 134 |
return chroma_path
|
| 135 |
else:
|
| 136 |
logger.warning(f"ChromaDB directory exists but is empty: {chroma_path}")
|
|
@@ -139,9 +149,6 @@ def setup_chromadb():
|
|
| 139 |
|
| 140 |
logger.warning(f"⚠️ Using default ChromaDB path: {chroma_path}")
|
| 141 |
logger.info("=== ChromaDB Setup Complete ===")
|
| 142 |
-
# Mark setup as complete even if no ChromaDB found (prevents infinite retries)
|
| 143 |
-
if hasattr(st, 'session_state'):
|
| 144 |
-
st.session_state.chromadb_setup_complete = True
|
| 145 |
return chroma_path
|
| 146 |
|
| 147 |
def download_chromadb_from_hf_dataset(dataset_repo: str, local_path: str) -> bool:
|
|
@@ -164,11 +171,14 @@ def download_chromadb_from_hf_dataset(dataset_repo: str, local_path: str) -> boo
|
|
| 164 |
import uuid
|
| 165 |
temp_download_path = f"{local_path}_temp_{uuid.uuid4().hex[:8]}"
|
| 166 |
|
|
|
|
|
|
|
|
|
|
| 167 |
downloaded_path = snapshot_download(
|
| 168 |
repo_id=dataset_repo,
|
| 169 |
repo_type="dataset",
|
| 170 |
local_dir=temp_download_path,
|
| 171 |
-
cache_dir=
|
| 172 |
# Note: resume_download and local_dir_use_symlinks are now handled automatically
|
| 173 |
)
|
| 174 |
|
|
@@ -195,6 +205,15 @@ def download_chromadb_from_hf_dataset(dataset_repo: str, local_path: str) -> boo
|
|
| 195 |
logger.info(f"Cleaning up temporary directory: {temp_download_path}")
|
| 196 |
shutil.rmtree(temp_download_path)
|
| 197 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
logger.info(f"✅ ChromaDB files moved to: {local_path}")
|
| 199 |
except Exception as e:
|
| 200 |
logger.error(f"❌ Error moving ChromaDB files: {e}")
|
|
|
|
| 39 |
|
| 40 |
def setup_chromadb():
|
| 41 |
"""Setup ChromaDB - use persistent storage (/data) or download from HF Dataset if needed"""
|
| 42 |
+
# Use a file-based lock to prevent duplicate runs (more reliable than session state in HF Spaces)
|
| 43 |
+
lock_file = "/tmp/chromadb_setup.lock"
|
| 44 |
+
|
| 45 |
+
# Check if setup is already in progress or complete
|
| 46 |
+
if os.path.exists(lock_file):
|
| 47 |
+
logger.info("ChromaDB setup already in progress or complete (lock file exists)")
|
| 48 |
chroma_config = get_chroma_config()
|
| 49 |
return chroma_config["persist_directory"]
|
| 50 |
|
| 51 |
+
# Create lock file
|
| 52 |
+
try:
|
| 53 |
+
with open(lock_file, 'w') as f:
|
| 54 |
+
f.write("ChromaDB setup in progress")
|
| 55 |
+
logger.info("Created ChromaDB setup lock file")
|
| 56 |
+
except Exception as e:
|
| 57 |
+
logger.warning(f"Could not create lock file: {e}")
|
| 58 |
+
# Continue anyway
|
| 59 |
+
|
| 60 |
logger.info("=== ChromaDB Setup Starting ===")
|
| 61 |
|
| 62 |
chroma_config = get_chroma_config()
|
|
|
|
| 75 |
if os.path.exists("/data"):
|
| 76 |
logger.info("π HF Spaces persistent storage detected at /data")
|
| 77 |
|
| 78 |
+
# Always clear /data completely to free up maximum space
|
| 79 |
+
logger.info("🧹 Clearing entire /data directory to free up space...")
|
| 80 |
+
import shutil
|
| 81 |
+
try:
|
| 82 |
+
# Check initial disk space
|
| 83 |
+
statvfs = os.statvfs("/data")
|
| 84 |
+
free_space_gb = (statvfs.f_bavail * statvfs.f_frsize) / (1024 * 1024 * 1024)
|
| 85 |
+
logger.info(f"💾 Initial free space: {free_space_gb:.2f} GB")
|
| 86 |
+
|
| 87 |
+
for item in os.listdir("/data"):
|
| 88 |
+
item_path = os.path.join("/data", item)
|
| 89 |
+
if os.path.isdir(item_path):
|
| 90 |
+
shutil.rmtree(item_path)
|
| 91 |
+
logger.info(f" Removed directory: {item}")
|
| 92 |
+
else:
|
| 93 |
+
os.remove(item_path)
|
| 94 |
+
logger.info(f" Removed file: {item}")
|
| 95 |
+
|
| 96 |
+
# Check free space after cleanup
|
| 97 |
+
statvfs = os.statvfs("/data")
|
| 98 |
+
free_space_gb = (statvfs.f_bavail * statvfs.f_frsize) / (1024 * 1024 * 1024)
|
| 99 |
+
logger.info(f"✅ /data directory cleared successfully")
|
| 100 |
+
logger.info(f"💾 Free space after cleanup: {free_space_gb:.2f} GB")
|
| 101 |
+
except Exception as e:
|
| 102 |
+
logger.error(f"❌ Error clearing /data directory: {e}")
|
| 103 |
+
logger.info("Continuing with download anyway...")
|
| 104 |
|
| 105 |
+
# ChromaDB will always need to be downloaded fresh now
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
|
| 107 |
# ChromaDB not found in persistent storage, try to download from HF Dataset
|
| 108 |
logger.info("ChromaDB not found in persistent storage, checking HF Dataset configuration...")
|
|
|
|
| 122 |
logger.info(f"✅ ChromaDB download successful! Updated path to: {persistent_chroma_path}")
|
| 123 |
# Inspect the downloaded ChromaDB
|
| 124 |
inspect_chromadb(persistent_chroma_path)
|
|
|
|
|
|
|
|
|
|
| 125 |
return persistent_chroma_path
|
| 126 |
else:
|
| 127 |
logger.error("❌ ChromaDB download from HF Dataset failed!")
|
|
|
|
| 141 |
logger.info(f"✅ ChromaDB found at {chroma_path}")
|
| 142 |
# Inspect the fallback ChromaDB
|
| 143 |
inspect_chromadb(chroma_path)
|
|
|
|
|
|
|
|
|
|
| 144 |
return chroma_path
|
| 145 |
else:
|
| 146 |
logger.warning(f"ChromaDB directory exists but is empty: {chroma_path}")
|
|
|
|
| 149 |
|
| 150 |
logger.warning(f"⚠️ Using default ChromaDB path: {chroma_path}")
|
| 151 |
logger.info("=== ChromaDB Setup Complete ===")
|
|
|
|
|
|
|
|
|
|
| 152 |
return chroma_path
|
| 153 |
|
| 154 |
def download_chromadb_from_hf_dataset(dataset_repo: str, local_path: str) -> bool:
|
|
|
|
| 171 |
import uuid
|
| 172 |
temp_download_path = f"{local_path}_temp_{uuid.uuid4().hex[:8]}"
|
| 173 |
|
| 174 |
+
# Use /tmp for cache (we cleared /data completely)
|
| 175 |
+
cache_dir = "/tmp/hf_chromadb_cache"
|
| 176 |
+
|
| 177 |
downloaded_path = snapshot_download(
|
| 178 |
repo_id=dataset_repo,
|
| 179 |
repo_type="dataset",
|
| 180 |
local_dir=temp_download_path,
|
| 181 |
+
cache_dir=cache_dir
|
| 182 |
# Note: resume_download and local_dir_use_symlinks are now handled automatically
|
| 183 |
)
|
| 184 |
|
|
|
|
| 205 |
logger.info(f"Cleaning up temporary directory: {temp_download_path}")
|
| 206 |
shutil.rmtree(temp_download_path)
|
| 207 |
|
| 208 |
+
# Clean up HF cache to save disk space
|
| 209 |
+
if os.path.exists(cache_dir):
|
| 210 |
+
logger.info(f"Cleaning up HF cache directory: {cache_dir}")
|
| 211 |
+
try:
|
| 212 |
+
shutil.rmtree(cache_dir)
|
| 213 |
+
logger.info("✅ HF cache cleaned up successfully")
|
| 214 |
+
except Exception as e:
|
| 215 |
+
logger.warning(f"Could not clean up HF cache: {e}")
|
| 216 |
+
|
| 217 |
logger.info(f"✅ ChromaDB files moved to: {local_path}")
|
| 218 |
except Exception as e:
|
| 219 |
logger.error(f"❌ Error moving ChromaDB files: {e}")
|