gabrielaltay committed on
Commit
024288c
·
1 Parent(s): 6d6af96
Files changed (2) hide show
  1. Dockerfile +8 -3
  2. src/legisqa_local/config/settings.py +55 -36
Dockerfile CHANGED
@@ -24,12 +24,17 @@ ENV HOME=/home/user \
24
  # Set the working directory to the user's home directory
25
  WORKDIR $HOME/app
26
 
27
- # Copy project files with proper ownership
28
  COPY --chown=user pyproject.toml uv.lock ./
 
 
 
 
 
29
  COPY --chown=user src/ ./src/
30
 
31
- # Install dependencies and the package
32
- RUN uv sync --frozen && uv pip install -e .
33
 
34
  # Expose port (default 8501, can be overridden)
35
  EXPOSE 8501
 
24
  # Set the working directory to the user's home directory
25
  WORKDIR $HOME/app
26
 
27
+ # Copy dependency files first for better caching
28
  COPY --chown=user pyproject.toml uv.lock ./
29
+
30
+ # Install dependencies (this layer will be cached when only code changes)
31
+ RUN uv sync --frozen
32
+
33
+ # Copy source code
34
  COPY --chown=user src/ ./src/
35
 
36
+ # Install the local package (this layer will rebuild when code changes)
37
+ RUN uv pip install -e .
38
 
39
  # Expose port (default 8501, can be overridden)
40
  EXPOSE 8501
src/legisqa_local/config/settings.py CHANGED
@@ -39,13 +39,24 @@ def get_chroma_config():
39
 
40
  def setup_chromadb():
41
  """Setup ChromaDB - use persistent storage (/data) or download from HF Dataset if needed"""
42
- # Use session state to prevent duplicate runs in Streamlit
43
- import streamlit as st
44
- if hasattr(st, 'session_state') and getattr(st.session_state, 'chromadb_setup_complete', False):
45
- logger.info("ChromaDB setup already completed in this session")
 
 
46
  chroma_config = get_chroma_config()
47
  return chroma_config["persist_directory"]
48
 
 
 
 
 
 
 
 
 
 
49
  logger.info("=== ChromaDB Setup Starting ===")
50
 
51
  chroma_config = get_chroma_config()
@@ -64,29 +75,34 @@ def setup_chromadb():
64
  if os.path.exists("/data"):
65
  logger.info("πŸš€ HF Spaces persistent storage detected at /data")
66
 
67
- # Handle force refresh - clear persistent storage
68
- if force_refresh and os.path.exists(persistent_chroma_path):
69
- logger.info(f"πŸ—‘οΈ Clearing existing ChromaDB due to CHROMA_FORCE_REFRESH: {persistent_chroma_path}")
70
- import shutil
71
- try:
72
- shutil.rmtree(persistent_chroma_path)
73
- logger.info("βœ… Existing ChromaDB cleared successfully")
74
- except Exception as e:
75
- logger.error(f"❌ Error clearing ChromaDB: {e}")
76
- logger.info("Continuing with download anyway...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
- # Check if ChromaDB exists in persistent storage (after potential clearing)
79
- if os.path.exists(persistent_chroma_path) and os.listdir(persistent_chroma_path):
80
- logger.info(f"βœ… ChromaDB found in persistent storage: {persistent_chroma_path}")
81
- # Update environment variable to point to persistent storage
82
- os.environ["CHROMA_PERSIST_DIRECTORY"] = persistent_chroma_path
83
- logger.info(f"Updated CHROMA_PERSIST_DIRECTORY to: {persistent_chroma_path}")
84
- # Inspect the existing ChromaDB
85
- inspect_chromadb(persistent_chroma_path)
86
- # Mark setup as complete
87
- if hasattr(st, 'session_state'):
88
- st.session_state.chromadb_setup_complete = True
89
- return persistent_chroma_path
90
 
91
  # ChromaDB not found in persistent storage, try to download from HF Dataset
92
  logger.info("ChromaDB not found in persistent storage, checking HF Dataset configuration...")
@@ -106,9 +122,6 @@ def setup_chromadb():
106
  logger.info(f"βœ… ChromaDB download successful! Updated path to: {persistent_chroma_path}")
107
  # Inspect the downloaded ChromaDB
108
  inspect_chromadb(persistent_chroma_path)
109
- # Mark setup as complete
110
- if hasattr(st, 'session_state'):
111
- st.session_state.chromadb_setup_complete = True
112
  return persistent_chroma_path
113
  else:
114
  logger.error("❌ ChromaDB download from HF Dataset failed!")
@@ -128,9 +141,6 @@ def setup_chromadb():
128
  logger.info(f"βœ… ChromaDB found at {chroma_path}")
129
  # Inspect the fallback ChromaDB
130
  inspect_chromadb(chroma_path)
131
- # Mark setup as complete
132
- if hasattr(st, 'session_state'):
133
- st.session_state.chromadb_setup_complete = True
134
  return chroma_path
135
  else:
136
  logger.warning(f"ChromaDB directory exists but is empty: {chroma_path}")
@@ -139,9 +149,6 @@ def setup_chromadb():
139
 
140
  logger.warning(f"⚠️ Using default ChromaDB path: {chroma_path}")
141
  logger.info("=== ChromaDB Setup Complete ===")
142
- # Mark setup as complete even if no ChromaDB found (prevents infinite retries)
143
- if hasattr(st, 'session_state'):
144
- st.session_state.chromadb_setup_complete = True
145
  return chroma_path
146
 
147
  def download_chromadb_from_hf_dataset(dataset_repo: str, local_path: str) -> bool:
@@ -164,11 +171,14 @@ def download_chromadb_from_hf_dataset(dataset_repo: str, local_path: str) -> boo
164
  import uuid
165
  temp_download_path = f"{local_path}_temp_{uuid.uuid4().hex[:8]}"
166
 
 
 
 
167
  downloaded_path = snapshot_download(
168
  repo_id=dataset_repo,
169
  repo_type="dataset",
170
  local_dir=temp_download_path,
171
- cache_dir="/tmp/hf_chromadb_cache"
172
  # Note: resume_download and local_dir_use_symlinks are now handled automatically
173
  )
174
 
@@ -195,6 +205,15 @@ def download_chromadb_from_hf_dataset(dataset_repo: str, local_path: str) -> boo
195
  logger.info(f"Cleaning up temporary directory: {temp_download_path}")
196
  shutil.rmtree(temp_download_path)
197
 
 
 
 
 
 
 
 
 
 
198
  logger.info(f"βœ… ChromaDB files moved to: {local_path}")
199
  except Exception as e:
200
  logger.error(f"❌ Error moving ChromaDB files: {e}")
 
39
 
40
  def setup_chromadb():
41
  """Setup ChromaDB - use persistent storage (/data) or download from HF Dataset if needed"""
42
+ # Use a file-based lock to prevent duplicate runs (more reliable than session state in HF Spaces)
43
+ lock_file = "/tmp/chromadb_setup.lock"
44
+
45
+ # Check if setup is already in progress or complete
46
+ if os.path.exists(lock_file):
47
+ logger.info("ChromaDB setup already in progress or complete (lock file exists)")
48
  chroma_config = get_chroma_config()
49
  return chroma_config["persist_directory"]
50
 
51
+ # Create lock file
52
+ try:
53
+ with open(lock_file, 'w') as f:
54
+ f.write("ChromaDB setup in progress")
55
+ logger.info("Created ChromaDB setup lock file")
56
+ except Exception as e:
57
+ logger.warning(f"Could not create lock file: {e}")
58
+ # Continue anyway
59
+
60
  logger.info("=== ChromaDB Setup Starting ===")
61
 
62
  chroma_config = get_chroma_config()
 
75
  if os.path.exists("/data"):
76
  logger.info("πŸš€ HF Spaces persistent storage detected at /data")
77
 
78
+ # Always clear /data completely to free up maximum space
79
+ logger.info("🧹 Clearing entire /data directory to free up space...")
80
+ import shutil
81
+ try:
82
+ # Check initial disk space
83
+ statvfs = os.statvfs("/data")
84
+ free_space_gb = (statvfs.f_bavail * statvfs.f_frsize) / (1024 * 1024 * 1024)
85
+ logger.info(f"πŸ’Ύ Initial free space: {free_space_gb:.2f} GB")
86
+
87
+ for item in os.listdir("/data"):
88
+ item_path = os.path.join("/data", item)
89
+ if os.path.isdir(item_path):
90
+ shutil.rmtree(item_path)
91
+ logger.info(f" Removed directory: {item}")
92
+ else:
93
+ os.remove(item_path)
94
+ logger.info(f" Removed file: {item}")
95
+
96
+ # Check free space after cleanup
97
+ statvfs = os.statvfs("/data")
98
+ free_space_gb = (statvfs.f_bavail * statvfs.f_frsize) / (1024 * 1024 * 1024)
99
+ logger.info(f"βœ… /data directory cleared successfully")
100
+ logger.info(f"πŸ’Ύ Free space after cleanup: {free_space_gb:.2f} GB")
101
+ except Exception as e:
102
+ logger.error(f"❌ Error clearing /data directory: {e}")
103
+ logger.info("Continuing with download anyway...")
104
 
105
+ # ChromaDB will always need to be downloaded fresh now
 
 
 
 
 
 
 
 
 
 
 
106
 
107
  # ChromaDB not found in persistent storage, try to download from HF Dataset
108
  logger.info("ChromaDB not found in persistent storage, checking HF Dataset configuration...")
 
122
  logger.info(f"βœ… ChromaDB download successful! Updated path to: {persistent_chroma_path}")
123
  # Inspect the downloaded ChromaDB
124
  inspect_chromadb(persistent_chroma_path)
 
 
 
125
  return persistent_chroma_path
126
  else:
127
  logger.error("❌ ChromaDB download from HF Dataset failed!")
 
141
  logger.info(f"βœ… ChromaDB found at {chroma_path}")
142
  # Inspect the fallback ChromaDB
143
  inspect_chromadb(chroma_path)
 
 
 
144
  return chroma_path
145
  else:
146
  logger.warning(f"ChromaDB directory exists but is empty: {chroma_path}")
 
149
 
150
  logger.warning(f"⚠️ Using default ChromaDB path: {chroma_path}")
151
  logger.info("=== ChromaDB Setup Complete ===")
 
 
 
152
  return chroma_path
153
 
154
  def download_chromadb_from_hf_dataset(dataset_repo: str, local_path: str) -> bool:
 
171
  import uuid
172
  temp_download_path = f"{local_path}_temp_{uuid.uuid4().hex[:8]}"
173
 
174
+ # Use /tmp for cache (we cleared /data completely)
175
+ cache_dir = "/tmp/hf_chromadb_cache"
176
+
177
  downloaded_path = snapshot_download(
178
  repo_id=dataset_repo,
179
  repo_type="dataset",
180
  local_dir=temp_download_path,
181
+ cache_dir=cache_dir
182
  # Note: resume_download and local_dir_use_symlinks are now handled automatically
183
  )
184
 
 
205
  logger.info(f"Cleaning up temporary directory: {temp_download_path}")
206
  shutil.rmtree(temp_download_path)
207
 
208
+ # Clean up HF cache to save disk space
209
+ if os.path.exists(cache_dir):
210
+ logger.info(f"Cleaning up HF cache directory: {cache_dir}")
211
+ try:
212
+ shutil.rmtree(cache_dir)
213
+ logger.info("βœ… HF cache cleaned up successfully")
214
+ except Exception as e:
215
+ logger.warning(f"Could not clean up HF cache: {e}")
216
+
217
  logger.info(f"βœ… ChromaDB files moved to: {local_path}")
218
  except Exception as e:
219
  logger.error(f"❌ Error moving ChromaDB files: {e}")