gabrielaltay committed on
Commit
c1ea157
·
1 Parent(s): bc2f8ea

pull chroma from s3

Browse files
Dockerfile CHANGED
@@ -6,8 +6,15 @@ RUN apt-get update && apt-get install -y \
6
  build-essential \
7
  curl \
8
  git \
 
9
  && rm -rf /var/lib/apt/lists/*
10
 
 
 
 
 
 
 
11
  # Install uv
12
  COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv
13
 
 
6
  build-essential \
7
  curl \
8
  git \
9
+ unzip \
10
  && rm -rf /var/lib/apt/lists/*
11
 
12
+ # Install AWS CLI
13
+ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" \
14
+ && unzip awscliv2.zip \
15
+ && ./aws/install \
16
+ && rm -rf aws awscliv2.zip
17
+
18
  # Install uv
19
  COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv
20
 
pyproject.toml CHANGED
@@ -5,6 +5,7 @@ description = "Congressional Legislation Query and Analysis Tool"
5
  readme = "README.md"
6
  requires-python = ">=3.13"
7
  dependencies = [
 
8
  "chromadb>=1.1.0",
9
  "datasets>=3.0.0",
10
  "langchain>=0.3.27",
 
5
  readme = "README.md"
6
  requires-python = ">=3.13"
7
  dependencies = [
8
+ "boto3>=1.35.0",
9
  "chromadb>=1.1.0",
10
  "datasets>=3.0.0",
11
  "langchain>=0.3.27",
src/legisqa_local/app.py CHANGED
@@ -1,7 +1,7 @@
1
  """Main Streamlit application for LegisQA"""
2
 
3
  import streamlit as st
4
- from legisqa_local.config.settings import STREAMLIT_CONFIG, setup_environment
5
  from legisqa_local.components.sidebar import render_sidebar
6
  from legisqa_local.tabs.rag_tab import RAGTab
7
  from legisqa_local.tabs.rag_sbs_tab import RAGSideBySideTab
@@ -16,6 +16,9 @@ def main():
16
  # Setup environment
17
  setup_environment()
18
 
 
 
 
19
  # Main content
20
  st.title(":classical_building: LegisQA :classical_building:")
21
  st.header("Query Congressional Bills")
 
1
  """Main Streamlit application for LegisQA"""
2
 
3
  import streamlit as st
4
+ from legisqa_local.config.settings import STREAMLIT_CONFIG, setup_environment, setup_chromadb
5
  from legisqa_local.components.sidebar import render_sidebar
6
  from legisqa_local.tabs.rag_tab import RAGTab
7
  from legisqa_local.tabs.rag_sbs_tab import RAGSideBySideTab
 
16
  # Setup environment
17
  setup_environment()
18
 
19
+ # Setup ChromaDB (download if needed)
20
+ setup_chromadb()
21
+
22
  # Main content
23
  st.title(":classical_building: LegisQA :classical_building:")
24
  st.header("Query Congressional Bills")
src/legisqa_local/config/settings.py CHANGED
@@ -34,6 +34,127 @@ def get_chroma_config():
34
  "collection_name": os.getenv("CHROMA_COLLECTION_NAME", "usc")
35
  }
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  # Embedding model configuration
38
  EMBEDDING_MODEL = "sentence-transformers/static-retrieval-mrl-en-v1"
39
  EMBEDDING_DEVICE = "cpu"
 
34
  "collection_name": os.getenv("CHROMA_COLLECTION_NAME", "usc")
35
  }
36
 
37
def _chroma_dir_has_files(path):
    """Return True when *path* is a readable directory with at least one entry."""
    try:
        # isdir (rather than exists) keeps listdir from raising when the
        # path turns out to be a regular file.
        return os.path.isdir(path) and bool(os.listdir(path))
    except OSError:
        # Unreadable or vanished directory: treat it as "no database here".
        return False


def setup_chromadb():
    """Locate the ChromaDB directory, downloading it from S3 if needed.

    Resolution order:

    1. If ``/data`` exists (commented in this module as HF Spaces persistent
       storage) and ``/data/chromadb`` already holds files, use it.
    2. If ``/data`` exists and both ``CHROMA_S3_BUCKET`` and
       ``CHROMA_S3_PREFIX`` are set, download into ``/data/chromadb`` and
       use it when the download reports success.
    3. Otherwise fall back to the ``persist_directory`` returned by
       ``get_chroma_config()``, whether or not it contains anything.

    When the persistent location is chosen, ``CHROMA_PERSIST_DIRECTORY`` is
    updated in ``os.environ`` to point at it.

    Returns:
        The path of the directory that should be used for ChromaDB.
    """
    chroma_config = get_chroma_config()
    chroma_path = chroma_config["persist_directory"]

    # For HF Spaces with persistent storage, prefer /data directory
    persistent_chroma_path = "/data/chromadb"
    if os.path.exists("/data"):
        print("HF Spaces persistent storage detected at /data")

        # Check if ChromaDB exists in persistent storage
        if _chroma_dir_has_files(persistent_chroma_path):
            print(f"✅ ChromaDB found in persistent storage: {persistent_chroma_path}")
            # Update environment variable to point to persistent storage
            os.environ["CHROMA_PERSIST_DIRECTORY"] = persistent_chroma_path
            return persistent_chroma_path

        # Download from S3 to persistent storage
        s3_bucket = os.getenv("CHROMA_S3_BUCKET", "")
        s3_prefix = os.getenv("CHROMA_S3_PREFIX", "")

        if s3_bucket and s3_prefix:
            print("📥 Downloading ChromaDB from S3 to persistent storage...")
            print(f"   Source: s3://{s3_bucket}/{s3_prefix}")
            print(f"   Target: {persistent_chroma_path}")

            if download_chromadb_from_s3(s3_bucket, s3_prefix, persistent_chroma_path):
                # Update environment variable to point to persistent storage
                os.environ["CHROMA_PERSIST_DIRECTORY"] = persistent_chroma_path
                return persistent_chroma_path
            # A failed download falls through to the configured path below.
        else:
            print("❌ No S3 configuration found (CHROMA_S3_BUCKET, CHROMA_S3_PREFIX)")

    # Fallback: check if ChromaDB exists at configured path (local development)
    if _chroma_dir_has_files(chroma_path):
        print(f"✅ ChromaDB found at {chroma_path}")
        return chroma_path

    print(f"⚠️ Using default ChromaDB path: {chroma_path}")
    return chroma_path
78
+
79
def download_chromadb_from_s3(bucket: str, prefix: str, local_path: str) -> bool:
    """Download a ChromaDB directory from S3 using the AWS CLI.

    Runs ``aws s3 sync s3://<bucket>/<prefix> <local_path> --no-sign-request``
    (unsigned requests, so only publicly readable buckets work). If the
    ``aws`` executable cannot be found, the boto3-based downloader is tried
    instead.

    Args:
        bucket: Name of the S3 bucket. An empty name is rejected.
        prefix: Key prefix inside the bucket to sync from.
        local_path: Target directory; created if it does not exist.

    Returns:
        True when the sync exited successfully and ``local_path`` contains
        at least one entry afterwards, False otherwise. Errors are printed
        rather than raised.
    """
    import subprocess

    if not bucket:
        print("❌ S3 download failed: no bucket name given")
        return False

    # Ensure target directory exists. Handled separately from the CLI call so
    # that a bad target path is not mistaken for a missing ``aws`` executable.
    try:
        os.makedirs(local_path, exist_ok=True)
    except OSError as e:
        print(f"❌ Error downloading from S3: {e}")
        return False

    # Use AWS CLI to sync from S3 (no credentials needed for public buckets)
    s3_url = f"s3://{bucket}/{prefix}"
    cmd = ["aws", "s3", "sync", s3_url, local_path, "--no-sign-request"]

    try:
        print(f"Running: {' '.join(cmd)}")
        result = subprocess.run(cmd, capture_output=True, text=True)
    except FileNotFoundError:
        # Raised by subprocess when the ``aws`` executable is not on PATH.
        print("❌ AWS CLI not found. Trying with boto3...")
        return download_chromadb_from_s3_boto3(bucket, prefix, local_path)
    except Exception as e:
        # Best-effort boundary: the caller only needs a success flag.
        print(f"❌ Error downloading from S3: {e}")
        return False

    if result.returncode != 0:
        print(f"❌ S3 download failed: {result.stderr}")
        return False

    # ``aws s3 sync`` exits 0 even when the prefix matches nothing; do not
    # report an empty directory as a usable database.
    try:
        has_files = bool(os.listdir(local_path))
    except OSError as e:
        print(f"❌ Error downloading from S3: {e}")
        return False
    if not has_files:
        print(f"❌ S3 download failed: no files found at {s3_url}")
        return False

    print("✅ ChromaDB download from S3 complete!")
    return True
108
+
109
def download_chromadb_from_s3_boto3(bucket: str, prefix: str, local_path: str) -> bool:
    """Download a ChromaDB directory from S3 using boto3 (fallback).

    Lists every object under ``prefix`` with an unsigned (anonymous) client
    and downloads each one to the matching relative path under
    ``local_path``.

    Args:
        bucket: Name of the S3 bucket. An empty name is rejected.
        prefix: Key prefix to download. It is treated as a directory, so
            ``"chroma"`` matches ``"chroma/..."`` but not ``"chroma_old/..."``.
        local_path: Target directory; created if it does not exist.

    Returns:
        True when at least one file was downloaded, False when boto3 is not
        installed, nothing matched the prefix, or any error occurred. Errors
        are printed rather than raised.
    """
    if not bucket:
        print("❌ Error downloading from S3 with boto3: no bucket name given")
        return False

    try:
        import boto3
        from botocore import UNSIGNED
        from botocore.config import Config
    except ImportError:
        print("❌ boto3 not available. Please install: pip install boto3")
        return False

    print("📦 Using boto3 for S3 download...")

    # Treat the prefix as a directory so sibling prefixes that merely share
    # the same leading characters are not pulled in and mis-stripped.
    dir_prefix = prefix if not prefix or prefix.endswith("/") else prefix + "/"

    file_count = 0
    try:
        # Create S3 client with no credentials (for public buckets)
        s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))

        # List objects in the S3 prefix
        paginator = s3.get_paginator('list_objects_v2')
        pages = paginator.paginate(Bucket=bucket, Prefix=dir_prefix)

        os.makedirs(local_path, exist_ok=True)
        root = os.path.abspath(local_path)

        for page in pages:
            for obj in page.get('Contents', []):
                key = obj['Key']
                # Get relative path by removing prefix
                relative_path = key[len(dir_prefix):].lstrip('/')

                # Skip the prefix itself and "folder" placeholder objects,
                # whose keys end in "/" and cannot be written as files.
                if not relative_path or relative_path.endswith('/'):
                    continue

                # Keys come from an external bucket: refuse anything that
                # would resolve outside the target directory (e.g. "..").
                local_file_path = os.path.abspath(os.path.join(root, relative_path))
                if os.path.commonpath([root, local_file_path]) != root:
                    print(f"⚠️ Skipping S3 key outside target directory: {key}")
                    continue

                # Create directory if needed
                os.makedirs(os.path.dirname(local_file_path), exist_ok=True)

                # Download file, counting it only once it has succeeded
                s3.download_file(bucket, key, local_file_path)
                file_count += 1
                if file_count % 10 == 0:
                    print(f"Downloaded {file_count} files...")
    except Exception as e:
        # Best-effort boundary: the caller only needs a success flag.
        print(f"❌ Error downloading from S3 with boto3: {e}")
        return False

    if file_count == 0:
        print(f"❌ Error downloading from S3 with boto3: no files found at s3://{bucket}/{dir_prefix}")
        return False

    print(f"✅ ChromaDB download from S3 (boto3) complete! Downloaded {file_count} files.")
    return True
157
+
158
  # Embedding model configuration
159
  EMBEDDING_MODEL = "sentence-transformers/static-retrieval-mrl-en-v1"
160
  EMBEDDING_DEVICE = "cpu"
uv.lock CHANGED
@@ -191,6 +191,34 @@ wheels = [
191
  { url = "https://files.pythonhosted.org/packages/10/cb/f2ad4230dc2eb1a74edf38f1a38b9b52277f75bef262d8908e60d957e13c/blinker-1.9.0-py3-none-any.whl", hash = "sha256:ba0efaa9080b619ff2f3459d1d500c57bddea4a6b424b60a91141db6fd2f08bc", size = 8458, upload-time = "2024-11-08T17:25:46.184Z" },
192
  ]
193
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
  [[package]]
195
  name = "build"
196
  version = "1.3.0"
@@ -817,6 +845,15 @@ wheels = [
817
  { url = "https://files.pythonhosted.org/packages/af/22/7ab7b4ec3a1c1f03aef376af11d23b05abcca3fb31fbca1e7557053b1ba2/jiter-0.11.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e2bbf24f16ba5ad4441a9845e40e4ea0cb9eed00e76ba94050664ef53ef4406", size = 347102, upload-time = "2025-09-15T09:20:20.16Z" },
818
  ]
819
 
 
 
 
 
 
 
 
 
 
820
  [[package]]
821
  name = "joblib"
822
  version = "1.5.2"
@@ -1076,6 +1113,7 @@ name = "legisqa-local"
1076
  version = "0.1.0"
1077
  source = { editable = "." }
1078
  dependencies = [
 
1079
  { name = "chromadb" },
1080
  { name = "datasets" },
1081
  { name = "langchain" },
@@ -1094,6 +1132,7 @@ dependencies = [
1094
 
1095
  [package.metadata]
1096
  requires-dist = [
 
1097
  { name = "chromadb", specifier = ">=1.1.0" },
1098
  { name = "datasets", specifier = ">=3.0.0" },
1099
  { name = "langchain", specifier = ">=0.3.27" },
@@ -2351,6 +2390,18 @@ wheels = [
2351
  { url = "https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762", size = 34696, upload-time = "2025-04-16T09:51:17.142Z" },
2352
  ]
2353
 
 
 
 
 
 
 
 
 
 
 
 
 
2354
  [[package]]
2355
  name = "safetensors"
2356
  version = "0.6.2"
 
191
  { url = "https://files.pythonhosted.org/packages/10/cb/f2ad4230dc2eb1a74edf38f1a38b9b52277f75bef262d8908e60d957e13c/blinker-1.9.0-py3-none-any.whl", hash = "sha256:ba0efaa9080b619ff2f3459d1d500c57bddea4a6b424b60a91141db6fd2f08bc", size = 8458, upload-time = "2024-11-08T17:25:46.184Z" },
192
  ]
193
 
194
+ [[package]]
195
+ name = "boto3"
196
+ version = "1.40.40"
197
+ source = { registry = "https://pypi.org/simple" }
198
+ dependencies = [
199
+ { name = "botocore" },
200
+ { name = "jmespath" },
201
+ { name = "s3transfer" },
202
+ ]
203
+ sdist = { url = "https://files.pythonhosted.org/packages/3c/12/1a31b36802d0f33bc6982ab8b7e6437d75ef3c179abe6c53d4d8f7b4248f/boto3-1.40.40.tar.gz", hash = "sha256:f384d3a0410d0f1a4d4ae7aa69c41d0549c6ca5a76667dc25fc97d50ad6db740", size = 111606, upload-time = "2025-09-26T19:23:46.923Z" }
204
+ wheels = [
205
+ { url = "https://files.pythonhosted.org/packages/90/69/c65566dbdaaea3af0c23f7731ab0f185a38b593fd449d2423374150dbfe0/boto3-1.40.40-py3-none-any.whl", hash = "sha256:385904de68623e1c341bdc095d94a30006843032c912adeb1e0752a343632ec6", size = 139340, upload-time = "2025-09-26T19:23:45.557Z" },
206
+ ]
207
+
208
+ [[package]]
209
+ name = "botocore"
210
+ version = "1.40.40"
211
+ source = { registry = "https://pypi.org/simple" }
212
+ dependencies = [
213
+ { name = "jmespath" },
214
+ { name = "python-dateutil" },
215
+ { name = "urllib3" },
216
+ ]
217
+ sdist = { url = "https://files.pythonhosted.org/packages/83/5a/43a7fea503ad14fa79819f2b3103a38977fb587a3663d1ac6e958fccf592/botocore-1.40.40.tar.gz", hash = "sha256:78eb121a16a6481ed0f6e1aebe53a4f23aa121f34466846c13a5ca48fa980e31", size = 14363370, upload-time = "2025-09-26T19:23:37.853Z" }
218
+ wheels = [
219
+ { url = "https://files.pythonhosted.org/packages/ed/5e/3bbf6d34cbf307c1b9e58e0204ceba2d35bbc0c93b4e3b3cc895aae0a5fd/botocore-1.40.40-py3-none-any.whl", hash = "sha256:68506142b3cde93145ef3ee0268f2444f2b68ada225a151f714092bbd3d6516a", size = 14031738, upload-time = "2025-09-26T19:23:35.475Z" },
220
+ ]
221
+
222
  [[package]]
223
  name = "build"
224
  version = "1.3.0"
 
845
  { url = "https://files.pythonhosted.org/packages/af/22/7ab7b4ec3a1c1f03aef376af11d23b05abcca3fb31fbca1e7557053b1ba2/jiter-0.11.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e2bbf24f16ba5ad4441a9845e40e4ea0cb9eed00e76ba94050664ef53ef4406", size = 347102, upload-time = "2025-09-15T09:20:20.16Z" },
846
  ]
847
 
848
+ [[package]]
849
+ name = "jmespath"
850
+ version = "1.0.1"
851
+ source = { registry = "https://pypi.org/simple" }
852
+ sdist = { url = "https://files.pythonhosted.org/packages/00/2a/e867e8531cf3e36b41201936b7fa7ba7b5702dbef42922193f05c8976cd6/jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe", size = 25843, upload-time = "2022-06-17T18:00:12.224Z" }
853
+ wheels = [
854
+ { url = "https://files.pythonhosted.org/packages/31/b4/b9b800c45527aadd64d5b442f9b932b00648617eb5d63d2c7a6587b7cafc/jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980", size = 20256, upload-time = "2022-06-17T18:00:10.251Z" },
855
+ ]
856
+
857
  [[package]]
858
  name = "joblib"
859
  version = "1.5.2"
 
1113
  version = "0.1.0"
1114
  source = { editable = "." }
1115
  dependencies = [
1116
+ { name = "boto3" },
1117
  { name = "chromadb" },
1118
  { name = "datasets" },
1119
  { name = "langchain" },
 
1132
 
1133
  [package.metadata]
1134
  requires-dist = [
1135
+ { name = "boto3", specifier = ">=1.35.0" },
1136
  { name = "chromadb", specifier = ">=1.1.0" },
1137
  { name = "datasets", specifier = ">=3.0.0" },
1138
  { name = "langchain", specifier = ">=0.3.27" },
 
2390
  { url = "https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762", size = 34696, upload-time = "2025-04-16T09:51:17.142Z" },
2391
  ]
2392
 
2393
+ [[package]]
2394
+ name = "s3transfer"
2395
+ version = "0.14.0"
2396
+ source = { registry = "https://pypi.org/simple" }
2397
+ dependencies = [
2398
+ { name = "botocore" },
2399
+ ]
2400
+ sdist = { url = "https://files.pythonhosted.org/packages/62/74/8d69dcb7a9efe8baa2046891735e5dfe433ad558ae23d9e3c14c633d1d58/s3transfer-0.14.0.tar.gz", hash = "sha256:eff12264e7c8b4985074ccce27a3b38a485bb7f7422cc8046fee9be4983e4125", size = 151547, upload-time = "2025-09-09T19:23:31.089Z" }
2401
+ wheels = [
2402
+ { url = "https://files.pythonhosted.org/packages/48/f0/ae7ca09223a81a1d890b2557186ea015f6e0502e9b8cb8e1813f1d8cfa4e/s3transfer-0.14.0-py3-none-any.whl", hash = "sha256:ea3b790c7077558ed1f02a3072fb3cb992bbbd253392f4b6e9e8976941c7d456", size = 85712, upload-time = "2025-09-09T19:23:30.041Z" },
2403
+ ]
2404
+
2405
  [[package]]
2406
  name = "safetensors"
2407
  version = "0.6.2"