Spaces:
Sleeping
Sleeping
Commit
·
8b9c625
1
Parent(s):
6f2f718
chroma pathing
Browse files
- src/legisqa_local/config/settings.py +41 -9
- uv.lock +2 -51
src/legisqa_local/config/settings.py
CHANGED
|
@@ -141,10 +141,13 @@ def download_chromadb_from_hf_dataset(dataset_repo: str, local_path: str) -> boo
|
|
| 141 |
logger.info(f"Downloading dataset: {dataset_repo}")
|
| 142 |
logger.info("This may take several minutes for large datasets...")
|
| 143 |
|
|
|
|
|
|
|
|
|
|
| 144 |
downloaded_path = snapshot_download(
|
| 145 |
repo_id=dataset_repo,
|
| 146 |
repo_type="dataset",
|
| 147 |
-
local_dir=
|
| 148 |
cache_dir="/tmp/hf_chromadb_cache"
|
| 149 |
# Note: resume_download and local_dir_use_symlinks are now handled automatically
|
| 150 |
)
|
|
@@ -152,7 +155,27 @@ def download_chromadb_from_hf_dataset(dataset_repo: str, local_path: str) -> boo
|
|
| 152 |
logger.info(f"✅ ChromaDB download from HF Dataset complete!")
|
| 153 |
logger.info(f"Downloaded to: {downloaded_path}")
|
| 154 |
|
| 155 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
if os.path.exists(local_path) and os.listdir(local_path):
|
| 157 |
file_count = sum(len(files) for _, _, files in os.walk(local_path))
|
| 158 |
total_size = sum(
|
|
@@ -161,13 +184,21 @@ def download_chromadb_from_hf_dataset(dataset_repo: str, local_path: str) -> boo
|
|
| 161 |
for filename in filenames
|
| 162 |
) / (1024 * 1024 * 1024) # Convert to GB
|
| 163 |
|
| 164 |
-
logger.info(f"📊
|
| 165 |
logger.info(f" Files: {file_count}")
|
| 166 |
logger.info(f" Total size: {total_size:.2f} GB")
|
| 167 |
|
| 168 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
else:
|
| 170 |
-
logger.error("❌
|
| 171 |
return False
|
| 172 |
|
| 173 |
except ImportError:
|
|
@@ -212,15 +243,16 @@ def inspect_chromadb(chroma_path: str):
|
|
| 212 |
if count > 0:
|
| 213 |
# Get a sample item
|
| 214 |
logger.info("🔍 Fetching sample items...")
|
| 215 |
-
sample = collection.get(limit=3, include=["documents", "metadatas"
|
| 216 |
|
| 217 |
-
|
|
|
|
| 218 |
|
| 219 |
-
if sample
|
| 220 |
logger.info(f"📄 Sample document (first 200 chars):")
|
| 221 |
logger.info(f" {sample['documents'][0][:200]}...")
|
| 222 |
|
| 223 |
-
if sample
|
| 224 |
logger.info(f"🏷️ Sample metadata:")
|
| 225 |
for i, metadata in enumerate(sample['metadatas'][:2]):
|
| 226 |
logger.info(f" Item {i}: {metadata}")
|
|
|
|
| 141 |
logger.info(f"Downloading dataset: {dataset_repo}")
|
| 142 |
logger.info("This may take several minutes for large datasets...")
|
| 143 |
|
| 144 |
+
# Download to a temporary location first
|
| 145 |
+
temp_download_path = f"{local_path}_temp"
|
| 146 |
+
|
| 147 |
downloaded_path = snapshot_download(
|
| 148 |
repo_id=dataset_repo,
|
| 149 |
repo_type="dataset",
|
| 150 |
+
local_dir=temp_download_path,
|
| 151 |
cache_dir="/tmp/hf_chromadb_cache"
|
| 152 |
# Note: resume_download and local_dir_use_symlinks are now handled automatically
|
| 153 |
)
|
|
|
|
| 155 |
logger.info(f"✅ ChromaDB download from HF Dataset complete!")
|
| 156 |
logger.info(f"Downloaded to: {downloaded_path}")
|
| 157 |
|
| 158 |
+
# The HF dataset contains a 'chromadb' subdirectory with the actual ChromaDB files
|
| 159 |
+
chromadb_subdir = os.path.join(temp_download_path, "chromadb")
|
| 160 |
+
|
| 161 |
+
if os.path.exists(chromadb_subdir):
|
| 162 |
+
logger.info(f"📁 Found ChromaDB subdirectory: {chromadb_subdir}")
|
| 163 |
+
|
| 164 |
+
# Move the ChromaDB files from the subdirectory to the target location
|
| 165 |
+
import shutil
|
| 166 |
+
if os.path.exists(local_path):
|
| 167 |
+
shutil.rmtree(local_path)
|
| 168 |
+
shutil.move(chromadb_subdir, local_path)
|
| 169 |
+
|
| 170 |
+
# Clean up the temporary download directory
|
| 171 |
+
shutil.rmtree(temp_download_path)
|
| 172 |
+
|
| 173 |
+
logger.info(f"✅ ChromaDB files moved to: {local_path}")
|
| 174 |
+
else:
|
| 175 |
+
logger.error(f"❌ ChromaDB subdirectory not found in downloaded data: {chromadb_subdir}")
|
| 176 |
+
return False
|
| 177 |
+
|
| 178 |
+
# Verify the final ChromaDB structure
|
| 179 |
if os.path.exists(local_path) and os.listdir(local_path):
|
| 180 |
file_count = sum(len(files) for _, _, files in os.walk(local_path))
|
| 181 |
total_size = sum(
|
|
|
|
| 184 |
for filename in filenames
|
| 185 |
) / (1024 * 1024 * 1024) # Convert to GB
|
| 186 |
|
| 187 |
+
logger.info(f"📊 ChromaDB verification:")
|
| 188 |
logger.info(f" Files: {file_count}")
|
| 189 |
logger.info(f" Total size: {total_size:.2f} GB")
|
| 190 |
|
| 191 |
+
# Check for key ChromaDB files
|
| 192 |
+
sqlite_file = os.path.join(local_path, "chroma.sqlite3")
|
| 193 |
+
if os.path.exists(sqlite_file):
|
| 194 |
+
sqlite_size = os.path.getsize(sqlite_file) / (1024 * 1024 * 1024)
|
| 195 |
+
logger.info(f" SQLite database: {sqlite_size:.2f} GB")
|
| 196 |
+
return True
|
| 197 |
+
else:
|
| 198 |
+
logger.error("❌ chroma.sqlite3 not found in ChromaDB directory")
|
| 199 |
+
return False
|
| 200 |
else:
|
| 201 |
+
logger.error("❌ ChromaDB directory is empty after processing")
|
| 202 |
return False
|
| 203 |
|
| 204 |
except ImportError:
|
|
|
|
| 243 |
if count > 0:
|
| 244 |
# Get a sample item
|
| 245 |
logger.info("🔍 Fetching sample items...")
|
| 246 |
+
sample = collection.get(limit=3, include=["documents", "metadatas"])
|
| 247 |
|
| 248 |
+
if sample.get('ids'):
|
| 249 |
+
logger.info(f"📝 Sample IDs: {sample['ids']}")
|
| 250 |
|
| 251 |
+
if sample.get('documents'):
|
| 252 |
logger.info(f"📄 Sample document (first 200 chars):")
|
| 253 |
logger.info(f" {sample['documents'][0][:200]}...")
|
| 254 |
|
| 255 |
+
if sample.get('metadatas'):
|
| 256 |
logger.info(f"🏷️ Sample metadata:")
|
| 257 |
for i, metadata in enumerate(sample['metadatas'][:2]):
|
| 258 |
logger.info(f" Item {i}: {metadata}")
|
uv.lock
CHANGED
|
@@ -191,34 +191,6 @@ wheels = [
|
|
| 191 |
{ url = "https://files.pythonhosted.org/packages/10/cb/f2ad4230dc2eb1a74edf38f1a38b9b52277f75bef262d8908e60d957e13c/blinker-1.9.0-py3-none-any.whl", hash = "sha256:ba0efaa9080b619ff2f3459d1d500c57bddea4a6b424b60a91141db6fd2f08bc", size = 8458, upload-time = "2024-11-08T17:25:46.184Z" },
|
| 192 |
]
|
| 193 |
|
| 194 |
-
[[package]]
|
| 195 |
-
name = "boto3"
|
| 196 |
-
version = "1.40.40"
|
| 197 |
-
source = { registry = "https://pypi.org/simple" }
|
| 198 |
-
dependencies = [
|
| 199 |
-
{ name = "botocore" },
|
| 200 |
-
{ name = "jmespath" },
|
| 201 |
-
{ name = "s3transfer" },
|
| 202 |
-
]
|
| 203 |
-
sdist = { url = "https://files.pythonhosted.org/packages/3c/12/1a31b36802d0f33bc6982ab8b7e6437d75ef3c179abe6c53d4d8f7b4248f/boto3-1.40.40.tar.gz", hash = "sha256:f384d3a0410d0f1a4d4ae7aa69c41d0549c6ca5a76667dc25fc97d50ad6db740", size = 111606, upload-time = "2025-09-26T19:23:46.923Z" }
|
| 204 |
-
wheels = [
|
| 205 |
-
{ url = "https://files.pythonhosted.org/packages/90/69/c65566dbdaaea3af0c23f7731ab0f185a38b593fd449d2423374150dbfe0/boto3-1.40.40-py3-none-any.whl", hash = "sha256:385904de68623e1c341bdc095d94a30006843032c912adeb1e0752a343632ec6", size = 139340, upload-time = "2025-09-26T19:23:45.557Z" },
|
| 206 |
-
]
|
| 207 |
-
|
| 208 |
-
[[package]]
|
| 209 |
-
name = "botocore"
|
| 210 |
-
version = "1.40.40"
|
| 211 |
-
source = { registry = "https://pypi.org/simple" }
|
| 212 |
-
dependencies = [
|
| 213 |
-
{ name = "jmespath" },
|
| 214 |
-
{ name = "python-dateutil" },
|
| 215 |
-
{ name = "urllib3" },
|
| 216 |
-
]
|
| 217 |
-
sdist = { url = "https://files.pythonhosted.org/packages/83/5a/43a7fea503ad14fa79819f2b3103a38977fb587a3663d1ac6e958fccf592/botocore-1.40.40.tar.gz", hash = "sha256:78eb121a16a6481ed0f6e1aebe53a4f23aa121f34466846c13a5ca48fa980e31", size = 14363370, upload-time = "2025-09-26T19:23:37.853Z" }
|
| 218 |
-
wheels = [
|
| 219 |
-
{ url = "https://files.pythonhosted.org/packages/ed/5e/3bbf6d34cbf307c1b9e58e0204ceba2d35bbc0c93b4e3b3cc895aae0a5fd/botocore-1.40.40-py3-none-any.whl", hash = "sha256:68506142b3cde93145ef3ee0268f2444f2b68ada225a151f714092bbd3d6516a", size = 14031738, upload-time = "2025-09-26T19:23:35.475Z" },
|
| 220 |
-
]
|
| 221 |
-
|
| 222 |
[[package]]
|
| 223 |
name = "build"
|
| 224 |
version = "1.3.0"
|
|
@@ -845,15 +817,6 @@ wheels = [
|
|
| 845 |
{ url = "https://files.pythonhosted.org/packages/af/22/7ab7b4ec3a1c1f03aef376af11d23b05abcca3fb31fbca1e7557053b1ba2/jiter-0.11.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e2bbf24f16ba5ad4441a9845e40e4ea0cb9eed00e76ba94050664ef53ef4406", size = 347102, upload-time = "2025-09-15T09:20:20.16Z" },
|
| 846 |
]
|
| 847 |
|
| 848 |
-
[[package]]
|
| 849 |
-
name = "jmespath"
|
| 850 |
-
version = "1.0.1"
|
| 851 |
-
source = { registry = "https://pypi.org/simple" }
|
| 852 |
-
sdist = { url = "https://files.pythonhosted.org/packages/00/2a/e867e8531cf3e36b41201936b7fa7ba7b5702dbef42922193f05c8976cd6/jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe", size = 25843, upload-time = "2022-06-17T18:00:12.224Z" }
|
| 853 |
-
wheels = [
|
| 854 |
-
{ url = "https://files.pythonhosted.org/packages/31/b4/b9b800c45527aadd64d5b442f9b932b00648617eb5d63d2c7a6587b7cafc/jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980", size = 20256, upload-time = "2022-06-17T18:00:10.251Z" },
|
| 855 |
-
]
|
| 856 |
-
|
| 857 |
[[package]]
|
| 858 |
name = "joblib"
|
| 859 |
version = "1.5.2"
|
|
@@ -1113,9 +1076,9 @@ name = "legisqa-local"
|
|
| 1113 |
version = "0.1.0"
|
| 1114 |
source = { editable = "." }
|
| 1115 |
dependencies = [
|
| 1116 |
-
{ name = "boto3" },
|
| 1117 |
{ name = "chromadb" },
|
| 1118 |
{ name = "datasets" },
|
|
|
|
| 1119 |
{ name = "langchain" },
|
| 1120 |
{ name = "langchain-anthropic" },
|
| 1121 |
{ name = "langchain-chroma" },
|
|
@@ -1132,9 +1095,9 @@ dependencies = [
|
|
| 1132 |
|
| 1133 |
[package.metadata]
|
| 1134 |
requires-dist = [
|
| 1135 |
-
{ name = "boto3", specifier = ">=1.35.0" },
|
| 1136 |
{ name = "chromadb", specifier = ">=1.1.0" },
|
| 1137 |
{ name = "datasets", specifier = ">=3.0.0" },
|
|
|
|
| 1138 |
{ name = "langchain", specifier = ">=0.3.27" },
|
| 1139 |
{ name = "langchain-anthropic", specifier = ">=0.3.19" },
|
| 1140 |
{ name = "langchain-chroma", specifier = ">=0.1.4" },
|
|
@@ -2390,18 +2353,6 @@ wheels = [
|
|
| 2390 |
{ url = "https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762", size = 34696, upload-time = "2025-04-16T09:51:17.142Z" },
|
| 2391 |
]
|
| 2392 |
|
| 2393 |
-
[[package]]
|
| 2394 |
-
name = "s3transfer"
|
| 2395 |
-
version = "0.14.0"
|
| 2396 |
-
source = { registry = "https://pypi.org/simple" }
|
| 2397 |
-
dependencies = [
|
| 2398 |
-
{ name = "botocore" },
|
| 2399 |
-
]
|
| 2400 |
-
sdist = { url = "https://files.pythonhosted.org/packages/62/74/8d69dcb7a9efe8baa2046891735e5dfe433ad558ae23d9e3c14c633d1d58/s3transfer-0.14.0.tar.gz", hash = "sha256:eff12264e7c8b4985074ccce27a3b38a485bb7f7422cc8046fee9be4983e4125", size = 151547, upload-time = "2025-09-09T19:23:31.089Z" }
|
| 2401 |
-
wheels = [
|
| 2402 |
-
{ url = "https://files.pythonhosted.org/packages/48/f0/ae7ca09223a81a1d890b2557186ea015f6e0502e9b8cb8e1813f1d8cfa4e/s3transfer-0.14.0-py3-none-any.whl", hash = "sha256:ea3b790c7077558ed1f02a3072fb3cb992bbbd253392f4b6e9e8976941c7d456", size = 85712, upload-time = "2025-09-09T19:23:30.041Z" },
|
| 2403 |
-
]
|
| 2404 |
-
|
| 2405 |
[[package]]
|
| 2406 |
name = "safetensors"
|
| 2407 |
version = "0.6.2"
|
|
|
|
| 191 |
{ url = "https://files.pythonhosted.org/packages/10/cb/f2ad4230dc2eb1a74edf38f1a38b9b52277f75bef262d8908e60d957e13c/blinker-1.9.0-py3-none-any.whl", hash = "sha256:ba0efaa9080b619ff2f3459d1d500c57bddea4a6b424b60a91141db6fd2f08bc", size = 8458, upload-time = "2024-11-08T17:25:46.184Z" },
|
| 192 |
]
|
| 193 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
[[package]]
|
| 195 |
name = "build"
|
| 196 |
version = "1.3.0"
|
|
|
|
| 817 |
{ url = "https://files.pythonhosted.org/packages/af/22/7ab7b4ec3a1c1f03aef376af11d23b05abcca3fb31fbca1e7557053b1ba2/jiter-0.11.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e2bbf24f16ba5ad4441a9845e40e4ea0cb9eed00e76ba94050664ef53ef4406", size = 347102, upload-time = "2025-09-15T09:20:20.16Z" },
|
| 818 |
]
|
| 819 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 820 |
[[package]]
|
| 821 |
name = "joblib"
|
| 822 |
version = "1.5.2"
|
|
|
|
| 1076 |
version = "0.1.0"
|
| 1077 |
source = { editable = "." }
|
| 1078 |
dependencies = [
|
|
|
|
| 1079 |
{ name = "chromadb" },
|
| 1080 |
{ name = "datasets" },
|
| 1081 |
+
{ name = "huggingface-hub" },
|
| 1082 |
{ name = "langchain" },
|
| 1083 |
{ name = "langchain-anthropic" },
|
| 1084 |
{ name = "langchain-chroma" },
|
|
|
|
| 1095 |
|
| 1096 |
[package.metadata]
|
| 1097 |
requires-dist = [
|
|
|
|
| 1098 |
{ name = "chromadb", specifier = ">=1.1.0" },
|
| 1099 |
{ name = "datasets", specifier = ">=3.0.0" },
|
| 1100 |
+
{ name = "huggingface-hub", specifier = ">=0.19.0" },
|
| 1101 |
{ name = "langchain", specifier = ">=0.3.27" },
|
| 1102 |
{ name = "langchain-anthropic", specifier = ">=0.3.19" },
|
| 1103 |
{ name = "langchain-chroma", specifier = ">=0.1.4" },
|
|
|
|
| 2353 |
{ url = "https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762", size = 34696, upload-time = "2025-04-16T09:51:17.142Z" },
|
| 2354 |
]
|
| 2355 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2356 |
[[package]]
|
| 2357 |
name = "safetensors"
|
| 2358 |
version = "0.6.2"
|