Spaces:
Sleeping
Sleeping
Commit
·
6f2f718
1
Parent(s):
b087f3c
more logs
Browse files
src/legisqa_local/config/settings.py
CHANGED
|
@@ -74,6 +74,8 @@ def setup_chromadb():
|
|
| 74 |
# Update environment variable to point to persistent storage
|
| 75 |
os.environ["CHROMA_PERSIST_DIRECTORY"] = persistent_chroma_path
|
| 76 |
logger.info(f"Updated CHROMA_PERSIST_DIRECTORY to: {persistent_chroma_path}")
|
|
|
|
|
|
|
| 77 |
return persistent_chroma_path
|
| 78 |
|
| 79 |
# ChromaDB not found in persistent storage, try to download from HF Dataset
|
|
@@ -92,6 +94,8 @@ def setup_chromadb():
|
|
| 92 |
# Update environment variable to point to persistent storage
|
| 93 |
os.environ["CHROMA_PERSIST_DIRECTORY"] = persistent_chroma_path
|
| 94 |
logger.info(f"β
ChromaDB download successful! Updated path to: {persistent_chroma_path}")
|
|
|
|
|
|
|
| 95 |
return persistent_chroma_path
|
| 96 |
else:
|
| 97 |
logger.error("β ChromaDB download from HF Dataset failed!")
|
|
@@ -109,6 +113,8 @@ def setup_chromadb():
|
|
| 109 |
if os.path.exists(chroma_path):
|
| 110 |
if os.listdir(chroma_path):
|
| 111 |
logger.info(f"β
ChromaDB found at {chroma_path}")
|
|
|
|
|
|
|
| 112 |
return chroma_path
|
| 113 |
else:
|
| 114 |
logger.warning(f"ChromaDB directory exists but is empty: {chroma_path}")
|
|
@@ -172,6 +178,67 @@ def download_chromadb_from_hf_dataset(dataset_repo: str, local_path: str) -> boo
|
|
| 172 |
logger.error(f"Exception type: {type(e).__name__}")
|
| 173 |
return False
|
| 174 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
# Embedding model configuration
|
| 176 |
EMBEDDING_MODEL = "sentence-transformers/static-retrieval-mrl-en-v1"
|
| 177 |
EMBEDDING_DEVICE = "cpu"
|
|
|
|
| 74 |
# Update environment variable to point to persistent storage
|
| 75 |
os.environ["CHROMA_PERSIST_DIRECTORY"] = persistent_chroma_path
|
| 76 |
logger.info(f"Updated CHROMA_PERSIST_DIRECTORY to: {persistent_chroma_path}")
|
| 77 |
+
# Inspect the existing ChromaDB
|
| 78 |
+
inspect_chromadb(persistent_chroma_path)
|
| 79 |
return persistent_chroma_path
|
| 80 |
|
| 81 |
# ChromaDB not found in persistent storage, try to download from HF Dataset
|
|
|
|
| 94 |
# Update environment variable to point to persistent storage
|
| 95 |
os.environ["CHROMA_PERSIST_DIRECTORY"] = persistent_chroma_path
|
| 96 |
logger.info(f"β
ChromaDB download successful! Updated path to: {persistent_chroma_path}")
|
| 97 |
+
# Inspect the downloaded ChromaDB
|
| 98 |
+
inspect_chromadb(persistent_chroma_path)
|
| 99 |
return persistent_chroma_path
|
| 100 |
else:
|
| 101 |
logger.error("β ChromaDB download from HF Dataset failed!")
|
|
|
|
| 113 |
if os.path.exists(chroma_path):
|
| 114 |
if os.listdir(chroma_path):
|
| 115 |
logger.info(f"β
ChromaDB found at {chroma_path}")
|
| 116 |
+
# Inspect the fallback ChromaDB
|
| 117 |
+
inspect_chromadb(chroma_path)
|
| 118 |
return chroma_path
|
| 119 |
else:
|
| 120 |
logger.warning(f"ChromaDB directory exists but is empty: {chroma_path}")
|
|
|
|
| 178 |
logger.error(f"Exception type: {type(e).__name__}")
|
| 179 |
return False
|
| 180 |
|
| 181 |
+
def inspect_chromadb(chroma_path: str) -> None:
    """Log a diagnostic summary of the ChromaDB store at ``chroma_path``.

    Opens the persistent store, lists its collections and, if the collection
    named by ``get_chroma_config()["collection_name"]`` is present, logs its
    item count plus up to three sample records (ids, the first 200 characters
    of the first document, and the metadata of the first two items).

    This is purely diagnostic: every failure is logged and swallowed so a
    broken or missing store never aborts the caller.

    Args:
        chroma_path: Directory holding the persisted ChromaDB data.
    """
    # NOTE(review): the leading "π" in several messages below looks like a
    # mis-decoded emoji; the original code point cannot be recovered from the
    # text available here, so it is kept verbatim. Confirm against the repo.
    logger.info("π === ChromaDB Collection Inspection ===")

    try:
        # Imported lazily so a missing chromadb install is reported through
        # the ImportError handler below instead of failing at module import.
        from chromadb import PersistentClient

        # Get collection name from config
        chroma_config = get_chroma_config()
        collection_name = chroma_config["collection_name"]

        logger.info(f"π Collection name: {collection_name}")
        logger.info(f"π ChromaDB path: {chroma_path}")

        # Create ChromaDB client
        client = PersistentClient(path=chroma_path)
        logger.info("✅ ChromaDB client created successfully")

        # List all collections. Depending on the chromadb release,
        # list_collections() yields Collection objects or bare name strings
        # (0.6.x), so normalise to names once and reuse the result.
        collections = client.list_collections()
        available_names = [
            entry if isinstance(entry, str) else entry.name
            for entry in collections
        ]
        logger.info(f"π Available collections: {available_names}")

        # Get the specific collection
        if collection_name in available_names:
            collection = client.get_collection(name=collection_name)
            logger.info(f"✅ Collection '{collection_name}' found")

            # Get collection count
            count = collection.count()
            logger.info(f"π Collection count: {count} items")

            if count > 0:
                # Get a sample item
                logger.info("π Fetching sample items...")
                # "ids" is not a valid ``include`` value: ids are always part
                # of the result, and listing it makes chromadb reject the call.
                sample = collection.get(limit=3, include=["documents", "metadatas"])

                logger.info(f"π Sample IDs: {sample['ids']}")

                if sample['documents']:
                    logger.info("π Sample document (first 200 chars):")
                    logger.info(f" {sample['documents'][0][:200]}...")

                if sample['metadatas']:
                    logger.info("🏷️ Sample metadata:")
                    for i, metadata in enumerate(sample['metadatas'][:2]):
                        logger.info(f" Item {i}: {metadata}")
            else:
                logger.warning("⚠️ Collection is empty!")

        else:
            logger.error(f"❌ Collection '{collection_name}' not found!")
            logger.error(f"Available collections: {available_names}")

    except ImportError as e:
        logger.error(f"❌ Import error during ChromaDB inspection: {e}")
    except Exception as e:
        # Deliberately broad: inspection is best-effort and must not
        # propagate errors into ChromaDB setup.
        logger.error(f"❌ Error inspecting ChromaDB: {e}")
        logger.error(f"Exception type: {type(e).__name__}")

    logger.info("π === ChromaDB Inspection Complete ===")
|
| 241 |
+
|
| 242 |
# Embedding model configuration
|
| 243 |
EMBEDDING_MODEL = "sentence-transformers/static-retrieval-mrl-en-v1"
|
| 244 |
EMBEDDING_DEVICE = "cpu"
|