#!/usr/bin/env bash
# Knowledge Base Synchronization Script
# Syncs content from multiple GitHub repositories and generates embeddings
# Usage: ./sync-knowledge.sh

set -euo pipefail

# Configuration
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
KNOWLEDGE_DIR="$PROJECT_ROOT/knowledge"
TEMP_DIR="/tmp/kb-sync-$$"

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

log_info() {
    echo -e "${GREEN}[INFO]${NC} $1"
}

log_warn() {
    echo -e "${YELLOW}[WARN]${NC} $1"
}

log_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}

log_debug() {
    echo -e "${BLUE}[DEBUG]${NC} $1"
}

# Cleanup function
cleanup() {
    log_info "Cleaning up temporary files..."
    rm -rf "$TEMP_DIR"
}

# Set trap for cleanup
trap cleanup EXIT

# Check dependencies (python3 is required for the embedding steps below)
check_dependencies() {
    local deps=("git" "curl" "jq" "python3")
    for dep in "${deps[@]}"; do
        if ! command -v "$dep" > /dev/null; then
            log_error "Required dependency not found: $dep"
            exit 1
        fi
    done
}

# Load environment variables
load_env() {
    if [[ -f "$PROJECT_ROOT/.env" ]]; then
        source "$PROJECT_ROOT/.env"
    else
        log_error ".env file not found. Copy .env.example and configure it."
        exit 1
    fi
}
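
# A minimal .env sketch (the variable names are the ones this script reads;
# the values are placeholders, not real endpoints or tokens):
#
#   KB_REPO_N8N=https://github.com/your-org/knowledge.git
#   KB_BRANCH_N8N=main
#   CHROMA_HOST=localhost
#   CHROMA_PORT=8000
#   CHROMA_AUTH_TOKEN=change-me
#   WEBHOOK_URL=https://n8n.example.com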

# Shallow-clone a repository and copy the requested subpath into place
sync_repository() {
    local repo_url="$1"
    local target_path="$2"
    local branch="${3:-main}"
    local subpath="$4"

    log_info "Syncing repository: $repo_url"
    log_debug "Target: $target_path, Branch: $branch, Subpath: $subpath"

    local repo_name
    repo_name="$(basename "$repo_url" .git)"
    local temp_repo_path="$TEMP_DIR/$repo_name"

    # Clone repository to temp directory
    git clone --depth 1 --branch "$branch" "$repo_url" "$temp_repo_path" || {
        log_error "Failed to clone repository: $repo_url"
        return 1
    }

    # Copy specific subpath to target
    local source_path="$temp_repo_path/$subpath"
    if [[ -d "$source_path" ]]; then
        mkdir -p "$(dirname "$target_path")"
        cp -r "$source_path/." "$target_path/"
        log_info "Successfully synced to: $target_path"
    else
        log_warn "Subpath not found: $subpath in $repo_url"
        return 1
    fi
}
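
# Example invocation (hypothetical repository URL):
#   sync_repository "https://github.com/your-org/docs.git" "$KNOWLEDGE_DIR/n8n" "main" "projects/n8n"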

# Generate embeddings for knowledge content
generate_embeddings() {
    local knowledge_path="$1"
    local collection_name="$2"

    log_info "Generating embeddings for: $collection_name"

    # Create Python script for embedding generation (quoted delimiter: no
    # bash substitution happens inside this heredoc)
    cat > "$TEMP_DIR/generate_embeddings.py" << 'EOF'
import hashlib
import json
import sys
from pathlib import Path

from sentence_transformers import SentenceTransformer


def load_model():
    """Load sentence transformer model."""
    try:
        return SentenceTransformer('all-MiniLM-L6-v2')
    except Exception as e:
        print(f"Error loading model: {e}")
        return None


def process_text_files(knowledge_path, collection_name):
    """Process markdown files and generate embeddings."""
    model = load_model()
    if not model:
        return False

    embeddings_data = []
    knowledge_path = Path(knowledge_path)

    # Process markdown files recursively; files longer than the model's
    # maximum sequence length are truncated during encoding
    for file_path in knowledge_path.rglob("*.md"):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Generate embedding
            embedding = model.encode(content).tolist()

            # Create document metadata; the MD5 of the path serves as a stable id
            doc_id = hashlib.md5(str(file_path).encode()).hexdigest()
            embeddings_data.append({
                "id": doc_id,
                "content": content,
                "embedding": embedding,
                "metadata": {
                    "file_path": str(file_path.relative_to(knowledge_path)),
                    "file_name": file_path.name,
                    "collection": collection_name,
                    "content_type": "markdown",
                    "size": len(content)
                }
            })
        except Exception as e:
            print(f"Error processing file {file_path}: {e}")

    # Save embeddings to JSON file
    output_file = knowledge_path / f"{collection_name}_embeddings.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(embeddings_data, f, indent=2, ensure_ascii=False)

    print(f"Generated {len(embeddings_data)} embeddings for {collection_name}")
    return True


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python generate_embeddings.py <knowledge_path> <collection_name>")
        sys.exit(1)

    knowledge_path = sys.argv[1]
    collection_name = sys.argv[2]

    if process_text_files(knowledge_path, collection_name):
        print("Embedding generation completed successfully")
    else:
        print("Embedding generation failed")
        sys.exit(1)
EOF

    # Run embedding generation
    python3 "$TEMP_DIR/generate_embeddings.py" "$knowledge_path" "$collection_name" || {
        log_error "Failed to generate embeddings for $collection_name"
        return 1
    }
}
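
# Note: the embedded script requires the sentence-transformers package for
# whichever python3 is on PATH (e.g. pip install sentence-transformers); the
# all-MiniLM-L6-v2 model weights are downloaded on first use.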

# Upload embeddings to vector store
upload_embeddings() {
    local embeddings_file="$1"
    local collection_name="$2"

    log_info "Uploading embeddings to vector store: $collection_name"

    if [[ ! -f "$embeddings_file" ]]; then
        log_error "Embeddings file not found: $embeddings_file"
        return 1
    fi

    # Create the collection in ChromaDB (|| true: ignore the error when the
    # collection already exists)
    local chroma_url="http://${CHROMA_HOST:-localhost}:${CHROMA_PORT:-8000}"
    curl -X POST "$chroma_url/api/v1/collections" \
        -H "Content-Type: application/json" \
        -H "Authorization: Bearer ${CHROMA_AUTH_TOKEN:-}" \
        -d "{\"name\": \"$collection_name\"}" || true

    # Process and upload embeddings in batches. The heredoc delimiter is
    # deliberately unquoted so bash substitutes the file, collection, URL,
    # and token values into the Python source before it runs.
    python3 - << EOF
import json
import sys

import requests


def upload_batch(embeddings_data, collection_name, chroma_url, auth_token):
    """Upload embeddings in batches to ChromaDB."""
    batch_size = 100
    total_docs = len(embeddings_data)

    for i in range(0, total_docs, batch_size):
        batch = embeddings_data[i:i + batch_size]

        # Prepare batch data for ChromaDB
        ids = [doc["id"] for doc in batch]
        embeddings = [doc["embedding"] for doc in batch]
        metadatas = [doc["metadata"] for doc in batch]
        documents = [doc["content"] for doc in batch]

        payload = {
            "ids": ids,
            "embeddings": embeddings,
            "metadatas": metadatas,
            "documents": documents
        }

        try:
            response = requests.post(
                f"{chroma_url}/api/v1/collections/{collection_name}/add",
                json=payload,
                headers={
                    "Content-Type": "application/json",
                    "Authorization": f"Bearer {auth_token}"
                },
                timeout=30
            )
            if response.status_code == 200:
                print(f"Uploaded batch {i//batch_size + 1} ({len(batch)} documents)")
            else:
                print(f"Error uploading batch {i//batch_size + 1}: {response.status_code}")
                print(f"Response: {response.text}")
        except Exception as e:
            print(f"Error uploading batch {i//batch_size + 1}: {e}")
            continue


# Load and upload embeddings (the quoted values below are substituted by bash)
embeddings_file = "$embeddings_file"
collection_name = "$collection_name"
chroma_url = "$chroma_url"
auth_token = "${CHROMA_AUTH_TOKEN:-}"

try:
    with open(embeddings_file, 'r', encoding='utf-8') as f:
        embeddings_data = json.load(f)
    upload_batch(embeddings_data, collection_name, chroma_url, auth_token)
    print(f"Successfully uploaded {len(embeddings_data)} embeddings to {collection_name}")
except Exception as e:
    print(f"Error: {e}")
    sys.exit(1)
EOF
}
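
# Caveat: this targets Chroma's v1 REST API; depending on the server version,
# the add endpoint may expect the collection's UUID rather than its name, in
# which case the id returned by the create call above would need to be
# captured (e.g. with jq) and used in the URL instead.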

# Sync all knowledge repositories
sync_all_repositories() {
    log_info "Starting knowledge base synchronization..."

    mkdir -p "$TEMP_DIR"

    # Repository configurations, stored as "<repo_url>:<subpath>" pairs
    declare -A repos=(
        ["n8n"]="${KB_REPO_N8N:-}:${KB_PATH_N8N:-projects/n8n}"
        ["videos-e-animacoes"]="${KB_REPO_N8N:-}:${KB_PATH_VIDEOS:-projects/videos-e-animacoes}"
        ["midjourney-prompt"]="${KB_REPO_N8N:-}:${KB_PATH_MIDJOURNEY:-projects/midjorney-prompt}"
    )

    for collection in "${!repos[@]}"; do
        local repo_config="${repos[$collection]}"
        # Split on the LAST colon: repository URLs contain colons themselves
        # (https://..., git@host:...), but the trailing subpath does not
        local repo_url="${repo_config%:*}"
        local subpath="${repo_config##*:}"
        local target_path="$KNOWLEDGE_DIR/$collection"

        if [[ -n "$repo_url" ]]; then
            log_info "Syncing collection: $collection"

            # Sync repository
            sync_repository "$repo_url" "$target_path" "${KB_BRANCH_N8N:-main}" "$subpath"

            # Generate embeddings
            generate_embeddings "$target_path" "$collection"

            # Upload to vector store
            local embeddings_file="$target_path/${collection}_embeddings.json"
            if [[ -f "$embeddings_file" ]]; then
                upload_embeddings "$embeddings_file" "$collection"
            fi
        else
            log_warn "Repository URL not configured for collection: $collection"
        fi
    done
}
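
# To add another collection, extend the repos map above with one more
# "<repo_url>:<subpath>" entry, e.g. (hypothetical variables):
#   ["my-docs"]="${KB_REPO_DOCS:-}:${KB_PATH_DOCS:-projects/my-docs}"
# and add the matching directory to the mkdir call in main() below.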

# Update n8n with new knowledge
update_n8n_knowledge() {
    log_info "Notifying n8n of knowledge base updates..."

    # Trigger a webhook so n8n workflows can refresh their knowledge
    if [[ -n "${WEBHOOK_URL:-}" ]]; then
        local webhook_endpoint="$WEBHOOK_URL/webhook/knowledge-sync"
        curl -X POST "$webhook_endpoint" \
            -H "Content-Type: application/json" \
            -d "{\"event\": \"knowledge_updated\", \"timestamp\": \"$(date -Iseconds)\"}" \
            > /dev/null 2>&1 || {
            log_warn "Failed to notify n8n of knowledge updates"
        }
    fi
}
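
# This assumes an n8n workflow exposing a Webhook node at the "knowledge-sync"
# path under $WEBHOOK_URL; if none exists, the POST fails and is only
# surfaced as the warning above.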

# Main synchronization process
main() {
    log_info "Starting knowledge base synchronization"

    # Preliminary checks
    check_dependencies
    load_env

    # Create knowledge directories
    mkdir -p "$KNOWLEDGE_DIR"/{n8n,videos-e-animacoes,midjourney-prompt}

    # Sync all repositories
    sync_all_repositories

    # Update n8n
    update_n8n_knowledge

    log_info "Knowledge base synchronization completed"

    # Generate summary
    log_info "Synchronization Summary:"
    find "$KNOWLEDGE_DIR" -name "*_embeddings.json" | while read -r filepath; do
        local file collection count
        file="$(basename "$filepath")"
        collection="${file%_embeddings.json}"
        count="$(jq '. | length' "$filepath" 2>/dev/null || echo "0")"
        log_info "  - $collection: $count documents"
    done
}

# Run main function
main "$@"
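
# Example cron entry for a nightly sync (hypothetical path and schedule):
#   0 3 * * * /opt/project/scripts/sync-knowledge.sh >> /var/log/kb-sync.log 2>&1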