#!/bin/bash
# Knowledge Base Synchronization Script
# Syncs content from multiple GitHub repositories and generates embeddings
# Usage: ./sync-knowledge.sh
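#
# The script can be run on a schedule. Illustrative cron entry (path, time,
# and log location are placeholders; adjust to your install):
#   0 3 * * * /opt/n8n-dan/scripts/sync-knowledge.sh >> /var/log/kb-sync.log 2>&1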
set -euo pipefail
# Configuration
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
KNOWLEDGE_DIR="$PROJECT_ROOT/knowledge"
TEMP_DIR="/tmp/kb-sync-$$"
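# Variables expected from .env (names taken from their uses below; the
# values shown are only illustrative examples):
#   KB_REPO_N8N=https://github.com/<user>/<repo>.git   # source repository
#   KB_BRANCH_N8N=main                                 # branch to sync
#   KB_PATH_N8N=projects/n8n                           # subpath per collection
#   KB_PATH_VIDEOS=projects/videos-e-animacoes
#   KB_PATH_MIDJOURNEY=projects/midjorney-prompt
#   CHROMA_HOST=localhost                              # ChromaDB endpoint
#   CHROMA_PORT=8000
#   CHROMA_AUTH_TOKEN=<token>
#   WEBHOOK_URL=https://<n8n-host>                     # optional n8n webhook base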
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
log_info() {
    echo -e "${GREEN}[INFO]${NC} $1"
}

log_warn() {
    echo -e "${YELLOW}[WARN]${NC} $1"
}

log_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}

log_debug() {
    echo -e "${BLUE}[DEBUG]${NC} $1"
}
# Cleanup function
cleanup() {
    log_info "Cleaning up temporary files..."
    rm -rf "$TEMP_DIR"
}

# Set trap for cleanup on any exit
trap cleanup EXIT
# Check dependencies
check_dependencies() {
    # python3 is required for the embedded embedding/upload scripts; the
    # sentence-transformers and requests packages must also be installed,
    # but `command -v` cannot verify Python packages.
    local deps=("git" "curl" "jq" "python3")
    for dep in "${deps[@]}"; do
        if ! command -v "$dep" > /dev/null; then
            log_error "Required dependency not found: $dep"
            exit 1
        fi
    done
}
# Load environment variables
load_env() {
    if [[ -f "$PROJECT_ROOT/.env" ]]; then
        source "$PROJECT_ROOT/.env"
    else
        log_error ".env file not found. Copy .env.example and configure it."
        exit 1
    fi
}
# Clone or update repository
sync_repository() {
    local repo_url="$1"
    local target_path="$2"
    local branch="${3:-main}"
    local subpath="$4"

    log_info "Syncing repository: $repo_url"
    log_debug "Target: $target_path, Branch: $branch, Subpath: $subpath"

    local repo_name
    repo_name=$(basename "$repo_url" .git)
    local temp_repo_path="$TEMP_DIR/$repo_name"

    # Clone repository to temp directory (shallow, single branch)
    git clone --depth 1 --branch "$branch" "$repo_url" "$temp_repo_path" || {
        log_error "Failed to clone repository: $repo_url"
        return 1
    }

    # Copy specific subpath to target
    local source_path="$temp_repo_path/$subpath"
    if [[ -d "$source_path" ]]; then
        mkdir -p "$(dirname "$target_path")"
        cp -r "$source_path/." "$target_path/"
        log_info "Successfully synced to: $target_path"
    else
        log_warn "Subpath not found: $subpath in $repo_url"
        return 1
    fi
}
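# Example call, mirroring how sync_all_repositories invokes it (the URL is a
# placeholder):
#   sync_repository "https://github.com/<user>/<repo>.git" \
#       "$KNOWLEDGE_DIR/n8n" main "projects/n8n"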
# Generate embeddings for knowledge content
generate_embeddings() {
    local knowledge_path="$1"
    local collection_name="$2"

    log_info "Generating embeddings for: $collection_name"

    # Create Python script for embedding generation
    cat > "$TEMP_DIR/generate_embeddings.py" << 'EOF'
import hashlib
import json
import sys
from pathlib import Path

from sentence_transformers import SentenceTransformer


def load_model():
    """Load the sentence-transformers embedding model."""
    try:
        return SentenceTransformer('all-MiniLM-L6-v2')
    except Exception as e:
        print(f"Error loading model: {e}")
        return None


def process_text_files(knowledge_path, collection_name):
    """Embed every markdown file under knowledge_path."""
    model = load_model()
    if not model:
        return False

    embeddings_data = []
    knowledge_path = Path(knowledge_path)

    # Process markdown files (the only format the sync currently produces)
    for file_path in knowledge_path.rglob("*.md"):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Generate embedding for the file content
            embedding = model.encode(content).tolist()

            # Stable document id derived from the file path
            doc_id = hashlib.md5(str(file_path).encode()).hexdigest()

            embeddings_data.append({
                "id": doc_id,
                "content": content,
                "embedding": embedding,
                "metadata": {
                    "file_path": str(file_path.relative_to(knowledge_path)),
                    "file_name": file_path.name,
                    "collection": collection_name,
                    "content_type": "markdown",
                    "size": len(content)
                }
            })
        except Exception as e:
            print(f"Error processing file {file_path}: {e}")

    # Save embeddings to a JSON file alongside the content
    output_file = knowledge_path / f"{collection_name}_embeddings.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(embeddings_data, f, indent=2, ensure_ascii=False)

    print(f"Generated {len(embeddings_data)} embeddings for {collection_name}")
    return True


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python generate_embeddings.py <knowledge_path> <collection_name>")
        sys.exit(1)

    knowledge_path = sys.argv[1]
    collection_name = sys.argv[2]

    if process_text_files(knowledge_path, collection_name):
        print("Embedding generation completed successfully")
    else:
        print("Embedding generation failed")
        sys.exit(1)
EOF

    # Run embedding generation
    python3 "$TEMP_DIR/generate_embeddings.py" "$knowledge_path" "$collection_name" || {
        log_error "Failed to generate embeddings for $collection_name"
        return 1
    }
}
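# Note: all-MiniLM-L6-v2 embeds a limited context (256 word pieces per its
# model card), so long files are silently truncated by the model and only
# their beginning is represented. Chunking files before encoding would be a
# natural extension.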
# Upload embeddings to vector store
upload_embeddings() {
    local embeddings_file="$1"
    local collection_name="$2"

    log_info "Uploading embeddings to vector store: $collection_name"

    if [[ ! -f "$embeddings_file" ]]; then
        log_error "Embeddings file not found: $embeddings_file"
        return 1
    fi

    # Create the collection in ChromaDB (ignore failure if it already exists)
    local chroma_url="http://${CHROMA_HOST:-localhost}:${CHROMA_PORT:-8000}"
    curl -X POST "$chroma_url/api/v1/collections" \
        -H "Content-Type: application/json" \
        -H "Authorization: Bearer ${CHROMA_AUTH_TOKEN:-}" \
        -d "{\"name\": \"$collection_name\"}" || true

    # Process and upload embeddings in batches (unquoted heredoc: the shell
    # interpolates the file path, collection name, URL, and token below)
    python3 - << EOF
import json
import sys

import requests


def upload_batch(embeddings_data, collection_name, chroma_url, auth_token):
    """Upload embeddings in batches to ChromaDB."""
    batch_size = 100
    total_docs = len(embeddings_data)

    for i in range(0, total_docs, batch_size):
        batch = embeddings_data[i:i + batch_size]

        # Prepare batch data in the shape ChromaDB's add endpoint expects
        payload = {
            "ids": [doc["id"] for doc in batch],
            "embeddings": [doc["embedding"] for doc in batch],
            "metadatas": [doc["metadata"] for doc in batch],
            "documents": [doc["content"] for doc in batch]
        }

        try:
            response = requests.post(
                f"{chroma_url}/api/v1/collections/{collection_name}/add",
                json=payload,
                headers={
                    "Content-Type": "application/json",
                    "Authorization": f"Bearer {auth_token}"
                },
                timeout=30
            )
            if response.status_code == 200:
                print(f"Uploaded batch {i // batch_size + 1} ({len(batch)} documents)")
            else:
                print(f"Error uploading batch {i // batch_size + 1}: {response.status_code}")
                print(f"Response: {response.text}")
        except Exception as e:
            print(f"Error uploading batch {i // batch_size + 1}: {e}")
            continue


# Load and upload embeddings (values interpolated by the surrounding shell)
embeddings_file = "$embeddings_file"
collection_name = "$collection_name"
chroma_url = "$chroma_url"
auth_token = "${CHROMA_AUTH_TOKEN:-}"

try:
    with open(embeddings_file, 'r', encoding='utf-8') as f:
        embeddings_data = json.load(f)
    upload_batch(embeddings_data, collection_name, chroma_url, auth_token)
    print(f"Successfully uploaded {len(embeddings_data)} embeddings to {collection_name}")
except Exception as e:
    print(f"Error: {e}")
    sys.exit(1)
EOF
}
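# Note: depending on the ChromaDB version, the add endpoint may expect the
# collection UUID in the path rather than its name; if uploads return 4xx,
# resolve the UUID from the create/get-collection response first.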
# Sync all knowledge repositories
sync_all_repositories() {
    log_info "Starting knowledge base synchronization..."
    mkdir -p "$TEMP_DIR"

    # Repository configurations: "<repo_url>:<subpath>" per collection
    declare -A repos=(
        ["n8n"]="${KB_REPO_N8N:-}:${KB_PATH_N8N:-projects/n8n}"
        ["videos-e-animacoes"]="${KB_REPO_N8N:-}:${KB_PATH_VIDEOS:-projects/videos-e-animacoes}"
        ["midjourney-prompt"]="${KB_REPO_N8N:-}:${KB_PATH_MIDJOURNEY:-projects/midjorney-prompt}"
    )

    for collection in "${!repos[@]}"; do
        local repo_config="${repos[$collection]}"
        # Split on the LAST colon: repo URLs themselves contain colons
        # (https://... or git@host:...), so `cut -d':'` would break them.
        local repo_url="${repo_config%:*}"
        local subpath="${repo_config##*:}"
        local target_path="$KNOWLEDGE_DIR/$collection"

        if [[ -n "$repo_url" ]]; then
            log_info "Syncing collection: $collection"

            # Sync repository; skip this collection (rather than aborting the
            # whole run under `set -e`) if it fails
            sync_repository "$repo_url" "$target_path" "${KB_BRANCH_N8N:-main}" "$subpath" || {
                log_warn "Sync failed for collection: $collection; skipping"
                continue
            }

            # Generate embeddings
            generate_embeddings "$target_path" "$collection" || {
                log_warn "Embedding generation failed for: $collection; skipping upload"
                continue
            }

            # Upload to vector store
            local embeddings_file="$target_path/${collection}_embeddings.json"
            if [[ -f "$embeddings_file" ]]; then
                upload_embeddings "$embeddings_file" "$collection"
            fi
        else
            log_warn "Repository URL not configured for collection: $collection"
        fi
    done
}
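# To add a collection, create a matching knowledge subdirectory in main() and
# append an entry to the repos map above, e.g. (hypothetical names):
#   ["docs"]="${KB_REPO_DOCS:-}:${KB_PATH_DOCS:-projects/docs}"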
# Update n8n with new knowledge
update_n8n_knowledge() {
    log_info "Notifying n8n of knowledge base updates..."

    # Trigger a webhook so n8n workflows can refresh their knowledge
    if [[ -n "${WEBHOOK_URL:-}" ]]; then
        local webhook_endpoint="$WEBHOOK_URL/webhook/knowledge-sync"
        curl -X POST "$webhook_endpoint" \
            -H "Content-Type: application/json" \
            -d "{\"event\": \"knowledge_updated\", \"timestamp\": \"$(date -Iseconds)\"}" \
            > /dev/null 2>&1 || {
            log_warn "Failed to notify n8n of knowledge updates"
        }
    fi
}
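# The n8n workflow listening on /webhook/knowledge-sync receives a JSON body
# like the following (timestamp is illustrative):
#   {"event": "knowledge_updated", "timestamp": "2024-01-01T03:00:00+00:00"}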
# Main synchronization process
main() {
    log_info "Starting knowledge base synchronization"

    # Preliminary checks
    check_dependencies
    load_env

    # Create knowledge directories
    mkdir -p "$KNOWLEDGE_DIR"/{n8n,videos-e-animacoes,midjourney-prompt}

    # Sync all repositories
    sync_all_repositories

    # Update n8n
    update_n8n_knowledge

    log_info "Knowledge base synchronization completed"

    # Generate summary
    log_info "Synchronization Summary:"
    find "$KNOWLEDGE_DIR" -name "*_embeddings.json" -exec basename {} \; | while read -r file; do
        local collection="${file%_embeddings.json}"
        local count
        count=$(jq '. | length' "$KNOWLEDGE_DIR/$collection/$file" 2>/dev/null || echo "0")
        log_info "  - $collection: $count documents"
    done
}
# Run main function
main "$@"