n8n-dan

Sleeping

File size: 11,035 Bytes

89f19e4

#!/bin/bash

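# Knowledge Base Synchronization Script
# Syncs content from multiple GitHub repositories, generates embeddings for
# the synced Markdown files, and uploads them to a ChromaDB vector store.
# Usage: ./sync-knowledge.sh
#
# Requires: git, curl, jq and python3 (with the `sentence_transformers` and
# `requests` packages importable).
# Configuration is sourced from "$PROJECT_ROOT/.env"; variables read:
#   KB_REPO_N8N, KB_BRANCH_N8N, KB_PATH_N8N, KB_PATH_VIDEOS, KB_PATH_MIDJOURNEY,
#   CHROMA_HOST, CHROMA_PORT, CHROMA_AUTH_TOKEN, WEBHOOK_URL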

# Strict mode: abort on unhandled errors, unset variables and failed pipeline
# stages.
set -euo pipefail

# Configuration
# SCRIPT_DIR is the directory holding this script; PROJECT_ROOT is its parent.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
KNOWLEDGE_DIR="$PROJECT_ROOT/knowledge"
# Scratch space for clones and generated helper scripts. mktemp -d creates a
# private directory with an unpredictable name; a fixed "/tmp/kb-sync-<pid>"
# path could be pre-created (or symlinked) by another local user.
TEMP_DIR="$(mktemp -d "${TMPDIR:-/tmp}/kb-sync.XXXXXX")"

# Colors for output (escape sequences are expanded by the log_* helpers)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Print an informational message to stdout, prefixed with a green "[INFO]".
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text, printed verbatim (defaults to empty)
log_info() {
    # %b expands the escape sequences stored in the color variables; %s prints
    # the message untouched, so backslashes in paths or URLs are not
    # reinterpreted the way `echo -e` would.
    printf '%b[INFO]%b %s\n' "$GREEN" "$NC" "${1:-}"
}

# Print a warning to stderr, prefixed with a yellow "[WARN]".
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message text, printed verbatim (defaults to empty)
log_warn() {
    # Diagnostics go to stderr so they are not mixed into captured stdout.
    # %s keeps backslashes in the message literal (unlike `echo -e`).
    printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "${1:-}" >&2
}

# Print an error message to stderr, prefixed with a red "[ERROR]".
# Globals:   RED, NC (read)
# Arguments: $1 - message text, printed verbatim (defaults to empty)
log_error() {
    # Errors belong on stderr so they survive stdout redirection/capture.
    # %s keeps backslashes in the message literal (unlike `echo -e`).
    printf '%b[ERROR]%b %s\n' "$RED" "$NC" "${1:-}" >&2
}

# Print a debug message to stdout, prefixed with a blue "[DEBUG]".
# Globals:   BLUE, NC (read)
# Arguments: $1 - message text, printed verbatim (defaults to empty)
log_debug() {
    # %b expands the color escapes; %s prints the message as-is so that
    # backslashes in it are not interpreted (which `echo -e` would do).
    printf '%b[DEBUG]%b %s\n' "$BLUE" "$NC" "${1:-}"
}

# Remove the scratch directory used during a sync run.
# Globals: TEMP_DIR (read) - directory tree to delete
cleanup() {
    local notice="Cleaning up temporary files..."

    log_info "$notice"
    # -f: do not complain when the directory was never created.
    rm -r -f "$TEMP_DIR"
}

# Register cleanup to run whenever the shell exits (normal completion, an
# explicit `exit`, or an abort triggered by `set -e`), so "$TEMP_DIR" is
# removed on every exit path.
trap cleanup EXIT

# Verify that every external program this script invokes is available on PATH.
# All missing tools are reported (one log_error line each) before exiting, so
# the user can install everything in one go.
# Exits the script with status 1 if any dependency is missing.
check_dependencies() {
    # python3 runs the embedding-generation and upload helpers, so it is as
    # much a hard requirement as git/curl/jq.
    local deps=("git" "curl" "jq" "python3")
    local missing=()
    local dep

    for dep in "${deps[@]}"; do
        if ! command -v "$dep" > /dev/null 2>&1; then
            missing+=("$dep")
        fi
    done

    if (( ${#missing[@]} > 0 )); then
        for dep in "${missing[@]}"; do
            log_error "Required dependency not found: $dep"
        done
        exit 1
    fi
}

# Source the project's .env file into the current shell.
# Globals: PROJECT_ROOT (read); whatever the .env file assigns (written)
# Exits the script with status 1 when the file does not exist.
load_env() {
    # Guard clause: bail out early when there is nothing to load.
    [[ -f "$PROJECT_ROOT/.env" ]] || {
        log_error ".env file not found. Copy .env.example and configure it."
        exit 1
    }

    # shellcheck source=/dev/null
    . "$PROJECT_ROOT/.env"
}

# Shallow-clone a repository into the scratch directory and copy one of its
# sub-directories into the knowledge base.
# Globals:   TEMP_DIR (read)
# Arguments: $1 - repository URL
#            $2 - destination directory for the copied content
#            $3 - branch to clone (default: main)
#            $4 - path inside the repository to copy (default: repository root)
# Returns:   0 on success; 1 if the clone fails, the subpath does not exist,
#            or the copy fails
sync_repository() {
    local repo_url="$1"
    local target_path="$2"
    local branch="${3:-main}"
    local subpath="${4:-}"

    log_info "Syncing repository: $repo_url"
    log_debug "Target: $target_path, Branch: $branch, Subpath: $subpath"

    # Declared separately so a failing basename is not masked by `local`.
    local repo_name
    repo_name="$(basename -- "$repo_url" .git)"
    if [[ -z "$repo_name" ]]; then
        log_error "Failed to clone repository: $repo_url"
        return 1
    fi
    local temp_repo_path="$TEMP_DIR/$repo_name"

    # The clone path depends only on the repository name, so syncing several
    # subpaths of the same repository reuses it. git refuses to clone into a
    # non-empty directory; start from a clean slate each time.
    mkdir -p "$TEMP_DIR"
    rm -rf -- "${temp_repo_path:?}"

    # Clone repository to temp directory
    if ! git clone --depth 1 --branch "$branch" -- "$repo_url" "$temp_repo_path"; then
        log_error "Failed to clone repository: $repo_url"
        return 1
    fi

    # Copy specific subpath to target
    local source_path="$temp_repo_path/$subpath"
    if [[ ! -d "$source_path" ]]; then
        log_warn "Subpath not found: $subpath in $repo_url"
        return 1
    fi

    # Create the target itself (not just its parent) so the copy below does
    # not depend on the caller having prepared it.
    mkdir -p "$target_path" || return 1
    # "source/." copies the directory's contents, dotfiles included.
    if ! cp -r "$source_path/." "$target_path/"; then
        log_error "Failed to copy $subpath to $target_path"
        return 1
    fi
    log_info "Successfully synced to: $target_path"
}

# Build the embeddings file for one knowledge collection.
# Writes a helper Python script to "$TEMP_DIR/generate_embeddings.py" and runs
# it with python3. The helper encodes every *.md file below the knowledge path
# with the 'all-MiniLM-L6-v2' sentence-transformers model and saves the result
# as "<collection>_embeddings.json" inside the knowledge path.
# Globals:   TEMP_DIR (read)
# Arguments: $1 - directory containing the collection's files
#            $2 - collection name
# Returns:   1 (after logging an error) when the Python helper fails
generate_embeddings() {
    local kb_dir="$1"
    local collection="$2"
    local helper_script="$TEMP_DIR/generate_embeddings.py"

    log_info "Generating embeddings for: $collection"

    # Quoted delimiter: the Python source is written out verbatim, with no
    # shell expansion.
    cat > "$helper_script" << 'EOF'
import os
import json
import sys
from pathlib import Path
import hashlib
import requests
from sentence_transformers import SentenceTransformer

def load_model():
    """Load sentence transformer model"""
    try:
        model = SentenceTransformer('all-MiniLM-L6-v2')
        return model
    except Exception as e:
        print(f"Error loading model: {e}")
        return None

def process_text_files(knowledge_path, collection_name):
    """Process text files and generate embeddings"""
    model = load_model()
    if not model:
        return False
    
    embeddings_data = []
    knowledge_path = Path(knowledge_path)
    
    # Process markdown and text files
    for file_path in knowledge_path.rglob("*.md"):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            
            # Generate embedding
            embedding = model.encode(content).tolist()
            
            # Create document metadata
            doc_id = hashlib.md5(str(file_path).encode()).hexdigest()
            
            embeddings_data.append({
                "id": doc_id,
                "content": content,
                "embedding": embedding,
                "metadata": {
                    "file_path": str(file_path.relative_to(knowledge_path)),
                    "file_name": file_path.name,
                    "collection": collection_name,
                    "content_type": "markdown",
                    "size": len(content)
                }
            })
            
        except Exception as e:
            print(f"Error processing file {file_path}: {e}")
    
    # Save embeddings to JSON file
    output_file = knowledge_path / f"{collection_name}_embeddings.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(embeddings_data, f, indent=2, ensure_ascii=False)
    
    print(f"Generated {len(embeddings_data)} embeddings for {collection_name}")
    return True

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python generate_embeddings.py <knowledge_path> <collection_name>")
        sys.exit(1)
    
    knowledge_path = sys.argv[1]
    collection_name = sys.argv[2]
    
    if process_text_files(knowledge_path, collection_name):
        print("Embedding generation completed successfully")
    else:
        print("Embedding generation failed")
        sys.exit(1)
EOF

    # Hand the knowledge directory and collection name to the helper.
    if ! python3 "$helper_script" "$kb_dir" "$collection"; then
        log_error "Failed to generate embeddings for $collection"
        return 1
    fi
}

# Upload a collection's embeddings file to the ChromaDB vector store.
# First asks the server to create the collection (best-effort: the request is
# allowed to fail, e.g. when the collection already exists), then posts the
# documents in batches of 100 via an inline Python helper.
# Globals:   CHROMA_HOST (default localhost), CHROMA_PORT (default 8000),
#            CHROMA_AUTH_TOKEN (optional) - all read
# Arguments: $1 - path to the "<collection>_embeddings.json" file
#            $2 - collection name
# Returns:   1 if the embeddings file is missing or cannot be loaded.
#            Failed batches are reported but do not change the exit status
#            (uploading stays best-effort, as before).
# NOTE(review): the helper addresses the collection by *name* in
# /api/v1/collections/<name>/add - confirm against the deployed ChromaDB
# version whether that endpoint expects the collection id instead.
upload_embeddings() {
    local embeddings_file="$1"
    local collection_name="$2"
    # Default to empty so an unset token does not abort the script under
    # `set -u`; an empty token means "send no Authorization header".
    local auth_token="${CHROMA_AUTH_TOKEN:-}"

    log_info "Uploading embeddings to vector store: $collection_name"

    if [[ ! -f "$embeddings_file" ]]; then
        log_error "Embeddings file not found: $embeddings_file"
        return 1
    fi

    # Upload to ChromaDB
    local chroma_url="http://${CHROMA_HOST:-localhost}:${CHROMA_PORT:-8000}"

    local -a auth_header=()
    if [[ -n "$auth_token" ]]; then
        auth_header=(-H "Authorization: Bearer $auth_token")
    fi

    # `|| true` is deliberate: creating a collection that already exists (or
    # an unreachable server) must not abort the run at this point.
    # The ${arr[@]+...} form keeps an empty array safe under `set -u` on
    # older bash versions.
    curl -X POST "$chroma_url/api/v1/collections" \
        -H "Content-Type: application/json" \
        ${auth_header[@]+"${auth_header[@]}"} \
        -d "{\"name\": \"$collection_name\"}" || true

    # Process and upload embeddings in batches.
    # The heredoc delimiter is quoted so the shell never rewrites the Python
    # source; inputs travel via argv and the token via the environment, which
    # keeps quotes/backslashes in them harmless and the token out of argv.
    CHROMA_AUTH_TOKEN="$auth_token" python3 - "$embeddings_file" "$collection_name" "$chroma_url" << 'EOF'
import json
import os
import sys

import requests


def upload_batch(embeddings_data, collection_name, chroma_url, auth_token):
    """Upload embeddings to ChromaDB in batches.

    Returns the number of documents the server accepted.
    """
    batch_size = 100
    uploaded = 0

    headers = {"Content-Type": "application/json"}
    if auth_token:
        headers["Authorization"] = f"Bearer {auth_token}"

    for i in range(0, len(embeddings_data), batch_size):
        batch = embeddings_data[i:i + batch_size]
        batch_no = i // batch_size + 1

        # Prepare batch data for ChromaDB
        payload = {
            "ids": [doc["id"] for doc in batch],
            "embeddings": [doc["embedding"] for doc in batch],
            "metadatas": [doc["metadata"] for doc in batch],
            "documents": [doc["content"] for doc in batch],
        }

        try:
            response = requests.post(
                f"{chroma_url}/api/v1/collections/{collection_name}/add",
                json=payload,
                headers=headers,
                timeout=30,
            )
        except Exception as e:
            print(f"Error uploading batch {batch_no}: {e}", file=sys.stderr)
            continue

        # Any 2xx status counts as success, not only 200.
        if response.ok:
            uploaded += len(batch)
            print(f"Uploaded batch {batch_no} ({len(batch)} documents)")
        else:
            print(f"Error uploading batch {batch_no}: {response.status_code}", file=sys.stderr)
            print(f"Response: {response.text}", file=sys.stderr)

    return uploaded


# Load and upload embeddings
embeddings_file, collection_name, chroma_url = sys.argv[1:4]
auth_token = os.environ.get("CHROMA_AUTH_TOKEN", "")

try:
    with open(embeddings_file, 'r', encoding='utf-8') as f:
        embeddings_data = json.load(f)

    total = len(embeddings_data)
    uploaded = upload_batch(embeddings_data, collection_name, chroma_url, auth_token)
    if uploaded == total:
        print(f"Successfully uploaded {total} embeddings to {collection_name}")
    else:
        # Do not claim success when batches were rejected or failed.
        print(
            f"Warning: uploaded only {uploaded} of {total} embeddings to {collection_name}",
            file=sys.stderr,
        )

except Exception as e:
    print(f"Error: {e}", file=sys.stderr)
    sys.exit(1)
EOF
}

# Sync every configured knowledge collection: clone its repository subpath
# into "$KNOWLEDGE_DIR/<collection>", generate embeddings, and upload them if
# an embeddings file was produced.
# Globals:   TEMP_DIR, KNOWLEDGE_DIR, KB_REPO_N8N, KB_BRANCH_N8N, KB_PATH_N8N,
#            KB_PATH_VIDEOS, KB_PATH_MIDJOURNEY (all read)
# Returns:   the failing step's status as soon as one collection fails;
#            collections without a configured URL are skipped with a warning
sync_all_repositories() {
    log_info "Starting knowledge base synchronization..."

    mkdir -p "$TEMP_DIR"

    # Repository configurations, keyed by collection name.
    # URL and subpath are kept in separate maps: packing them into one
    # "url:subpath" string and splitting on ':' breaks on every real URL,
    # because "https://..." and "git@host:path" themselves contain colons.
    local -A repo_urls=(
        ["n8n"]="${KB_REPO_N8N:-}"
        ["videos-e-animacoes"]="${KB_REPO_N8N:-}"
        ["midjourney-prompt"]="${KB_REPO_N8N:-}"
    )
    # NOTE(review): the midjourney default path is spelled "midjorney" -
    # kept as-is because it may mirror the real directory name; confirm.
    local -A repo_subpaths=(
        ["n8n"]="${KB_PATH_N8N:-projects/n8n}"
        ["videos-e-animacoes"]="${KB_PATH_VIDEOS:-projects/videos-e-animacoes}"
        ["midjourney-prompt"]="${KB_PATH_MIDJOURNEY:-projects/midjorney-prompt}"
    )

    local collection repo_url subpath target_path embeddings_file
    for collection in "${!repo_urls[@]}"; do
        repo_url="${repo_urls[$collection]}"
        subpath="${repo_subpaths[$collection]}"
        target_path="$KNOWLEDGE_DIR/$collection"

        if [[ -z "$repo_url" ]]; then
            log_warn "Repository URL not configured for collection: $collection"
            continue
        fi

        log_info "Syncing collection: $collection"

        # Failures are propagated explicitly instead of relying on `set -e`,
        # which is suspended whenever this function runs inside a condition.
        sync_repository "$repo_url" "$target_path" "${KB_BRANCH_N8N:-main}" "$subpath" || return
        generate_embeddings "$target_path" "$collection" || return

        # Upload to vector store
        embeddings_file="$target_path/${collection}_embeddings.json"
        if [[ -f "$embeddings_file" ]]; then
            upload_embeddings "$embeddings_file" "$collection" || return
        fi
    done
}

# Notify n8n that the knowledge base changed by POSTing a "knowledge_updated"
# event to "<WEBHOOK_URL>/webhook/knowledge-sync".
# Globals: WEBHOOK_URL (read; when unset or empty no request is sent)
# Returns: 0 always - a failed notification is logged as a warning only
update_n8n_knowledge() {
    log_info "Notifying n8n of knowledge base updates..."

    if [[ -z "${WEBHOOK_URL:-}" ]]; then
        return 0
    fi

    # Drop one trailing slash so the endpoint does not contain "//".
    local webhook_endpoint="${WEBHOOK_URL%/}/webhook/knowledge-sync"

    # `date -Iseconds` is not available everywhere; fall back to an explicit
    # UTC ISO-8601 format when it is rejected.
    local timestamp
    timestamp="$(date -Iseconds 2> /dev/null || date -u +"%Y-%m-%dT%H:%M:%SZ")"

    # --fail makes curl exit non-zero on HTTP 4xx/5xx; without it a rejected
    # notification would be treated as delivered. --max-time bounds a hung
    # connection so the sync cannot stall here.
    if ! curl --fail --silent --max-time 30 \
        -X POST "$webhook_endpoint" \
        -H "Content-Type: application/json" \
        -d "{\"event\": \"knowledge_updated\", \"timestamp\": \"$timestamp\"}" \
        > /dev/null 2>&1; then
        log_warn "Failed to notify n8n of knowledge updates"
    fi
}

# Main synchronization process: check prerequisites, load configuration, sync
# all collections, notify n8n, then log a per-collection document count.
# Globals: KNOWLEDGE_DIR (read)
main() {
    log_info "Starting knowledge base synchronization"

    # Preliminary checks
    check_dependencies
    load_env

    # Create knowledge directories
    mkdir -p "$KNOWLEDGE_DIR"/{n8n,videos-e-animacoes,midjourney-prompt}

    # Sync all repositories
    sync_all_repositories

    # Update n8n
    update_n8n_knowledge

    log_info "Knowledge base synchronization completed"

    # Generate summary
    log_info "Synchronization Summary:"
    local embeddings_path file_name collection count
    # NUL-delimited find output keeps unusual file names intact. The count is
    # read from the path find actually returned rather than a path rebuilt
    # from the file name, so files in nested directories are counted too.
    while IFS= read -r -d '' embeddings_path; do
        file_name="${embeddings_path##*/}"
        collection="${file_name%_embeddings.json}"
        # An unreadable or invalid JSON file is reported as 0 documents.
        count="$(jq 'length' "$embeddings_path" 2> /dev/null)" || count="0"
        log_info "  - $collection: $count documents"
    done < <(find "$KNOWLEDGE_DIR" -type f -name "*_embeddings.json" -print0)
}

# Entry point: run main, forwarding the script's command-line arguments
# unchanged (main does not currently read them).
main "$@"