#!/usr/bin/env bash
# Knowledge Base Synchronization Script
# Syncs content from multiple GitHub repositories and generates embeddings
# Usage: ./sync-knowledge.sh

set -euo pipefail

# Configuration
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
KNOWLEDGE_DIR="$PROJECT_ROOT/knowledge"
TEMP_DIR="/tmp/kb-sync-$$"

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

log_info() {
    echo -e "${GREEN}[INFO]${NC} $1"
}

log_warn() {
    echo -e "${YELLOW}[WARN]${NC} $1"
}

log_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}

log_debug() {
    echo -e "${BLUE}[DEBUG]${NC} $1"
}

# Cleanup function
cleanup() {
    log_info "Cleaning up temporary files..."
    rm -rf "$TEMP_DIR"
}

# Set trap for cleanup
trap cleanup EXIT

# Check dependencies (python3 is required for the embedding steps below)
check_dependencies() {
    local deps=("git" "curl" "jq" "python3")
    for dep in "${deps[@]}"; do
        if ! command -v "$dep" > /dev/null; then
            log_error "Required dependency not found: $dep"
            exit 1
        fi
    done
}

# Load environment variables
load_env() {
    if [[ -f "$PROJECT_ROOT/.env" ]]; then
        source "$PROJECT_ROOT/.env"
    else
        log_error ".env file not found. Copy .env.example and configure it."
        exit 1
    fi
}
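
# A minimal .env sketch (the variable names are the ones this script reads;
# the values are placeholders, not real endpoints or tokens):
#
#   KB_REPO_N8N=https://github.com/your-org/knowledge.git
#   KB_BRANCH_N8N=main
#   CHROMA_HOST=localhost
#   CHROMA_PORT=8000
#   CHROMA_AUTH_TOKEN=change-me
#   WEBHOOK_URL=https://n8n.example.com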

# Shallow-clone a repository and copy the requested subpath into place
sync_repository() {
    local repo_url="$1"
    local target_path="$2"
    local branch="${3:-main}"
    local subpath="$4"

    log_info "Syncing repository: $repo_url"
    log_debug "Target: $target_path, Branch: $branch, Subpath: $subpath"

    local repo_name
    repo_name="$(basename "$repo_url" .git)"
    local temp_repo_path="$TEMP_DIR/$repo_name"

    # Clone repository to temp directory
    git clone --depth 1 --branch "$branch" "$repo_url" "$temp_repo_path" || {
        log_error "Failed to clone repository: $repo_url"
        return 1
    }

    # Copy specific subpath to target
    local source_path="$temp_repo_path/$subpath"
    if [[ -d "$source_path" ]]; then
        mkdir -p "$(dirname "$target_path")"
        cp -r "$source_path/." "$target_path/"
        log_info "Successfully synced to: $target_path"
    else
        log_warn "Subpath not found: $subpath in $repo_url"
        return 1
    fi
}
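
# Example invocation (hypothetical repository URL):
#   sync_repository "https://github.com/your-org/docs.git" "$KNOWLEDGE_DIR/n8n" "main" "projects/n8n"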

# Generate embeddings for knowledge content
generate_embeddings() {
    local knowledge_path="$1"
    local collection_name="$2"

    log_info "Generating embeddings for: $collection_name"

    # Create Python script for embedding generation (quoted delimiter: no
    # bash substitution happens inside this heredoc)
    cat > "$TEMP_DIR/generate_embeddings.py" << 'EOF'
import hashlib
import json
import sys
from pathlib import Path

from sentence_transformers import SentenceTransformer


def load_model():
    """Load sentence transformer model."""
    try:
        return SentenceTransformer('all-MiniLM-L6-v2')
    except Exception as e:
        print(f"Error loading model: {e}")
        return None


def process_text_files(knowledge_path, collection_name):
    """Process markdown files and generate embeddings."""
    model = load_model()
    if not model:
        return False

    embeddings_data = []
    knowledge_path = Path(knowledge_path)

    # Process markdown files recursively; files longer than the model's
    # maximum sequence length are truncated during encoding
    for file_path in knowledge_path.rglob("*.md"):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Generate embedding
            embedding = model.encode(content).tolist()

            # Create document metadata; the MD5 of the path serves as a stable id
            doc_id = hashlib.md5(str(file_path).encode()).hexdigest()
            embeddings_data.append({
                "id": doc_id,
                "content": content,
                "embedding": embedding,
                "metadata": {
                    "file_path": str(file_path.relative_to(knowledge_path)),
                    "file_name": file_path.name,
                    "collection": collection_name,
                    "content_type": "markdown",
                    "size": len(content)
                }
            })
        except Exception as e:
            print(f"Error processing file {file_path}: {e}")

    # Save embeddings to JSON file
    output_file = knowledge_path / f"{collection_name}_embeddings.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(embeddings_data, f, indent=2, ensure_ascii=False)

    print(f"Generated {len(embeddings_data)} embeddings for {collection_name}")
    return True


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python generate_embeddings.py <knowledge_path> <collection_name>")
        sys.exit(1)

    knowledge_path = sys.argv[1]
    collection_name = sys.argv[2]

    if process_text_files(knowledge_path, collection_name):
        print("Embedding generation completed successfully")
    else:
        print("Embedding generation failed")
        sys.exit(1)
EOF

    # Run embedding generation
    python3 "$TEMP_DIR/generate_embeddings.py" "$knowledge_path" "$collection_name" || {
        log_error "Failed to generate embeddings for $collection_name"
        return 1
    }
}
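
# Note: the embedded script requires the sentence-transformers package for
# whichever python3 is on PATH (e.g. pip install sentence-transformers); the
# all-MiniLM-L6-v2 model weights are downloaded on first use.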

# Upload embeddings to vector store
upload_embeddings() {
    local embeddings_file="$1"
    local collection_name="$2"

    log_info "Uploading embeddings to vector store: $collection_name"

    if [[ ! -f "$embeddings_file" ]]; then
        log_error "Embeddings file not found: $embeddings_file"
        return 1
    fi

    # Create the collection in ChromaDB (|| true: ignore the error when the
    # collection already exists)
    local chroma_url="http://${CHROMA_HOST:-localhost}:${CHROMA_PORT:-8000}"
    curl -X POST "$chroma_url/api/v1/collections" \
        -H "Content-Type: application/json" \
        -H "Authorization: Bearer ${CHROMA_AUTH_TOKEN:-}" \
        -d "{\"name\": \"$collection_name\"}" || true

    # Process and upload embeddings in batches. The heredoc delimiter is
    # deliberately unquoted so bash substitutes the file, collection, URL,
    # and token values into the Python source before it runs.
    python3 - << EOF
import json
import sys

import requests


def upload_batch(embeddings_data, collection_name, chroma_url, auth_token):
    """Upload embeddings in batches to ChromaDB."""
    batch_size = 100
    total_docs = len(embeddings_data)

    for i in range(0, total_docs, batch_size):
        batch = embeddings_data[i:i + batch_size]

        # Prepare batch data for ChromaDB
        ids = [doc["id"] for doc in batch]
        embeddings = [doc["embedding"] for doc in batch]
        metadatas = [doc["metadata"] for doc in batch]
        documents = [doc["content"] for doc in batch]

        payload = {
            "ids": ids,
            "embeddings": embeddings,
            "metadatas": metadatas,
            "documents": documents
        }

        try:
            response = requests.post(
                f"{chroma_url}/api/v1/collections/{collection_name}/add",
                json=payload,
                headers={
                    "Content-Type": "application/json",
                    "Authorization": f"Bearer {auth_token}"
                },
                timeout=30
            )
            if response.status_code == 200:
                print(f"Uploaded batch {i//batch_size + 1} ({len(batch)} documents)")
            else:
                print(f"Error uploading batch {i//batch_size + 1}: {response.status_code}")
                print(f"Response: {response.text}")
        except Exception as e:
            print(f"Error uploading batch {i//batch_size + 1}: {e}")
            continue


# Load and upload embeddings (the quoted values below are substituted by bash)
embeddings_file = "$embeddings_file"
collection_name = "$collection_name"
chroma_url = "$chroma_url"
auth_token = "${CHROMA_AUTH_TOKEN:-}"

try:
    with open(embeddings_file, 'r', encoding='utf-8') as f:
        embeddings_data = json.load(f)
    upload_batch(embeddings_data, collection_name, chroma_url, auth_token)
    print(f"Successfully uploaded {len(embeddings_data)} embeddings to {collection_name}")
except Exception as e:
    print(f"Error: {e}")
    sys.exit(1)
EOF
}
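
# Caveat: this targets Chroma's v1 REST API; depending on the server version,
# the add endpoint may expect the collection's UUID rather than its name, in
# which case the id returned by the create call above would need to be
# captured (e.g. with jq) and used in the URL instead.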

# Sync all knowledge repositories
sync_all_repositories() {
    log_info "Starting knowledge base synchronization..."

    mkdir -p "$TEMP_DIR"

    # Repository configurations, stored as "<repo_url>:<subpath>" pairs
    declare -A repos=(
        ["n8n"]="${KB_REPO_N8N:-}:${KB_PATH_N8N:-projects/n8n}"
        ["videos-e-animacoes"]="${KB_REPO_N8N:-}:${KB_PATH_VIDEOS:-projects/videos-e-animacoes}"
        ["midjourney-prompt"]="${KB_REPO_N8N:-}:${KB_PATH_MIDJOURNEY:-projects/midjorney-prompt}"
    )

    for collection in "${!repos[@]}"; do
        local repo_config="${repos[$collection]}"
        # Split on the LAST colon: repository URLs contain colons themselves
        # (https://..., git@host:...), but the trailing subpath does not
        local repo_url="${repo_config%:*}"
        local subpath="${repo_config##*:}"
        local target_path="$KNOWLEDGE_DIR/$collection"

        if [[ -n "$repo_url" ]]; then
            log_info "Syncing collection: $collection"

            # Sync repository
            sync_repository "$repo_url" "$target_path" "${KB_BRANCH_N8N:-main}" "$subpath"

            # Generate embeddings
            generate_embeddings "$target_path" "$collection"

            # Upload to vector store
            local embeddings_file="$target_path/${collection}_embeddings.json"
            if [[ -f "$embeddings_file" ]]; then
                upload_embeddings "$embeddings_file" "$collection"
            fi
        else
            log_warn "Repository URL not configured for collection: $collection"
        fi
    done
}
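
# To add another collection, extend the repos map above with one more
# "<repo_url>:<subpath>" entry, e.g. (hypothetical variables):
#   ["my-docs"]="${KB_REPO_DOCS:-}:${KB_PATH_DOCS:-projects/my-docs}"
# and add the matching directory to the mkdir call in main() below.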

# Update n8n with new knowledge
update_n8n_knowledge() {
    log_info "Notifying n8n of knowledge base updates..."

    # Trigger a webhook so n8n workflows can refresh their knowledge
    if [[ -n "${WEBHOOK_URL:-}" ]]; then
        local webhook_endpoint="$WEBHOOK_URL/webhook/knowledge-sync"
        curl -X POST "$webhook_endpoint" \
            -H "Content-Type: application/json" \
            -d "{\"event\": \"knowledge_updated\", \"timestamp\": \"$(date -Iseconds)\"}" \
            > /dev/null 2>&1 || {
            log_warn "Failed to notify n8n of knowledge updates"
        }
    fi
}
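
# This assumes an n8n workflow exposing a Webhook node at the "knowledge-sync"
# path under $WEBHOOK_URL; if none exists, the POST fails and is only
# surfaced as the warning above.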

# Main synchronization process
main() {
    log_info "Starting knowledge base synchronization"

    # Preliminary checks
    check_dependencies
    load_env

    # Create knowledge directories
    mkdir -p "$KNOWLEDGE_DIR"/{n8n,videos-e-animacoes,midjourney-prompt}

    # Sync all repositories
    sync_all_repositories

    # Update n8n
    update_n8n_knowledge

    log_info "Knowledge base synchronization completed"

    # Generate summary
    log_info "Synchronization Summary:"
    find "$KNOWLEDGE_DIR" -name "*_embeddings.json" | while read -r filepath; do
        local file collection count
        file="$(basename "$filepath")"
        collection="${file%_embeddings.json}"
        count="$(jq '. | length' "$filepath" 2>/dev/null || echo "0")"
        log_info "  - $collection: $count documents"
    done
}

# Run main function
main "$@"
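
# Example cron entry for a nightly sync (hypothetical path and schedule):
#   0 3 * * * /opt/project/scripts/sync-knowledge.sh >> /var/log/kb-sync.log 2>&1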