""" Central configuration file for the Multi-Method RAG System. All shared parameters and settings are defined here. """ import os from pathlib import Path from dotenv import load_dotenv # Load environment variables load_dotenv(override=True) # ==================== Versioning and Date ==================== DATE = "August 13, 2025" VERSION = "2.0.1" # ==================== API Configuration ==================== OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") OPENAI_CHAT_MODEL = "gpt-5-chat-latest" # This is the non-reasoning model for gpt-5 so it has no latency OPENAI_EMBEDDING_MODEL = "text-embedding-3-large" # Options: text-embedding-3-large, text-embedding-3-small, text-embedding-ada-002 # ==================== Realtime API Configuration ==================== # OpenAI Realtime API settings for speech-to-speech functionality OPENAI_REALTIME_MODEL = "gpt-4o-realtime-preview" # Realtime model for speech-to-speech REALTIME_VOICE = "alloy" # Available voices: alloy, echo, fable, onyx, nova, shimmer REALTIME_INSTRUCTIONS = ( "You are a knowledgeable safety expert speaking naturally in conversation. " "VOICE BEHAVIOR: " "- Speak like a confident safety professional talking to a colleague " "- Acknowledge what you heard: 'You're asking about [topic]...' " "- Use natural speech with appropriate pauses and emphasis " "- Sound authoritative and knowledgeable - you ARE the expert " "- Never mention document names, page numbers, or citation details when speaking " "- Just state the facts naturally as if you know them from your expertise " "RESPONSE PROCESS: " "1. Briefly acknowledge the question: 'You're asking about [topic]...' " "2. Call ask_rag to get the accurate information " "3. Speak the information naturally as YOUR expertise, not as 'according to document X' " "4. Organize complex topics: 'There are three key requirements here...' " "5. 
Be thorough but conversational - like explaining to a colleague " "CITATION RULE: " "NEVER mention specific documents, sources, or page numbers in speech. " "Just state the information confidently as if it's your professional knowledge. " "For example, don't say 'According to OSHA 1910.147...' - just say 'The lockout tagout requirements are...' " "IMPORTANT: Always use ask_rag for safety questions to get accurate information, " "but speak the results as your own expertise, not as citations." ) # ==================== Model Parameters ==================== # Generation parameters DEFAULT_TEMPERATURE = 0 # Range: 0.0-1.0 (0=deterministic, 1=creative) DEFAULT_MAX_TOKENS = 5000 # Maximum tokens in response DEFAULT_TOP_K = 5 # Number of chunks to retrieve by default DEFAULT_TOP_P = 1.0 # Nucleus sampling parameter # Context window management MAX_CONTEXT_TOKENS = 7500 # Maximum context for models with 8k window CHUNK_SIZE = 2000 # Tokens per chunk (used by TextPreprocessor.chunk_text_by_tokens) CHUNK_OVERLAP = 200 # Token overlap between chunks # ==================== Embedding Models ==================== # Sentence Transformers models SENTENCE_TRANSFORMER_MODEL = 'all-MiniLM-L6-v2' # For DPR CROSS_ENCODER_MODEL = 'cross-encoder/ms-marco-MiniLM-L-6-v2' # For re-ranking # CLIP model CLIP_MODEL = "ViT-L/14" # Options: ViT-B/32, ViT-L/14, RN50 # ==================== Search Parameters ==================== # BM25 parameters BM25_K1 = 1.5 # Term frequency saturation parameter BM25_B = 0.75 # Length normalization parameter # Hybrid search DEFAULT_HYBRID_ALPHA = 0.5 # Weight for BM25 (1-alpha for semantic) # Re-ranking RERANK_MULTIPLIER = 2 # Retrieve this many times top_k for re-ranking MIN_RELEVANCE_SCORE = 0.3 # Minimum score threshold # ==================== Directory Structure ==================== # Project directories PROJECT_ROOT = Path(__file__).parent DATA_DIR = PROJECT_ROOT / "data" EMBEDDINGS_DIR = PROJECT_ROOT / "embeddings" GRAPH_DIR = PROJECT_ROOT / "graph" 
METADATA_DIR = PROJECT_ROOT / "metadata"
IMAGES_DIR = DATA_DIR / "images"

# File paths — persisted artifacts for each retrieval method.
VANILLA_FAISS_INDEX = EMBEDDINGS_DIR / "vanilla_faiss.index"
VANILLA_METADATA = EMBEDDINGS_DIR / "vanilla_metadata.pkl"
DPR_FAISS_INDEX = EMBEDDINGS_DIR / "dpr_faiss.index"
DPR_METADATA = EMBEDDINGS_DIR / "dpr_metadata.pkl"
BM25_INDEX = EMBEDDINGS_DIR / "bm25_index.pkl"
CONTEXT_DOCS = EMBEDDINGS_DIR / "context_stuffing_docs.pkl"
GRAPH_FILE = GRAPH_DIR / "graph.gml"
IMAGES_DB = METADATA_DIR / "images.db"
CHROMA_PATH = EMBEDDINGS_DIR / "chroma"

# ==================== Batch Processing ====================
EMBEDDING_BATCH_SIZE = 100  # Batch size for OpenAI embeddings
PROCESSING_BATCH_SIZE = 50  # Documents to process at once

# ==================== UI Configuration ====================
# Streamlit settings
MAX_CHAT_HISTORY = 5  # Maximum chat messages to keep

# Canned example questions offered in the UI.
EXAMPLE_QUESTIONS = [
    "What are general machine guarding requirements?",
    "How do I perform lockout/tagout?",
    "What safety measures are needed for robotic systems?",
    "Explain the difference between guards and devices in machine safety.",
    "What are the OSHA requirements for emergency stops?",
]

# Default retrieval method pre-selected in the UI.
DEFAULT_METHOD = "graph"

# Method descriptions for UI (keys are the method identifiers).
METHOD_DESCRIPTIONS = {
    'graph': "Graph-based RAG using NetworkX with relationship-aware retrieval",
    'vanilla': "Standard vector search with FAISS and OpenAI embeddings",
    'dpr': "Dense Passage Retrieval with bi-encoder and cross-encoder re-ranking",
    'bm25': "BM25 keyword search with neural re-ranking for exact term matching",
    'context': "Context stuffing with full document loading and heuristic selection",
    'vision': "Vision-based search using GPT-5 Vision for image analysis and classification"
}

# ==================== Document Processing ====================
# Document types accepted for ingestion.
SUPPORTED_EXTENSIONS = ['.pdf', '.txt', '.md', '.html']
IMAGE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.bmp', '.gif']

# Text splitting
MARKDOWN_HEADER_LEVEL = 3  # Split by this header level (###)
MAX_SECTIONS_PER_DOC = 500  # Maximum sections to extract from a document

# ==================== Logging ====================
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")  # DEBUG, INFO, WARNING, ERROR
LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"

# ==================== Performance ====================
# Device configuration.  NOTE(review): torch is imported mid-file in the
# original; kept here so import-order side effects are unchanged.
import torch

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
NUM_WORKERS = 4  # Parallel processing workers

# Cache settings
ENABLE_CACHE = True
CACHE_TTL = 3600  # Cache time-to-live in seconds

# ==================== Safety & Validation ====================
# Input validation
MAX_QUESTION_LENGTH = 1000  # Maximum characters in a question
MAX_IMAGE_SIZE_MB = 10  # Maximum image file size

# Rate limiting (if needed)
RATE_LIMIT_ENABLED = False
MAX_QUERIES_PER_MINUTE = 60

# ==================== Default HTML Sources ====================
DEFAULT_HTML_SOURCES = [
    {
        "title": "NIOSH Robotics in the Workplace – Safety Overview",
        "url": "https://www.cdc.gov/niosh/robotics/about/",
        "source": "NIOSH",
        "year": 2024,
        "category": "Technical Guide",
        "format": "HTML"
    }
]


# ==================== Helper Functions ====================
def ensure_directories() -> None:
    """Create all required project directories if they don't exist."""
    for directory in [DATA_DIR, EMBEDDINGS_DIR, GRAPH_DIR, METADATA_DIR, IMAGES_DIR]:
        directory.mkdir(parents=True, exist_ok=True)


def get_model_context_length(model_name: str = OPENAI_CHAT_MODEL) -> int:
    """Return the context window size (in tokens) for a given model.

    Args:
        model_name: OpenAI model identifier. Defaults to the configured
            ``OPENAI_CHAT_MODEL``.

    Returns:
        The model's context length in tokens; a conservative 4096 for
        models not listed in the table.
    """
    context_lengths = {
        "gpt-5": 128000,
        # Bug fix: the configured default model ("gpt-5-chat-latest") was
        # missing from this table, so the system's own default chat model
        # silently fell back to the 4096-token default.
        "gpt-5-chat-latest": 128000,
        "gpt-4o-mini": 8192,
        "gpt-4o": 128000,
    }
    return context_lengths.get(model_name, 4096)


def validate_api_key() -> bool:
    """Return True if the OpenAI API key is set; raise ValueError otherwise.

    Raises:
        ValueError: if ``OPENAI_API_KEY`` is unset or empty.
    """
    if not OPENAI_API_KEY:
        raise ValueError(
            "OpenAI API key not found. Please set OPENAI_API_KEY in .env file."
        )
    return True


# ==================== System Info ====================
def print_config() -> None:
    """Print the key configuration values for debugging."""
    print("=" * 50)
    print("RAG System Configuration")
    print("=" * 50)
    print(f"OpenAI Model: {OPENAI_CHAT_MODEL}")
    print(f"Embedding Model: {OPENAI_EMBEDDING_MODEL}")
    print(f"Device: {DEVICE}")
    print(f"Default Temperature: {DEFAULT_TEMPERATURE}")
    print(f"Default Top-K: {DEFAULT_TOP_K}")
    print(f"Chunk Size: {CHUNK_SIZE}")
    print(f"Project Root: {PROJECT_ROOT}")
    print("=" * 50)


# Ensure directories exist on import (side effect kept from the original).
ensure_directories()