""" Configuration settings for llama.cpp in Hugging Face Space """ import os # Model configuration MODEL_CONFIG = { "n_ctx": 2048, # Context window size "n_threads": 2, # Number of threads (conservative for HF Spaces) "n_batch": 8, # Batch size for prompt processing "use_mmap": True, # Use memory mapping for model files "use_mlock": False, # Don't lock model in memory (saves RAM) "verbose": False, # Reduce logging in production } # Generation defaults GENERATION_CONFIG = { "temperature": 0.7, "top_p": 0.9, "top_k": 40, "repeat_penalty": 1.1, "stop": ["```", "\n\n\n", "Human:", "Assistant:"], } # Hugging Face Space specific settings HF_SPACE_CONFIG = { "max_memory_usage": "2GB", # Conservative memory usage "timeout_seconds": 30, # Request timeout "enable_cpu_only": True, # Force CPU inference } # Model download settings MODEL_DOWNLOAD_CONFIG = { "cache_dir": "./models", "use_auth_token": os.getenv("HF_TOKEN", None), "resume_download": True, } # Recommended small GGUF models for demonstration RECOMMENDED_MODELS = [ { "name": "Osmosis-Structure-0.6B", "repo_id": "osmosis-ai/Osmosis-Structure-0.6B", "filename": "Osmosis-Structure-0.6B-BF16.gguf", "size": "~1.2GB", "description": "Osmosis AI structure-focused model for JSON generation" }, { "name": "TinyLlama-1.1B-Chat-v1.0-GGUF", "repo_id": "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", "filename": "tinyllama-1.1b-chat-v1.0.q4_k_m.gguf", "size": "~700MB", "description": "Small, fast model good for testing" } ] def get_model_config(): """Get model configuration optimized for HF Spaces""" return MODEL_CONFIG.copy() def get_generation_config(): """Get generation configuration""" return GENERATION_CONFIG.copy() def get_recommended_model(): """Get the recommended model for this space""" return RECOMMENDED_MODELS[0] # Return TinyLlama as default