from langchain_community.llms import HuggingFaceEndpoint, HuggingFaceHub, HuggingFacePipeline
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
import sys
import os
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Add project root to path for imports
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from app.config import HF_API_KEY, LLM_MODEL, EMBEDDING_MODEL, DEFAULT_TEMPERATURE, MAX_TOKENS


def get_llm():
    """Initialize and return the language model."""
    # Set up cache directory with proper permissions
    cache_dir = "/app/models"
    if not os.path.exists(cache_dir):
        try:
            os.makedirs(cache_dir, exist_ok=True)
            os.chmod(cache_dir, 0o777)
        except Exception as e:
            logger.warning(f"Could not create cache directory: {e}")
            cache_dir = None

    # Read the API key from the environment if one is set; everything below must
    # still work without a token (e.g. in a Spaces environment with none configured)
    api_key = os.getenv("HUGGINGFACEHUB_API_TOKEN", "") or os.getenv("HF_API_KEY", "")

    logger.info(f"Using model: {LLM_MODEL}")

    # Always try a local pipeline first (most reliable in Spaces)
    try:
        from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
        logger.info(f"Loading model {LLM_MODEL} as local pipeline")

        # Try multiple fallbacks with increasingly simpler models
        models_to_try = [
            LLM_MODEL,
            "distilgpt2",               # Smaller fallback
            "gpt2",                     # Standard fallback
            "EleutherAI/gpt-neo-125M",  # Another option
        ]

        last_error = None
        for model_name in models_to_try:
            try:
                logger.info(f"Attempting to load model: {model_name}")

                # Try explicit tokenizer/model loading first
                try:
                    # Keep trust_remote_code disabled to avoid running arbitrary repo code
                    tokenizer = AutoTokenizer.from_pretrained(
                        model_name,
                        use_auth_token=api_key if api_key else None,
                        trust_remote_code=False
                    )
                    model = AutoModelForCausalLM.from_pretrained(
                        model_name,
                        use_auth_token=api_key if api_key else None,
                        trust_remote_code=False,
                        low_cpu_mem_usage=True  # Help with memory issues
                    )

                    # Create pipeline with the loaded components
                    pipe = pipeline(
                        "text-generation",
                        model=model,
                        tokenizer=tokenizer,
                        max_length=MAX_TOKENS,
                        temperature=DEFAULT_TEMPERATURE,
                        device=-1  # Use CPU
                    )
                    logger.info(f"Successfully loaded model: {model_name}")
                    return HuggingFacePipeline(pipeline=pipe)
                except Exception as e:
                    logger.warning(f"Error loading {model_name} with explicit model/tokenizer: {e}")
                    last_error = e

                    # Fall back to letting pipeline() resolve the model itself
                    pipe = pipeline(
                        "text-generation",
                        model=model_name,
                        max_length=MAX_TOKENS,
                        temperature=DEFAULT_TEMPERATURE,
                        use_auth_token=api_key if api_key else None,
                        device=-1  # Use CPU
                    )
                    logger.info(f"Successfully loaded model: {model_name} via direct pipeline")
                    return HuggingFacePipeline(pipeline=pipe)
            except Exception as e:
                logger.warning(f"Error loading model {model_name}: {e}")
                last_error = e
                # Continue to the next model
                continue

        # If we get here, all models failed
        logger.error(f"All models failed to load. Last error: {last_error}")
        raise last_error
    except Exception as e:
        logger.warning(f"Error creating local pipeline: {e}")

        # Try the HuggingFaceEndpoint as fallback
        try:
            logger.info("Attempting to use HuggingFaceEndpoint")
            return HuggingFaceEndpoint(
                repo_id="gpt2",
                max_length=MAX_TOKENS,
                temperature=DEFAULT_TEMPERATURE,
                huggingfacehub_api_token=api_key
            )
        except Exception as endpoint_error:
            logger.warning(f"HuggingFaceEndpoint failed: {endpoint_error}")

            # Last resort - mock LLM so the app keeps responding
            from langchain_community.llms.fake import FakeListLLM
            logger.warning("Using mock LLM as fallback")
            return FakeListLLM(
                responses=[
                    "I'm running in fallback mode due to model loading issues. I have limited capabilities right now.",
                    "I can't access the language model currently. Please check the Space logs for more information.",
                    "I'm operating with a simplified model. For better performance, try running this app locally with proper models configured."
                ]
            )
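
# --- Illustrative usage sketch (not part of the original module) ---------------
# A minimal way to exercise whatever get_llm() returns. Both HuggingFacePipeline
# and the FakeListLLM fallback implement LangChain's LLM/Runnable interface, so
# .invoke() accepts a plain string and returns the generated text. The helper
# name and prompt below are hypothetical examples, not something the app relies on.
def _llm_smoke_test(prompt: str = "Say hello in one short sentence.") -> str:
    """Hypothetical helper: run a single prompt through the configured LLM."""
    llm = get_llm()
    return llm.invoke(prompt)
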
def get_embeddings():
    """Initialize and return the embeddings model."""
    # Set up cache directory with proper permissions
    cache_dir = "/app/models"
    if not os.path.exists(cache_dir):
        try:
            os.makedirs(cache_dir, exist_ok=True)
            os.chmod(cache_dir, 0o777)
        except Exception as e:
            logger.warning(f"Could not create cache directory: {e}")
            cache_dir = None

    # Try multiple models with fallbacks
    embedding_models_to_try = [
        EMBEDDING_MODEL,
        "sentence-transformers/all-MiniLM-L6-v2",           # Standard model
        "sentence-transformers/paraphrase-MiniLM-L3-v2",    # Smaller model
        "sentence-transformers/paraphrase-albert-small-v2"  # Even smaller model
    ]

    api_key = os.getenv("HUGGINGFACEHUB_API_TOKEN", "") or os.getenv("HF_API_KEY", "")

    for model_name in embedding_models_to_try:
        # Try to use local embeddings
        try:
            logger.info(f"Loading embeddings model: {model_name}")
            return HuggingFaceEmbeddings(
                model_name=model_name,
                cache_folder=cache_dir,
                encode_kwargs={"normalize_embeddings": True},
                model_kwargs={"device": "cpu"}  # Ensure using CPU
            )
        except Exception as e:
            logger.warning(f"Error initializing embeddings with {model_name}: {e}")
            # Continue to the next model

    # If all models fail, try sentence-transformers directly
    try:
        from sentence_transformers import SentenceTransformer
        logger.info("Loading embeddings with SentenceTransformer directly")
        model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

        # Minimal wrapper exposing the embed_documents/embed_query interface LangChain expects
        class DirectEmbeddings:
            def embed_documents(self, texts):
                return model.encode(texts, normalize_embeddings=True).tolist()

            def embed_query(self, text):
                return model.encode(text, normalize_embeddings=True).tolist()

        return DirectEmbeddings()
    except Exception as e:
        logger.warning(f"Error with direct SentenceTransformer: {e}")

        # Create mock embeddings as last resort
        from langchain_community.embeddings.fake import FakeEmbeddings
        logger.warning("Using mock embeddings as fallback")
        return FakeEmbeddings(size=384)  # Standard size for small embedding models


def get_chat_model():
    """
    Create a chat-like interface using a regular LLM.
    This is necessary because many free HF models don't have chat interfaces.
    """
    llm = get_llm()

    # Create a chat-like prompt template
    chat_template = """
Context: {context}

Chat History: {chat_history}

User: {question}
AI Assistant: """

    prompt = PromptTemplate(
        input_variables=["context", "chat_history", "question"],
        template=chat_template
    )

    # Create a chain
    return LLMChain(llm=llm, prompt=prompt)
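
# --- Illustrative usage sketch (not part of the original module) ---------------
# Shows how the chain returned by get_chat_model() can be called. LLMChain.invoke()
# takes a dict keyed by the prompt's input variables and returns that dict plus a
# "text" key holding the model output. The context/history strings below are
# placeholder data; run this file directly for a quick end-to-end check.
if __name__ == "__main__":
    chain = get_chat_model()
    result = chain.invoke({
        "context": "This is a demo application running on Hugging Face Spaces.",
        "chat_history": "",
        "question": "What can you help me with?",
    })
    print(result.get("text", result))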