from langchain_community.llms import HuggingFaceEndpoint, HuggingFacePipeline
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
import sys
import os
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Add project root to path for imports
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from app.config import LLM_MODEL, EMBEDDING_MODEL, DEFAULT_TEMPERATURE, MAX_TOKENS

def get_llm():
    """Initialize and return the language model."""
    # Set up cache directories with proper permissions
    cache_dir = "/app/models"
    if not os.path.exists(cache_dir):
        try:
            os.makedirs(cache_dir, exist_ok=True)
            os.chmod(cache_dir, 0o777)
        except Exception as e:
            logger.warning(f"Could not create cache directory: {e}")
            cache_dir = None
            
    # Use an API token from the environment when available; never rely on one
    # being set (e.g. in a Spaces environment)
    api_key = os.getenv("HUGGINGFACEHUB_API_TOKEN", "") or os.getenv("HF_API_KEY", "")
    logger.info(f"Using model: {LLM_MODEL}")
    
    # Always try local pipeline first (most reliable in Spaces)
    try:
        from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
        
        logger.info(f"Loading model {LLM_MODEL} as local pipeline")
        
        # Try multiple fallbacks with increasingly simpler models
        models_to_try = [
            LLM_MODEL,
            "distilgpt2",  # Smaller fallback
            "gpt2",        # Standard fallback
            "EleutherAI/gpt-neo-125M"  # Another option
        ]
        
        last_error = None
        
        for model_name in models_to_try:
            try:
                logger.info(f"Attempting to load model: {model_name}")
                
                # Try with explicit loading first
                try:
                    # Keep trust_remote_code=False so no repo-supplied code is executed
                    tokenizer = AutoTokenizer.from_pretrained(
                        model_name,
                        token=api_key or None,
                        trust_remote_code=False,
                        cache_dir=cache_dir
                    )
                    model = AutoModelForCausalLM.from_pretrained(
                        model_name,
                        token=api_key or None,
                        trust_remote_code=False,
                        low_cpu_mem_usage=True,  # Help with memory issues
                        cache_dir=cache_dir
                    )

                    # Create pipeline with the loaded components
                    pipe = pipeline(
                        "text-generation",
                        model=model,
                        tokenizer=tokenizer,
                        max_length=MAX_TOKENS,
                        do_sample=True,  # Required for temperature to take effect
                        temperature=DEFAULT_TEMPERATURE,
                        device=-1  # Use CPU
                    )
                    
                    logger.info(f"Successfully loaded model: {model_name}")
                    return HuggingFacePipeline(pipeline=pipe)
                except Exception as e:
                    logger.warning(f"Error loading {model_name} with explicit model/tokenizer: {e}")
                    last_error = e
                    
                    # Fall back to letting pipeline() load the model itself
                    pipe = pipeline(
                        "text-generation",
                        model=model_name,
                        max_length=MAX_TOKENS,
                        do_sample=True,  # Required for temperature to take effect
                        temperature=DEFAULT_TEMPERATURE,
                        token=api_key or None,
                        device=-1,  # Use CPU
                        model_kwargs={"cache_dir": cache_dir}
                    )
                    
                    logger.info(f"Successfully loaded model: {model_name} via direct pipeline")
                    return HuggingFacePipeline(pipeline=pipe)
                
            except Exception as e:
                logger.warning(f"Error loading model {model_name}: {e}")
                last_error = e
                # Continue to the next model
                continue
        
        # If we get here, all models failed
        logger.error(f"All models failed to load. Last error: {last_error}")
        raise last_error
            
    except Exception as e:
        logger.warning(f"Error creating local pipeline: {e}")
        
        # Try the HuggingFaceEndpoint as fallback
        try:
            logger.info("Attempting to use HuggingFaceEndpoint")
            return HuggingFaceEndpoint(
                repo_id="gpt2",
                max_new_tokens=MAX_TOKENS,
                temperature=DEFAULT_TEMPERATURE,
                huggingfacehub_api_token=api_key or None
            )
        except Exception as endpoint_error:
            logger.warning(f"HuggingFaceEndpoint failed: {endpoint_error}")
        
        # Last resort - mock LLM for fallback
        from langchain_community.llms.fake import FakeListLLM
        logger.warning("Using mock LLM as fallback")
        return FakeListLLM(
            responses=[
                "I'm running in fallback mode due to model loading issues. I have limited capabilities right now.",
                "I can't access the language model currently. Please check the Space logs for more information.",
                "I'm operating with a simplified model. For better performance, try running this app locally with proper models configured."
            ]
        )

def get_embeddings():
    """Initialize and return the embeddings model."""
    # Set up cache directories with proper permissions
    cache_dir = "/app/models"
    if not os.path.exists(cache_dir):
        try:
            os.makedirs(cache_dir, exist_ok=True)
            os.chmod(cache_dir, 0o777)
        except Exception as e:
            logger.warning(f"Could not create cache directory: {e}")
            cache_dir = None
    
    # Try multiple models with fallbacks
    embedding_models_to_try = [
        EMBEDDING_MODEL,
        "sentence-transformers/all-MiniLM-L6-v2",  # Standard model
        "sentence-transformers/paraphrase-MiniLM-L3-v2",  # Smaller model
        "sentence-transformers/paraphrase-albert-small-v2"  # Even smaller model
    ]
    
    api_key = os.getenv("HUGGINGFACEHUB_API_TOKEN", "") or os.getenv("HF_API_KEY", "")
    
    for model_name in embedding_models_to_try:
        # Try to use local embeddings
        try:
            logger.info(f"Loading embeddings model: {model_name}")
            return HuggingFaceEmbeddings(
                model_name=model_name,
                cache_folder=cache_dir,
                encode_kwargs={"normalize_embeddings": True},
                model_kwargs={"device": "cpu"}  # Ensure using CPU
            )
        except Exception as e:
            logger.warning(f"Error initializing embeddings with {model_name}: {e}")
            # Continue to the next model
    
    # If all models fail, try with direct transformers access
    try:
        from sentence_transformers import SentenceTransformer
        logger.info("Loading embeddings with SentenceTransformer directly")
        model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu", cache_folder=cache_dir)
        
        # Minimal adapter mimicking LangChain's Embeddings interface
        # (embed_documents / embed_query)
        class DirectEmbeddings:
            def embed_documents(self, texts):
                return model.encode(texts, normalize_embeddings=True).tolist()
            
            def embed_query(self, text):
                return model.encode(text, normalize_embeddings=True).tolist()
        
        return DirectEmbeddings()
    except Exception as e:
        logger.warning(f"Error with direct SentenceTransformer: {e}")
    
    # Create mock embeddings as last resort
    from langchain_community.embeddings import FakeEmbeddings
    logger.warning("Using mock embeddings as fallback")
    return FakeEmbeddings(size=384)  # 384 matches all-MiniLM-L6-v2's embedding dimension

def get_chat_model():
    """
    Create a chat-like interface using a regular LLM.
    This is necessary because many free HF models don't have chat interfaces.
    """
    llm = get_llm()
    
    # Create a chat-like prompt template
    chat_template = """
    Context: {context}
    
    Chat History:
    {chat_history}
    
    User: {question}
    AI Assistant:
    """
    
    prompt = PromptTemplate(
        input_variables=["context", "chat_history", "question"],
        template=chat_template
    )
    
    # Create a chain
    return LLMChain(llm=llm, prompt=prompt)
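
# A minimal usage sketch (assumption: run as a script from the project root so
# the app.config import above resolves). It smoke-tests the three factories;
# the sample strings are illustrative placeholders, and the first run may
# download model weights.
if __name__ == "__main__":
    embeddings = get_embeddings()
    vector = embeddings.embed_query("hello world")
    print(f"Embedding dimension: {len(vector)}")

    chain = get_chat_model()
    # LLMChain.invoke returns the inputs plus a "text" key holding the completion
    result = chain.invoke({
        "context": "The user is asking about this demo application.",
        "chat_history": "",
        "question": "What can you help me with?",
    })
    print(result["text"])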