from langchain_community.llms import HuggingFaceEndpoint, HuggingFacePipeline
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
import sys
import os
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Add project root to path for imports
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from app.config import LLM_MODEL, EMBEDDING_MODEL, DEFAULT_TEMPERATURE, MAX_TOKENS

def get_llm():
    """Initialize and return the language model."""
    # Set up cache directories with proper permissions
    cache_dir = "/app/models"
    if not os.path.exists(cache_dir):
        try:
            os.makedirs(cache_dir, exist_ok=True)
            os.chmod(cache_dir, 0o777)
        except Exception as e:
            logger.warning(f"Could not create cache directory: {e}")
            cache_dir = None
            
    # Use an API token from the environment when available; never rely on one
    # being set (e.g. in a Spaces environment)
    api_key = os.getenv("HUGGINGFACEHUB_API_TOKEN", "") or os.getenv("HF_API_KEY", "")
    logger.info(f"Using model: {LLM_MODEL}")
    
    # Always try local pipeline first (most reliable in Spaces)
    try:
        from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
        
        logger.info(f"Loading model {LLM_MODEL} as local pipeline")
        
        # Try multiple fallbacks with increasingly simpler models
        models_to_try = [
            LLM_MODEL,
            "distilgpt2",  # Smaller fallback
            "gpt2",        # Standard fallback
            "EleutherAI/gpt-neo-125M"  # Another option
        ]
        
        last_error = None
        
        for model_name in models_to_try:
            try:
                logger.info(f"Attempting to load model: {model_name}")
                
                # Try with explicit loading first
                try:
                    # Keep trust_remote_code=False so no repo-supplied code is executed
                    tokenizer = AutoTokenizer.from_pretrained(
                        model_name,
                        token=api_key or None,
                        trust_remote_code=False,
                        cache_dir=cache_dir
                    )
                    model = AutoModelForCausalLM.from_pretrained(
                        model_name,
                        token=api_key or None,
                        trust_remote_code=False,
                        low_cpu_mem_usage=True,  # Help with memory issues
                        cache_dir=cache_dir
                    )

                    # Create pipeline with the loaded components
                    pipe = pipeline(
                        "text-generation",
                        model=model,
                        tokenizer=tokenizer,
                        max_length=MAX_TOKENS,
                        do_sample=True,  # Required for temperature to take effect
                        temperature=DEFAULT_TEMPERATURE,
                        device=-1  # Use CPU
                    )
                    
                    logger.info(f"Successfully loaded model: {model_name}")
                    return HuggingFacePipeline(pipeline=pipe)
                except Exception as e:
                    logger.warning(f"Error loading {model_name} with explicit model/tokenizer: {e}")
                    last_error = e
                    
                    # Fall back to letting pipeline() load the model itself
                    pipe = pipeline(
                        "text-generation",
                        model=model_name,
                        max_length=MAX_TOKENS,
                        do_sample=True,  # Required for temperature to take effect
                        temperature=DEFAULT_TEMPERATURE,
                        token=api_key or None,
                        device=-1,  # Use CPU
                        model_kwargs={"cache_dir": cache_dir}
                    )
                    
                    logger.info(f"Successfully loaded model: {model_name} via direct pipeline")
                    return HuggingFacePipeline(pipeline=pipe)
                
            except Exception as e:
                logger.warning(f"Error loading model {model_name}: {e}")
                last_error = e
                # Continue to the next model
                continue
        
        # If we get here, all models failed
        logger.error(f"All models failed to load. Last error: {last_error}")
        raise last_error
            
    except Exception as e:
        logger.warning(f"Error creating local pipeline: {e}")
        
        # Try the HuggingFaceEndpoint as fallback
        try:
            logger.info("Attempting to use HuggingFaceEndpoint")
            return HuggingFaceEndpoint(
                repo_id="gpt2",
                max_new_tokens=MAX_TOKENS,
                temperature=DEFAULT_TEMPERATURE,
                huggingfacehub_api_token=api_key or None
            )
        except Exception as endpoint_error:
            logger.warning(f"HuggingFaceEndpoint failed: {endpoint_error}")
        
        # Last resort - mock LLM for fallback
        from langchain_community.llms.fake import FakeListLLM
        logger.warning("Using mock LLM as fallback")
        return FakeListLLM(
            responses=[
                "I'm running in fallback mode due to model loading issues. I have limited capabilities right now.",
                "I can't access the language model currently. Please check the Space logs for more information.",
                "I'm operating with a simplified model. For better performance, try running this app locally with proper models configured."
            ]
        )

def get_embeddings():
    """Initialize and return the embeddings model."""
    # Set up cache directories with proper permissions
    cache_dir = "/app/models"
    if not os.path.exists(cache_dir):
        try:
            os.makedirs(cache_dir, exist_ok=True)
            os.chmod(cache_dir, 0o777)
        except Exception as e:
            logger.warning(f"Could not create cache directory: {e}")
            cache_dir = None
    
    # Try multiple models with fallbacks
    embedding_models_to_try = [
        EMBEDDING_MODEL,
        "sentence-transformers/all-MiniLM-L6-v2",  # Standard model
        "sentence-transformers/paraphrase-MiniLM-L3-v2",  # Smaller model
        "sentence-transformers/paraphrase-albert-small-v2"  # Even smaller model
    ]
    
    api_key = os.getenv("HUGGINGFACEHUB_API_TOKEN", "") or os.getenv("HF_API_KEY", "")
    
    for model_name in embedding_models_to_try:
        # Try to use local embeddings
        try:
            logger.info(f"Loading embeddings model: {model_name}")
            return HuggingFaceEmbeddings(
                model_name=model_name,
                cache_folder=cache_dir,
                encode_kwargs={"normalize_embeddings": True},
                model_kwargs={"device": "cpu"}  # Ensure using CPU
            )
        except Exception as e:
            logger.warning(f"Error initializing embeddings with {model_name}: {e}")
            # Continue to the next model
    
    # If all models fail, try with direct transformers access
    try:
        from sentence_transformers import SentenceTransformer
        logger.info("Loading embeddings with SentenceTransformer directly")
        model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu", cache_folder=cache_dir)
        
        # Minimal adapter mimicking LangChain's Embeddings interface
        # (embed_documents / embed_query)
        class DirectEmbeddings:
            def embed_documents(self, texts):
                return model.encode(texts, normalize_embeddings=True).tolist()
            
            def embed_query(self, text):
                return model.encode(text, normalize_embeddings=True).tolist()
        
        return DirectEmbeddings()
    except Exception as e:
        logger.warning(f"Error with direct SentenceTransformer: {e}")
    
    # Create mock embeddings as last resort
    from langchain_community.embeddings import FakeEmbeddings
    logger.warning("Using mock embeddings as fallback")
    return FakeEmbeddings(size=384)  # 384 matches all-MiniLM-L6-v2's embedding dimension

def get_chat_model():
    """
    Create a chat-like interface using a regular LLM.
    This is necessary because many free HF models don't have chat interfaces.
    """
    llm = get_llm()
    
    # Create a chat-like prompt template
    chat_template = """
    Context: {context}
    
    Chat History:
    {chat_history}
    
    User: {question}
    AI Assistant:
    """
    
    prompt = PromptTemplate(
        input_variables=["context", "chat_history", "question"],
        template=chat_template
    )
    
    # Create a chain
    return LLMChain(llm=llm, prompt=prompt)
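
# A minimal usage sketch (assumption: run as a script from the project root so
# the app.config import above resolves). It smoke-tests the three factories;
# the sample strings are illustrative placeholders, and the first run may
# download model weights.
if __name__ == "__main__":
    embeddings = get_embeddings()
    vector = embeddings.embed_query("hello world")
    print(f"Embedding dimension: {len(vector)}")

    chain = get_chat_model()
    # LLMChain.invoke returns the inputs plus a "text" key holding the completion
    result = chain.invoke({
        "context": "The user is asking about this demo application.",
        "chat_history": "",
        "question": "What can you help me with?",
    })
    print(result["text"])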