""" TinyGPT2 Model Wrapper for easy integration (CPU-friendly) """ from transformers import AutoTokenizer, AutoModelForCausalLM import torch import os HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN") class TinyGPT2Model: """ Wrapper for sshleifer/tiny-gpt2 model with caching and optimization Suitable for CPU-only Hugging Face Spaces """ _instance = None _model = None _tokenizer = None def __new__(cls): if cls._instance is None: cls._instance = super().__new__(cls) return cls._instance def __init__(self): if TinyGPT2Model._model is None: self._initialize_model() def _initialize_model(self): """Initialize Tiny-GPT2 model""" print("Loading TinyGPT2 model...") model_id = "sshleifer/tiny-gpt2" # Load tokenizer (no need for token argument, model is public) TinyGPT2Model._tokenizer = AutoTokenizer.from_pretrained(model_id,token=HUGGINGFACE_TOKEN) # Load model (no quantization, pure CPU) TinyGPT2Model._model = AutoModelForCausalLM.from_pretrained( model_id,token=HUGGINGFACE_TOKEN, torch_dtype=torch.float32 # Safe for CPU only ) print("TinyGPT2 model loaded successfully!") def generate( self, prompt: str, max_length: int = 64, temperature: float = 0.7, top_p: float = 0.95 ) -> str: """Generate response from TinyGPT2""" # For TinyGPT2, no special prompt formatting needed formatted_prompt = prompt # Tokenize inputs = TinyGPT2Model._tokenizer( formatted_prompt, return_tensors="pt", truncation=True, max_length=256 ) # Move to CPU (optional, for explicitness) inputs = {k: v.cpu() for k, v in inputs.items()} # Generate on CPU with torch.no_grad(): outputs = TinyGPT2Model._model.generate( **inputs, max_new_tokens=max_length, temperature=temperature, top_p=top_p, do_sample=True, pad_token_id=TinyGPT2Model._tokenizer.eos_token_id ) # Decode only the newly generated tokens (after the prompt) response = TinyGPT2Model._tokenizer.decode( outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True ) return response.strip() def generate_embedding(self, text: str) -> torch.Tensor: """Generate embeddings for text using last hidden state""" inputs = TinyGPT2Model._tokenizer( text, return_tensors="pt", truncation=True, max_length=256 ) inputs = {k: v.cpu() for k, v in inputs.items()} with torch.no_grad(): outputs = TinyGPT2Model._model(**inputs, output_hidden_states=True) embeddings = outputs.hidden_states[-1].mean(dim=1) return embeddings