from transformers import AutoTokenizer, AutoModelForCausalLM

# Load CPU-optimized model
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")


def generate_answer(context, question, max_new_tokens=100):
    """Generate an answer with CPU-friendly generation settings."""
    # Create a concise prompt, truncating the context to keep the input short
    prompt = f"""Based on the context, answer the question conversationally.

Context: {context[:1000]}

Question: {question}

Answer:"""

    # Tokenize with truncation so the prompt fits the model's context window
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=512,
        truncation=True
    )

    # Generate with CPU-optimized settings
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=max_new_tokens,
        num_beams=1,          # Single-beam decoding is faster than beam search on CPU
        do_sample=True,       # Sampling gives more natural responses
        temperature=0.7,      # Balance creativity and focus
        top_k=40,             # Restrict sampling to the 40 most likely tokens
        top_p=0.9,            # Nucleus sampling
        pad_token_id=tokenizer.eos_token_id,
    )

    # Extract only the newly generated text after the "Answer:" marker
    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return full_text.split("Answer:")[-1].strip()
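
# Usage sketch: a minimal call to generate_answer. Assumes the context string
# would normally come from your own retrieval step; the placeholder text and
# question below are illustrative only and not part of the original snippet.
if __name__ == "__main__":
    sample_context = (
        "The Eiffel Tower is a wrought-iron lattice tower in Paris, France. "
        "It was completed in 1889 and stands about 330 metres tall."
    )
    answer = generate_answer(sample_context, "How tall is the Eiffel Tower?")
    print(answer)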