"""Free CPU chatbot: OpenHermes-2.5-Mistral-7B (GGUF) via llama.cpp + Gradio.

Downloads a quantized model from the Hugging Face Hub, loads it with
llama-cpp-python for CPU inference, and serves a Gradio chat UI.
"""

from llama_cpp import Llama
from huggingface_hub import hf_hub_download

import gradio as gr

# Download the quantized model; hf_hub_download caches locally, so
# repeated runs reuse the existing file instead of re-downloading.
model_path = hf_hub_download(
    repo_id="TheBloke/OpenHermes-2.5-Mistral-7B-GGUF",
    filename="openhermes-2.5-mistral-7b.Q4_K_M.gguf",
)

# Load the model (CPU inference, 2048-token context window).
llm = Llama(model_path=model_path, n_ctx=2048)

# Number of most-recent (user, assistant) turns kept in the prompt so the
# formatted history fits comfortably inside the 2048-token context.
MAX_HISTORY_TURNS = 4


def chatbot_response(message, history):
    """Generate one assistant reply for the Gradio ChatInterface callback.

    Args:
        message: Latest user message (str).
        history: Prior conversation as [user, assistant] pairs
            (Gradio's tuple-style history).

    Returns:
        The model's reply text, or an ``"Error: ..."`` string if
        generation fails.
    """
    # FIX: OpenHermes-2.5 is trained on the ChatML prompt template
    # (<|im_start|>role\n...<|im_end|>), not <|user|>/<|end|> tags.
    # The previous stop token "<|end|>" is never emitted by this model,
    # so generation only stopped at max_tokens and quality suffered
    # from the mismatched template.
    parts = []
    for user_turn, bot_turn in history[-MAX_HISTORY_TURNS:]:
        parts.append(f"<|im_start|>user\n{user_turn}<|im_end|>\n")
        parts.append(f"<|im_start|>assistant\n{bot_turn}<|im_end|>\n")
    parts.append(f"<|im_start|>user\n{message}<|im_end|>\n")
    parts.append("<|im_start|>assistant\n")
    prompt = "".join(parts)

    try:
        output = llm(
            prompt,
            max_tokens=256,
            stop=["<|im_end|>"],
            temperature=0.7,
        )
        return output["choices"][0]["text"].strip()
    except Exception as e:
        # Boundary handler: surface the failure in the chat UI rather
        # than crashing the Gradio worker.
        return f"Error: {e}"


demo = gr.ChatInterface(
    fn=chatbot_response,
    title="🧠 Free CPU Chatbot (OpenHermes-2.5)",
    description="A lightweight, high-quality chatbot that runs on CPU using llama.cpp",
    theme="soft",
    examples=["What's your name?", "Tell me a joke", "What is Python used for?"],
)

if __name__ == "__main__":
    demo.launch(share=True)