from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import gradio as gr

# Download the GGUF model into the local Hugging Face cache and get its path
model_path = hf_hub_download(
    repo_id="TheBloke/OpenHermes-2.5-Mistral-7B-GGUF",
    filename="openhermes-2.5-mistral-7b.Q4_K_M.gguf"
)

# Load the model with a 2048-token context window
llm = Llama(model_path=model_path, n_ctx=2048)
def chatbot_response(message, history):
    # OpenHermes-2.5 was trained on the ChatML prompt format, so each turn is
    # wrapped in <|im_start|>role ... <|im_end|> markers. With ChatInterface's
    # default settings, history arrives as (user, assistant) tuples.
    prompt = ""
    for user, bot in history[-4:]:  # keep the last 4 turns to fit the 2048-token context
        prompt += f"<|im_start|>user\n{user}<|im_end|>\n<|im_start|>assistant\n{bot}<|im_end|>\n"
    prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
    try:
        output = llm(prompt, max_tokens=256, stop=["<|im_end|>"], temperature=0.7)
        return output["choices"][0]["text"].strip()
    except Exception as e:
        return f"Error: {e}"
demo = gr.ChatInterface(
    fn=chatbot_response,
    title="🧠 Free CPU Chatbot (OpenHermes-2.5)",
    description="A lightweight, high-quality chatbot that runs on CPU using llama.cpp",
    theme="soft",
    examples=["What's your name?", "Tell me a joke", "What is Python used for?"]
)
if __name__ == "__main__":
    demo.launch(share=True)  # share=True gives a public link when run locally; Spaces ignores it
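
To deploy this as a Hugging Face Space (or run it locally), the three imports map to three pip packages. A minimal requirements.txt sketch follows; the package names are the real PyPI names, but the unpinned versions are an assumption, so pin whatever versions work for you:

    llama-cpp-python
    huggingface_hub
    gradio

Note that installing llama-cpp-python from PyPI compiles llama.cpp during the build, which can take several minutes on the free CPU Spaces hardware, and the first request after startup is slower because the model file has to be downloaded and loaded.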