"""Streaming GPT-2 XL chat demo for a CPU-only Hugging Face Space.

Downloads a GGUF quantization of GPT-2 XL, loads it with llama.cpp,
and serves a streaming gr.ChatInterface.
"""
import os

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# --- 1. Model Downloading ---
print("===== Downloading model... =====")
model_path = hf_hub_download(
    repo_id="RichardErkhov/openai-community_-_gpt2-xl-gguf",
    filename="gpt2-xl.Q6_K.gguf",
)
print(f"Model downloaded to: {model_path}")

# --- 2. Model Loading (optimized for a CPU-only HF Space) ---
print("===== Loading model... =====")
# Bug fix: os.cpu_count() can return None on some platforms; fall back to a
# sane default instead of passing None to llama.cpp.
n_threads = os.cpu_count() or 4
llm = Llama(
    model_path=model_path,
    n_ctx=2048,        # context window, in tokens
    n_threads=n_threads,
    n_gpu_layers=0,    # force pure-CPU inference (no GPU on this Space)
)
print(f"Model loaded for CPU execution with {n_threads} threads.")


# --- 3. Chat Function with Streaming ---
def chat(message, history):
    """Generate a streamed assistant reply for *message* given *history*.

    Parameters
    ----------
    message : str
        The latest user message.
    history : list[tuple[str, str]]
        Prior (user, assistant) turns as supplied by gr.ChatInterface.
        NOTE(review): this is the legacy tuple history format; recent Gradio
        versions default to ``type="messages"`` (list of dicts) — confirm
        against the pinned Gradio version.

    Yields
    ------
    str
        The partial assistant reply so far; Gradio re-renders the message
        with each yielded value, producing the streaming effect.
    """
    # Re-serialize prior turns into the "### User / ### Assistant" template
    # used by the prompt below.
    history_prompt = ""
    for user_msg, assistant_msg in history:
        history_prompt += (
            f"### User:\n{user_msg}\n\n### Assistant:\n{assistant_msg}\n\n"
        )

    # NOTE(review): the persona says "Dolphin 3.0" but the loaded model is
    # GPT-2 XL — the branding is misleading; confirm whether this is intended.
    full_prompt = f"""### System:
You are Dolphin 3.0, a helpful and friendly AI assistant.

{history_prompt}### User:
{message}

### Assistant:"""

    # Bug fix: the original stop list contained an empty string (""), which is
    # a useless stop sequence (likely a stripped special token). GPT-2's
    # end-of-text marker is "<|endoftext|>", so stop on that instead.
    stream = llm(
        full_prompt,
        max_tokens=1024,
        stop=["<|endoftext|>", "### User:", "### Assistant:"],
        stream=True,
    )

    partial_message = ""
    for output in stream:
        token = output["choices"][0]["text"]
        partial_message += token
        yield partial_message


# --- 4. The Enhanced Chatbot UI (MAXIMUM COMPATIBILITY) ---
# Custom button arguments are deliberately omitted so this works on older
# Gradio versions; Gradio adds the default 'Undo' and 'Clear' buttons itself.
iface = gr.ChatInterface(
    fn=chat,
    title="🐬 Dolphin 3.0 on Hugging Face Spaces",
    description="A sleek, streaming chat interface running on a CPU Space.",
    chatbot=gr.Chatbot(height=500),
    textbox=gr.Textbox(
        placeholder="Ask me something... I'm all yours.",
        container=False,
        scale=7,
    ),
    theme="soft",
    examples=[
        ["Hello!"],
        ["Write a short poem about the stars."],
        ["What is the capital of India?"],
    ],
    cache_examples=False,
)

if __name__ == "__main__":
    iface.launch()