"""Minimal Gradio chat UI that streams replies from a hosted Zephyr model."""

import gradio as gr
from huggingface_hub import InferenceClient

client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")


def respond(message, history, system_message, max_tokens, temperature, top_p):
    """Stream the assistant reply for *message*, yielding the growing text.

    Args:
        message: the new user message (str).
        history: prior turns — either openai-style dicts ({"role", "content"})
            as produced by gr.Chatbot(type="messages"), or legacy
            (user_msg, bot_msg) tuple pairs; both are accepted.
        system_message: system prompt prepended to the conversation.
        max_tokens / temperature / top_p: sampling parameters forwarded
            to the inference endpoint.

    Yields:
        The accumulated response text after each streamed token.
    """
    messages = [{"role": "system", "content": system_message}]

    for turn in history:
        if isinstance(turn, dict):
            # messages-format history (Chatbot type="messages"): forward as-is,
            # skipping empty placeholders.
            if turn.get("content"):
                messages.append({"role": turn["role"], "content": turn["content"]})
        else:
            # Legacy tuple-format history: (user_msg, bot_msg).
            user_msg, bot_msg = turn
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if bot_msg:
                messages.append({"role": "assistant", "content": bot_msg})

    messages.append({"role": "user", "content": message})

    response = ""
    # Use a distinct loop variable: the original shadowed the `message` parameter.
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content
        # The final streamed chunk can carry content=None; guard so we never
        # do `response += None`.
        if token:
            response += token
            yield response


with gr.Blocks() as demo:
    chatbot = gr.Chatbot(label="Chat", show_copy_button=True, type="messages")
    system_msg = gr.Textbox(value="You are a helpful assistant.", label="System Message")
    max_tokens = gr.Slider(1, 2048, value=512, label="Max Tokens")
    temperature = gr.Slider(0.1, 4.0, value=0.7, label="Temperature")
    top_p = gr.Slider(0.1, 1.0, value=0.95, label="Top-p")
    msg_input = gr.Textbox(label="Message", placeholder="Ask me anything...")

    def chat_fn(msg, history, system_msg, max_tokens, temperature, top_p):
        """Append the user turn, then stream the assistant turn into history.

        A type="messages" Chatbot expects the full message list as output —
        not a bare string — so each partial string from respond() is folded
        into the last assistant entry and the whole history is yielded.
        """
        history = list(history or [])
        history.append({"role": "user", "content": msg})
        history.append({"role": "assistant", "content": ""})
        # Pass the history *without* the two turns just appended; respond()
        # adds the user message itself.
        for partial in respond(
            msg, history[:-2], system_msg, max_tokens, temperature, top_p
        ):
            history[-1]["content"] = partial
            yield history

    msg_input.submit(
        chat_fn,
        [msg_input, chatbot, system_msg, max_tokens, temperature, top_p],
        chatbot,
    )

if __name__ == "__main__":
    demo.launch()