"""Gradio streaming chat UI for a local quantized WizardCoder coding assistant."""
import gradio as gr
from ctransformers import AutoModelForCausalLM
import time

# Load the GGUF-quantized model once at startup.  With ``stream=True`` in the
# config, calling ``llm(prompt)`` returns a token generator rather than the
# full completion string.
llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/WizardCoder-Python-7B-V1.0-GGUF",
    model_file="wizardcoder-python-7b-v1.0.Q4_K_M.gguf",
    model_type="llama",
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.9,
    stream=True,
)


def generate_response(message, history):
    """Stream the model's reply into the chat history, token by token.

    Parameters
    ----------
    message : str
        The user's new message.
    history : list[list[str]]
        Prior turns as ``[user_text, bot_text]`` pairs (gradio Chatbot format).

    Yields
    ------
    list[list[str]]
        The history with the last bot message growing as tokens arrive;
        gradio re-renders the Chatbot on each yield.
    """
    # Rebuild the conversation as a role-tagged prompt.  The original code
    # emitted empty role labels (": {user}"), giving the model no way to
    # distinguish speakers.  NOTE(review): WizardCoder's official template is
    # the Alpaca "### Instruction:/### Response:" form — confirm which format
    # this deployment was tuned against.
    prompt = ""
    for user_turn, bot_turn in history:
        prompt += f"USER: {user_turn}\nASSISTANT: {bot_turn}\n"
    prompt += f"USER: {message}\nASSISTANT:"

    # Append a placeholder bot message that we fill in incrementally.
    history.append([message, ""])
    response = ""
    for chunk in llm(prompt):
        response += chunk
        history[-1][1] = response
        # Tiny pause smooths the UI update cadence; negligible added latency.
        time.sleep(0.01)
        yield history


with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox(placeholder="Ask coding questions...", label="Your Message")
    clear = gr.Button("Clear")

    msg.submit(generate_response, [msg, chatbot], chatbot)
    clear.click(lambda: [], None, chatbot)

# Generator (streaming) event handlers require the queue; without it gradio
# raises on the first streamed response.
demo.queue()

if __name__ == "__main__":
    demo.launch()