import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Download the GGUF weights from the Hugging Face Hub (cached locally after
# the first run, so restarts are fast).
model_path = hf_hub_download(
    repo_id="Phonepadith/aidc-llm-laos-10k-gemma-3-12b-it-v2",
    filename="aidc-llm-laos-10k-gemma-3-12b-it-v2-Q8.gguf",
)

# Load the model once at startup; all requests share this instance.
llm = Llama(
    model_path=model_path,
    n_gpu_layers=40,  # adjust based on available GPU VRAM
    n_ctx=4096,
    n_threads=8,
)


def _history_pairs(history):
    """Normalize Gradio chat history into (user, assistant) string pairs.

    Gradio passes history either as ``[(user, bot), ...]`` tuples (legacy
    'tuples' format) or as ``[{"role": ..., "content": ...}, ...]`` dicts
    ('messages' format, the default in Gradio 5+). The original code only
    handled tuples and crashed on the dict format; handle both so the app
    works regardless of the installed Gradio version.
    """
    pairs = []
    pending_user = None
    for turn in history:
        if isinstance(turn, dict):
            role = turn.get("role")
            content = turn.get("content", "")
            if role == "user":
                pending_user = content
            elif role == "assistant" and pending_user is not None:
                pairs.append((pending_user, content))
                pending_user = None
        else:
            user, bot = turn
            pairs.append((user, bot))
    return pairs


def _build_prompt(message, history):
    """Render the conversation as a plain ``User:/Assistant:`` transcript."""
    parts = [f"User: {u}\nAssistant: {b}\n" for u, b in _history_pairs(history)]
    parts.append(f"User: {message}\nAssistant:")
    return "".join(parts)


def chat(message, history):
    """Generate one assistant reply for the Gradio ChatInterface.

    Parameters
    ----------
    message : str
        The latest user message.
    history : list
        Prior turns, in either Gradio history format (see ``_history_pairs``).

    Returns
    -------
    str
        The model's reply with surrounding whitespace stripped.
    """
    output = llm(
        _build_prompt(message, history),
        max_tokens=512,
        temperature=0.7,
        # Stop sequences keep the model from continuing the dialogue on
        # behalf of both sides.
        stop=["User:", "Assistant:"],
    )
    return output["choices"][0]["text"].strip()


demo = gr.ChatInterface(chat, title="Laos GGUF Chatbot")

if __name__ == "__main__":
    demo.launch()