import os

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Get the HF token from the environment
hf_token = os.getenv("HUGGINGFACE_TOKEN")

# Your fine-tuned model
model_id = "alphaoumardev/Llama3-8B-noryu-instruct"

# Authenticate with the token when loading the tokenizer/model
# (use_auth_token is deprecated in recent transformers; token= is the current argument)
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_id, token=hf_token)
model.eval()

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


def chat(user_input, history):
    # history is the Gradio State list; avoid a mutable default argument
    history = history or []
    history.append({"role": "user", "content": user_input})

    # Format the prompt from the conversation so far
    prompt = ""
    for turn in history:
        prompt += f"{turn['role']}: {turn['content']}\n"
    prompt += "assistant:"

    # Tokenize and generate
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Keep only the text generated after the final "assistant:" marker
    assistant_reply = output_text.split("assistant:")[-1].strip()
    history.append({"role": "assistant", "content": assistant_reply})

    # Gradio's Chatbot expects a list of (user, assistant) tuples for display
    chat_history = [
        (turn["content"], history[i + 1]["content"])
        for i, turn in enumerate(history[:-1])
        if turn["role"] == "user"
    ]
    return chat_history, history


# Gradio Blocks UI
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    state = gr.State([])  # memory of the conversation
    txt = gr.Textbox(show_label=False, placeholder="Type your message...")
    txt.submit(chat, [txt, state], [chatbot, state])

demo.launch()
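
# Optional alternative, shown only as a sketch and not wired into the app above:
# Llama 3 Instruct models are trained with a specific chat template, so building
# the prompt with the tokenizer's built-in template may yield better replies than
# the plain "role: content" format used in chat(). This assumes the fine-tuned
# model kept the base Llama 3 chat template in its tokenizer config.
def build_prompt_with_template(history):
    # history is a list of {"role": ..., "content": ...} dicts, as in chat()
    return tokenizer.apply_chat_template(
        history,
        tokenize=False,
        add_generation_prompt=True,  # append the assistant header so generation continues as the assistant
    )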