Spaces: Runtime error
import os

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Read the HF token from the environment; it must be configured as a secret
# in the Space settings, otherwise hf_token is None and gated downloads fail
hf_token = os.getenv("HUGGINGFACE_TOKEN")
# Your fine-tuned model
model_id = "alphaoumardev/Llama3-8B-noryu-instruct"

# Authenticate when loading the tokenizer and model; `use_auth_token` is
# deprecated in recent transformers releases in favor of `token`
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_id, token=hf_token)
model.eval()
# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
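# Note: an 8B-parameter model loaded in default fp32 precision needs roughly
# 32 GB of RAM, which exceeds the free CPU Space tier and is a common cause of
# this kind of runtime error. A hedged alternative, assuming GPU hardware and
# that the accelerate package is installed, loads in half precision instead:
#
#   model = AutoModelForCausalLM.from_pretrained(
#       model_id, token=hf_token,
#       torch_dtype=torch.float16, device_map="auto",
#   )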
def chat(user_input, history=None):
    # Avoid a mutable default argument; Gradio passes the state explicitly
    if history is None:
        history = []
    history.append({"role": "user", "content": user_input})
    # Format the prompt as plain role-prefixed turns
    prompt = ""
    for turn in history:
        prompt += f"{turn['role']}: {turn['content']}\n"
    prompt += "assistant:"
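    # Llama 3 instruct checkpoints are trained on a specific chat template, so
    # plain "role: content" prompts often degrade output quality. Assuming the
    # tokenizer ships a chat template (transformers >= 4.34), a sketch using it:
    #
    #   prompt = tokenizer.apply_chat_template(
    #       history, tokenize=False, add_generation_prompt=True
    #   )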
    # Tokenize and generate
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )
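    # Assumption: Llama 3 instruct models end assistant turns with <|eot_id|>
    # rather than the plain EOS token, so generation may run past the answer.
    # A hedged fix is passing that id as an extra stop token:
    #
    #   eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
    #   outputs = model.generate(**inputs, max_new_tokens=200,
    #                            eos_token_id=[tokenizer.eos_token_id, eot_id])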
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    assistant_reply = output_text.split("assistant:")[-1].strip()
    history.append({"role": "assistant", "content": assistant_reply})
    # gr.Chatbot (classic tuple mode) expects a list of (user, assistant) pairs
    chat_history = [
        (h["content"], history[i + 1]["content"])
        for i, h in enumerate(history[:-1])
        if h["role"] == "user"
    ]
    return chat_history, history
# Gradio Blocks UI
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    state = gr.State([])  # memory of the conversation
    txt = gr.Textbox(show_label=False, placeholder="Type your message...")
    txt.submit(chat, [txt, state], [chatbot, state])

demo.launch()
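The tuple conversion inside chat exists only because the classic gr.Chatbot
renders (user, assistant) pairs. Recent Gradio releases (roughly 4.44 and
later) also accept the {"role": ..., "content": ...} dict format directly,
which would let the function return the history unchanged. A minimal sketch,
assuming such a Gradio version is pinned in requirements.txt:

with gr.Blocks() as demo:
    # type="messages" renders openai-style role/content dicts directly,
    # so no tuple conversion is needed and chat can return history twice
    chatbot = gr.Chatbot(type="messages")
    state = gr.State([])
    txt = gr.Textbox(show_label=False, placeholder="Type your message...")
    txt.submit(chat, [txt, state], [chatbot, state])

demo.launch()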