# app.py: Gradio chat demo for TinyLlama-1.1B-Chat-v1.0 (4-bit quantized)
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
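# Note: 4-bit loading requires the bitsandbytes package and, in most setups,
# a CUDA-capable GPU; on CPU-only hardware, drop quantization_config below.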
# Configure 4-bit quantization
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)
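# NF4 stores the weights in 4 bits while matmuls run in fp16; double
# quantization also compresses the quantization constants themselves,
# trimming additional memory on top of the 4-bit weights.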
# Load the tokenizer and the quantized model
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto",
    low_cpu_mem_usage=True,
)
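# Rebuild the full conversation on every turn: gr.ChatInterface passes the
# history as (user, assistant) tuples, which we map onto chat-template roles.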
def chat(message, history):
    messages = [{"role": "system", "content": "You are a friendly Chatbot."}]
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
    # Apply the model's chat template and move the input ids to the model's device
    inputs = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    outputs = model.generate(
        inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Decode only the newly generated tokens, not the prompt that was fed in
    response = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True)
    return response
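# Wire the chat function into a minimal Gradio chat UI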
chatbot = gr.ChatInterface(
    fn=chat,
    title="TinyLlama 1.1B Chatbot",
    description="Chat with TinyLlama-1.1B-Chat-v1.0 (4-bit quantized)",
)
if __name__ == "__main__":
    chatbot.launch()