import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # Half precision on GPU; fall back to full precision on CPU.
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
)

# Note: TextStreamer prints generated tokens to the server console as they are
# produced; it does not stream them into the Gradio UI.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)


def generate_response(message, history, system_prompt, max_tokens, temperature, top_p):
    # Rebuild the full conversation: system prompt + prior turns + new message.
    # gr.ChatInterface passes `history` as a list of [user, assistant] pairs.
    messages = [{"role": "system", "content": system_prompt}]
    for user_msg, bot_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if bot_msg:
            messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    # add_generation_prompt=True appends the assistant header so the model
    # continues as the assistant instead of predicting another user turn.
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)

    outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=int(max_tokens),  # sliders deliver floats
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        streamer=streamer,
    )

    # Decode only the newly generated tokens. Slicing off the prompt tokens is
    # more robust than splitting the decoded text on the system prompt, which
    # breaks whenever the chat template reformats it.
    return tokenizer.decode(
        outputs[0][input_ids.shape[-1]:], skip_special_tokens=True
    ).strip()


chat = gr.ChatInterface(
    fn=generate_response,
    additional_inputs=[
        gr.Textbox(
            label="System Prompt",
            value=(
                "You are an empathetic and thoughtful mentor, specialized in "
                "supporting employees on their path to workplace well-being."
            ),
        ),
        gr.Slider(label="Max tokens", minimum=64, maximum=1024, value=512, step=1),
        gr.Slider(label="Temperature", minimum=0.1, maximum=1.5, value=0.7),
        gr.Slider(label="Top-p", minimum=0.1, maximum=1.0, value=0.95),
    ],
)

if __name__ == "__main__":
    chat.launch()
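
# --- Optional: token-by-token streaming in the browser ---
# A minimal sketch, assuming the `model` and `tokenizer` objects defined above.
# TextStreamer only echoes tokens to the console; to stream into the Gradio UI,
# run generate() in a worker thread with a TextIteratorStreamer and yield the
# growing reply. To use it, move this function above the gr.ChatInterface call
# and pass fn=generate_response_streaming instead of fn=generate_response.
from threading import Thread

from transformers import TextIteratorStreamer


def generate_response_streaming(message, history, system_prompt, max_tokens, temperature, top_p):
    messages = [{"role": "system", "content": system_prompt}]
    for user_msg, bot_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if bot_msg:
            messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)

    # TextIteratorStreamer exposes the generated text as a Python iterator.
    iter_streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    # generate() blocks, so it runs in a background thread while this
    # function consumes tokens from the streamer.
    Thread(
        target=model.generate,
        kwargs=dict(
            input_ids=input_ids,
            max_new_tokens=int(max_tokens),
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            streamer=iter_streamer,
        ),
    ).start()

    # gr.ChatInterface renders each yielded string as the partial reply.
    partial = ""
    for new_text in iter_streamer:
        partial += new_text
        yield partial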