import threading
import time

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

model_id = "lambdaindie/lambdai"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

css = """
@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono&display=swap');

* {
    font-family: 'JetBrains Mono', monospace !important;
}

body {
    background-color: #111;
    color: #e0e0e0;
}

.markdown-think {
    background-color: #1e1e1e;
    border-left: 4px solid #555;
    padding: 10px;
    margin-bottom: 8px;
    font-style: italic;
    white-space: pre-wrap;
    animation: pulse 1.5s infinite ease-in-out;
}

@keyframes pulse {
    0% { opacity: 0.6; }
    50% { opacity: 1.0; }
    100% { opacity: 0.6; }
}
"""


def respond(message, history, system_message, max_tokens, temperature, top_p):
    messages = [{"role": "system", "content": system_message}] if system_message else []

    # Rebuild the conversation history in chat-template format.
    for user, assistant in history:
        if user:
            messages.append({"role": "user", "content": user})
        if assistant:
            messages.append({"role": "assistant", "content": assistant})

    # First pass: ask the model to reason step by step before answering.
    thinking_prompt = messages + [
        {"role": "user", "content": f"{message}\n\nThink step-by-step."}
    ]
    prompt = tokenizer.apply_chat_template(
        thinking_prompt, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    reasoning = ""
    yield '<div class="markdown-think">Thinking...</div>'
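    # model.generate blocks until generation completes, so it runs in a background
    # thread while the TextIteratorStreamer hands decoded tokens back to this generator.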
    start = time.time()
    thread = threading.Thread(target=model.generate, kwargs={
        "inputs": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "max_new_tokens": max_tokens,
        "do_sample": True,  # sampling must be enabled for temperature/top_p to take effect
        "temperature": temperature,
        "top_p": top_p,
        "streamer": streamer,
    })
    thread.start()

    for token in streamer:
        reasoning += token
        yield f'<div class="markdown-think">{reasoning.strip()}</div>'
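    # Each yield replaces the partial message shown by gr.ChatInterface, so the
    # timing banner below takes the place of the streamed reasoning.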
    elapsed = time.time() - start
    yield f"""<div class="markdown-think">
Thought for {elapsed:.1f} seconds
</div>"""

    # Second pass: generate the final answer conditioned on the reasoning above.
    final_prompt = messages + [
        {"role": "user", "content": message},
        {"role": "assistant", "content": reasoning.strip()},
        {"role": "user", "content": "Now answer based on this."},
    ]
    prompt2 = tokenizer.apply_chat_template(
        final_prompt, tokenize=False, add_generation_prompt=True
    )
    inputs2 = tokenizer(prompt2, return_tensors="pt").to(device)

    streamer2 = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    thread2 = threading.Thread(target=model.generate, kwargs={
        "inputs": inputs2["input_ids"],
        "attention_mask": inputs2["attention_mask"],
        "max_new_tokens": max_tokens,
        "do_sample": True,
        "temperature": temperature,
        "top_p": top_p,
        "streamer": streamer2,
    })
    thread2.start()

    final_answer = ""
    for token in streamer2:
        final_answer += token
        yield final_answer.strip()


demo = gr.ChatInterface(
    fn=respond,
    title="λambdAI",
    theme=gr.themes.Base(),
    css=css,
    additional_inputs=[
        gr.Textbox(value="", label="System Message"),
        gr.Slider(64, 2048, value=512, step=1, label="Max Tokens"),
        gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p"),
    ],
)

if __name__ == "__main__":
    demo.launch()