mariusjabami committed
Commit 665b7ce · verified · 1 Parent(s): 4f7e40d

Update app.py

Files changed (1)
  1. app.py +129 -63
app.py CHANGED
@@ -1,79 +1,145 @@
  import gradio as gr
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
  import torch
+ import time
  import threading
+ from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

- model_name = "lambdaindie/lambda-1v-1B"
-
- # Load the model on CPU in a lighter way
- model = AutoModelForCausalLM.from_pretrained(
-     model_name,
-     torch_dtype=torch.float16,  # or torch.bfloat16 if supported
-     low_cpu_mem_usage=True
- )
+ # === Load the local model (CPU) ===
+ model_name = "lambdaindie/lambda-1v-1B"  # swap in your own model
  tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForCausalLM.from_pretrained(model_name).to("cpu")  # <- CPU here

- stop_flag = {"stop": False}
-
- def respond(prompt, history):
-     stop_flag["stop"] = False
-     history = history[-3:]  # keep only the last 3 pairs
-
-     full_prompt = f"\nThink a bit step-by-step before answering.\nQuestion: {prompt}\nAnswer:"
-     inputs = tokenizer(full_prompt, return_tensors="pt")
+ # === Global stop flag for interruption ===
+ stop_signal = {"stop": False}
+
+ def generate_stream(prompt, max_tokens=512, temperature=0.7, top_p=0.95):
+     stop_signal["stop"] = False
+     inputs = tokenizer(prompt, return_tensors="pt").to("cpu")

      streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

      generation_thread = threading.Thread(
          target=model.generate,
-         kwargs={
-             "input_ids": inputs["input_ids"],
-             "attention_mask": inputs["attention_mask"],
-             "max_new_tokens": 512,
-             "do_sample": True,
-             "temperature": 0.7,
-             "top_p": 0.9,
-             "pad_token_id": tokenizer.eos_token_id,
-             "streamer": streamer,
-         }
+         kwargs=dict(
+             input_ids=inputs["input_ids"],
+             attention_mask=inputs["attention_mask"],
+             streamer=streamer,
+             max_new_tokens=max_tokens,
+             do_sample=True,
+             temperature=temperature,
+             top_p=top_p,
+             pad_token_id=tokenizer.eos_token_id,
+         )
      )
      generation_thread.start()

+     output = ""
+     for token in streamer:
+         if stop_signal["stop"]:
+             break
+         output += token
+         yield output.strip()
+
+ def stop_stream():
+     stop_signal["stop"] = True
+
+ def respond(message, history, system_message, max_tokens, temperature, top_p):
+     messages = [{"role": "system", "content": system_message}] if system_message else []
+
+     for user, assistant in history[-3:]:  # limit context to the last 3 past exchanges
+         if user:
+             messages.append({"role": "user", "content": user})
+         if assistant:
+             messages.append({"role": "assistant", "content": assistant})
+
+     thinking_prompt = messages + [{"role": "user", "content": f"{message}\n\nThink step-by-step before answering."}]
+     thinking_text = "\n".join([f"{m['role']}: {m['content']}" for m in thinking_prompt])
+
      reasoning = ""
-     for new_text in streamer:
-         if stop_flag["stop"]:
-             return "", history
-         reasoning += new_text
-         yield "", history + [(prompt, f"<div class='final-answer'>{reasoning}</div>")]
-
- def stop_generation():
-     stop_flag["stop"] = True
-
- # Gradio interface
- with gr.Blocks(css="""
-     #chatbot, .gr-markdown, .gr-button, .gr-textbox {
-         font-family: 'JetBrains Mono', monospace !important;
-         font-size: 11px !important;
-     }
-     .final-answer {
-         background-color: #1e1e1e;
-         color: #ffffff;
-         padding: 10px;
-         border-left: 4px solid #4caf50;
-         white-space: pre-wrap;
-         font-size: 11px !important;
-     }
- """) as demo:
-     gr.Markdown("## λambdAI — Reasoning Chat")
-
-     chatbot = gr.Chatbot(elem_id="chatbot")
-     with gr.Row():
-         txt = gr.Textbox(placeholder="Type your question...", show_label=False)
-         send_btn = gr.Button("Send")
-         stop_btn = gr.Button("Stop")
-
-     send_btn.click(respond, [txt, chatbot], [txt, chatbot])
-     txt.submit(respond, [txt, chatbot], [txt, chatbot])
-     stop_btn.click(stop_generation, None, None)
-
- demo.launch(share=True)
+     yield '<div class="markdown-think">Thinking...</div>'
+
+     start = time.time()
+     for token in generate_stream(thinking_text, max_tokens, temperature, top_p):
+         reasoning = token
+         yield f'<div class="markdown-think">{reasoning.strip()}</div>'
+
+     elapsed = time.time() - start
+     yield f"""
+     <div style="margin-top:12px;padding:8px 12px;background-color:#222;border-left:4px solid #888;
+     font-family:'JetBrains Mono', monospace;color:#ccc;font-size:14px;">
+     Thought for {elapsed:.1f} seconds
+     </div>
+     """
+
+     final_prompt = thinking_text + f"\n\nuser: {message}\nassistant: {reasoning.strip()}\nuser: Now answer based on your reasoning above.\nassistant:"
+     final_answer = ""
+
+     for token in generate_stream(final_prompt, max_tokens, temperature, top_p):
+         final_answer = token
+         yield final_answer.strip()
+
+ # === Interface ===
+
+ css = """
+ @import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono&display=swap');
+ * { font-family: 'JetBrains Mono', monospace !important; }
+ html, body, .gradio-container {
+     background-color: #111 !important;
+     color: #e0e0e0 !important;
+ }
+ textarea, input, button, select {
+     background-color: transparent !important;
+     color: #e0e0e0 !important;
+     border: 1px solid #444 !important;
+ }
+ .markdown-think {
+     background-color: #1e1e1e;
+     border-left: 4px solid #555;
+     padding: 10px;
+     margin-bottom: 8px;
+     font-style: italic;
+     white-space: pre-wrap;
+     animation: pulse 1.5s infinite ease-in-out;
+ }
+ @keyframes pulse {
+     0% { opacity: 0.6; }
+     50% { opacity: 1.0; }
+     100% { opacity: 0.6; }
+ }
+ """
+
+ theme = gr.themes.Base(
+     primary_hue="gray",
+     font=[gr.themes.GoogleFont("JetBrains Mono"), "monospace"]
+ ).set(
+     body_background_fill="#111",
+     body_text_color="#e0e0e0",
+     input_background_fill="#222",
+     input_border_color="#444",
+     button_primary_background_fill="#333",
+     button_primary_text_color="#e0e0e0",
+ )
+
+ chatbot = gr.ChatInterface(
+     fn=respond,
+     title="λambdAI",
+     css=css,
+     theme=theme,
+     additional_inputs=[
+         gr.Textbox(value="", label="System Message"),
+         gr.Slider(64, 2048, value=512, step=1, label="Max Tokens"),
+         gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature"),
+         gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p")
+     ]
+ )
+
+ stop_btn = gr.Button("Stop Generation")
+ stop_btn.click(fn=stop_stream, inputs=[], outputs=[])
+
+ app = gr.Blocks()
+ with app:
+     chatbot.render()
+     stop_btn.render()
+
+ if __name__ == "__main__":
+     app.launch(share=True)
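
The core pattern the updated app.py relies on is running model.generate() on a worker thread, reading tokens from a TextIteratorStreamer on the main thread, and checking a shared stop flag on every token. Below is a minimal standalone sketch of that pattern; it is not part of this commit, and the tiny "sshleifer/tiny-gpt2" checkpoint is only a stand-in so the sketch runs quickly on CPU.

# Minimal standalone sketch (not part of this commit) of the
# thread + TextIteratorStreamer + stop-flag pattern used by generate_stream().
import threading
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_name = "sshleifer/tiny-gpt2"  # tiny stand-in model, only so the sketch runs fast on CPU
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

stop_signal = {"stop": False}  # a UI callback like stop_stream() would flip this
inputs = tokenizer("Hello, world.", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks, so it runs on a worker thread while the main thread consumes tokens
thread = threading.Thread(
    target=model.generate,
    kwargs=dict(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        streamer=streamer,
        max_new_tokens=40,
        pad_token_id=tokenizer.eos_token_id,
    ),
)
thread.start()

output = ""
for token in streamer:
    if stop_signal["stop"]:
        break  # stop streaming to the caller; generate() still runs to completion
    output += token
thread.join()
print(output)

As in the committed generate_stream(), breaking out of the streamer loop only stops consuming tokens: the generate() call keeps running on its worker thread until max_new_tokens is reached, since no stopping criteria are passed.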