mariusjabami committed
Commit 3b6f0da · verified · 1 Parent(s): 98ec212

Update app.py

Files changed (1)
  1. app.py +28 -60
app.py CHANGED
@@ -1,76 +1,44 @@
 import gradio as gr
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-import threading
+from transformers import AutoModelForCausalLM, AutoTokenizer
 
-# Automatically detect the device
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-# Initialize the model and tokenizer
 model_name = "lambdaindie/lambda-1v-1B"
-model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-stop_flag = {"stop": False}
-
-def respond(prompt, history):
-    stop_flag["stop"] = False
-
-    full_prompt = f"\nThink a bit step-by-step before answering. \nQuestion: {prompt} \nAnswer:"
-    inputs = tokenizer(full_prompt, return_tensors="pt").to(device)
-
-    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-
-    generation_thread = threading.Thread(
-        target=model.generate,
-        kwargs={
-            "input_ids": inputs["input_ids"],
-            "attention_mask": inputs["attention_mask"],
-            "max_new_tokens": 512,
-            "do_sample": True,
-            "temperature": 0.7,
-            "top_p": 0.9,
-            "pad_token_id": tokenizer.eos_token_id,
-            "streamer": streamer,
-        }
+
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype="float32",
+    low_cpu_mem_usage=True,
+    device_map="auto"
+)
+model.eval()
+
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+def respond(prompt):
+    full_prompt = f"Think step-by-step.\nQuestion: {prompt}\nAnswer:"
+    inputs = tokenizer(full_prompt, return_tensors="pt", return_attention_mask=False)
+    inputs = {k: v.to(model.device) for k, v in inputs.items()}
+
+    output = model.generate(
+        **inputs,
+        max_new_tokens=128,
+        do_sample=False,  # greedy decoding, less RAM
+        pad_token_id=tokenizer.eos_token_id,
     )
-    generation_thread.start()
-
-    reasoning = ""
-    for new_text in streamer:
-        if stop_flag["stop"]:
-            return "", history
-        reasoning += new_text
-        yield "", history[:-1] + [(prompt, f"<div class='final-answer'>{reasoning}</div>")]
-
-def stop_generation():
-    stop_flag["stop"] = True
+    answer = tokenizer.decode(output[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
+    return answer.strip()
 
 with gr.Blocks(css="""
-    #chatbot, .gr-markdown, .gr-button, .gr-textbox {
-        font-family: 'JetBrains Mono', monospace !important;
-        font-size: 11px !important;
-    }
-    .final-answer {
-        background-color: #1e1e1e;
-        color: #ffffff;
-        padding: 10px;
-        border-left: 4px solid #4caf50;
+    .gr-button, .gr-textbox {
         font-family: 'JetBrains Mono', monospace !important;
-        white-space: pre-wrap;
         font-size: 11px !important;
     }
 """) as demo:
-    gr.Markdown("## λambdAI — Reasoning Chat")
-
-    chatbot = gr.Chatbot(elem_id="chatbot")
-    with gr.Row():
-        txt = gr.Textbox(placeholder="Type your question...", show_label=False)
-        send_btn = gr.Button("Send")
-        stop_btn = gr.Button("Stop")
+    gr.Markdown("## λambdAI — Light CPU Reasoning")
+    txt = gr.Textbox(placeholder="Type your question...", show_label=False)
+    output = gr.Textbox(label="Answer", lines=6)
+    btn = gr.Button("Send")
 
-    send_btn.click(respond, [txt, chatbot], [txt, chatbot])
-    txt.submit(respond, [txt, chatbot], [txt, chatbot])
-    stop_btn.click(stop_generation, None, None)
+    btn.click(respond, txt, output)
+    txt.submit(respond, txt, output)
 
 demo.launch(share=True)
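
Note (editor's sketch, not part of the commit): this update trades the old threaded TextIteratorStreamer pipeline for a single blocking generate() call, so the answer now appears only after generation finishes. If streamed output is still wanted under the new low-memory loading, the two sides of the diff can be combined. The sketch below does that, assuming the same model, prompt format, and 128-token greedy budget as the new app.py; respond_stream is a hypothetical name. Gradio runs generator functions as streaming outputs, so it can be wired to the same button.

import threading

import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_name = "lambdaindie/lambda-1v-1B"

# Low-memory loading, as in the new app.py
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="float32",
    low_cpu_mem_usage=True,
    device_map="auto",
)
model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_name)

def respond_stream(prompt):  # hypothetical streaming variant of respond()
    full_prompt = f"Think step-by-step.\nQuestion: {prompt}\nAnswer:"
    inputs = tokenizer(full_prompt, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Run generate() in a background thread feeding a streamer, as the old app.py did
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    threading.Thread(
        target=model.generate,
        kwargs=dict(
            **inputs,
            max_new_tokens=128,
            do_sample=False,  # greedy, matching the new version
            pad_token_id=tokenizer.eos_token_id,
            streamer=streamer,
        ),
    ).start()

    answer = ""
    for new_text in streamer:  # yield partial text as tokens arrive
        answer += new_text
        yield answer.strip()

Hooking it up is the same one-liner as in the diff, e.g. btn.click(respond_stream, txt, output).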