frimelle (HF Staff) committed
Commit 8d6f8e3 · 1 Parent(s): 492d2a0
Files changed (2):
  1. app.py +10 -11
  2. requirements.txt +2 -1
app.py CHANGED
@@ -4,40 +4,36 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 import uuid
 import os
 from datetime import datetime
-import spaces # required for ZeroGPU
+import spaces
 
-# ----- Constants -----
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 with open("system_prompt.txt", "r") as f:
     SYSTEM_PROMPT = f.read()
 LOG_DIR = "chat_logs"
 os.makedirs(LOG_DIR, exist_ok=True)
 
-# Global vars to hold model and tokenizer
+# Globals
 model = None
 tokenizer = None
 session_id = str(uuid.uuid4())
 
-# ----- Log Chat -----
 def log_chat(session_id, user_msg, bot_msg):
     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
     with open(os.path.join(LOG_DIR, f"{session_id}.txt"), "a") as f:
         f.write(f"[{timestamp}] User: {user_msg}\n")
         f.write(f"[{timestamp}] Bot: {bot_msg}\n\n")
 
-# ----- Required by ZeroGPU -----
+# This function will be run by ZeroGPU
 @spaces.GPU
 def load_model():
-    global model, tokenizer
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_NAME,
         torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
         device_map="auto"
     )
-    model.eval()
+    return tokenizer, model
 
-# ----- Inference Function -----
 def format_chat_prompt(history, new_input):
     messages = [{"role": "system", "content": SYSTEM_PROMPT}]
     for user_msg, bot_msg in history:
@@ -48,6 +44,12 @@ def format_chat_prompt(history, new_input):
 
 @torch.no_grad()
 def respond(message, history):
+    global tokenizer, model
+
+    # Lazy-load model only when needed
+    if tokenizer is None or model is None:
+        tokenizer, model = load_model()
+
     prompt = format_chat_prompt(history, message)
     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
     output = model.generate(
@@ -63,9 +65,6 @@ def respond(message, history):
     log_chat(session_id, message, response)
     return response
 
-load_model()
-
-# ----- Gradio App -----
 gr.ChatInterface(
     fn=respond,
     title="BoundrAI",
requirements.txt CHANGED
@@ -2,4 +2,5 @@ huggingface_hub==0.25.2
 gradio
 transformers
 torch
-spaces
+spaces
+accelerate>=0.26.0
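
The new accelerate pin supports the device_map="auto" argument used in app.py: transformers delegates automatic device placement to the accelerate package and raises an ImportError suggesting `pip install accelerate` when it is missing. A minimal sketch of the dependency (version floor taken from the pin above):

# Why accelerate is now required: device_map="auto" in transformers is
# implemented via the accelerate package; without it, from_pretrained
# fails with an ImportError asking you to install accelerate.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceH4/zephyr-7b-beta",
    device_map="auto",  # needs accelerate installed (pinned >=0.26.0 above)
)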