Madras1 committed
Commit bf7c25b · verified · 1 Parent(s): d9cb3f7

Update app.py

Files changed (1)
  1. app.py +32 -25
app.py CHANGED
@@ -2,58 +2,57 @@ import gradio as gr
  import spaces
  import torch
  from transformers import AutoModelForCausalLM, AutoTokenizer
- from huggingface_hub import snapshot_download  # <--- Important to download beforehand
+ from huggingface_hub import snapshot_download

- # --- TITAN CONFIGURATION ---
+ # --- CONFIGURATION ---
  MODEL_ID = "Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4"

- print(f"🏗️ Berta: Setting up the environment for the Titan {MODEL_ID}...")
+ print(f"⚙️ Setting up environment for {MODEL_ID}...")

- # Global Variables (Cache)
+ # Global Variables (Cache)
  model = None
  tokenizer = None

- # --- EXPLICIT DOWNLOAD FUNCTION ---
+ # --- EXPLICIT DOWNLOAD FUNCTION ---
  def download_model_first():
-     print("⏳ Berta: Starting preventive download of the weights (This will take a while, have faith!)...")
+     print("⏳ Starting preventive weight download (This will take time)...")
      try:
-         # This downloads the files into the Space cache WITHOUT using GPU time
+         # Downloads files to Space cache WITHOUT using GPU time
          snapshot_download(repo_id=MODEL_ID)
-         print("✅ Download complete! The files are home.")
+         print("✅ Download complete! Files are cached.")
      except Exception as e:
-         print(f"⚠️ Warning: The download failed or already exists. Error: {e}")
+         print(f"⚠️ Warning: Download failed or already exists. Error: {e}")

- def load_titan():
+ def load_model():
      global model, tokenizer
      if model is None:
-         print(f"🔥 Berta: Loading the model into the H200 VRAM...")
+         print(f"🔥 Loading model into VRAM...")
          try:
              tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

-             # Here it will find the files already downloaded, so this will be fast!
+             # Loads the previously downloaded files
              model = AutoModelForCausalLM.from_pretrained(
                  MODEL_ID,
                  device_map="auto",
                  trust_remote_code=True,
                  torch_dtype=torch.float16
              )
-             print("✅ The Titan Qwen 72B is ready for battle!")
+             print("✅ Qwen 72B is ready!")
          except Exception as e:
-             print(f"❌ Catastrophic error loading the Titan: {e}")
+             print(f"❌ Critical error loading the model: {e}")
              raise e
      return model, tokenizer

- # --- GENERATION FUNCTION (ZEROGPU) ---
- # Increased to 300 seconds (5 minutes) to make sure it has time to think
+ # --- GENERATION FUNCTION (ZEROGPU) ---
  @spaces.GPU(duration=150)
  def generate(message, history, system_prompt, temperature, max_tokens):
-     model, tokenizer = load_titan()
+     model, tokenizer = load_model()

      messages = []
      if system_prompt:
          messages.append({"role": "system", "content": system_prompt})

-     # Manual history handling (hardened against version mismatches)
+     # Manual history handling
      for turn in history:
          if turn[0]: messages.append({"role": "user", "content": turn[0]})
          if turn[1]: messages.append({"role": "assistant", "content": turn[1]})
@@ -83,17 +82,26 @@ def generate(message, history, system_prompt, temperature, max_tokens):

  # --- INTERFACE ---
  with gr.Blocks() as demo:
-     gr.Markdown("# 🏛️ Temple of the Giants (Qwen 72B API)")
-     gr.Markdown(f"### Serving: Gabriel | Active Model: `{MODEL_ID}` (H200 Powered)")
+     gr.Markdown(f"# Qwen 72B ZeroGPU Test")

-     with gr.Accordion("⚙️ Titan Settings", open=False):
+     # Requested warning notice
+     gr.Markdown(
+         """
+         ### ⚠️ WARNING: Large Model Inference Test
+         **This model (Qwen 72B) is extremely large.**
+         * **Loading time:** There may be a massive delay during the first initialization.
+         * **Test Environment:** This is a stress test for running Qwen 72B inference on a single ZeroGPU Space.
+         """
+     )
+
+     with gr.Accordion("⚙️ Settings", open=False):
          sys_prompt = gr.Textbox(
              label="System Prompt",
-             value="You are an expert AI assistant, focused on complex code solutions and software architecture.",
+             value="You are an expert AI assistant focused on complex coding solutions and software architecture.",
              lines=2
          )
-         temp = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, label="Temperature")
-         tokens = gr.Slider(minimum=256, maximum=8192, value=4096, label="Max Tokens")
+         temp = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, label="Temperature")
+         tokens = gr.Slider(minimum=256, maximum=8192, value=4096, label="Max Tokens")

      chat = gr.ChatInterface(
          fn=generate,
@@ -102,5 +110,4 @@ with gr.Blocks() as demo:

  if __name__ == "__main__":
      download_model_first()
-
      demo.launch()
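The hunks above leave the unchanged middle of generate (roughly old lines 60-82) out of view, where the assembled messages list is turned into model output. The sketch below shows how that step is typically written against the transformers API, assuming non-streaming generation; the helper name run_chat_completion and the sampling arguments are illustrative assumptions, not the actual contents of app.py.

from typing import Dict, List

def run_chat_completion(model, tokenizer, messages: List[Dict[str, str]],
                        temperature: float, max_tokens: int) -> str:
    # Sketch only (assumed helper, not part of app.py): render the conversation
    # with the model's chat template and append the assistant generation prompt.
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Sample a completion; the Gradio sliders map to max_new_tokens / temperature.
    output_ids = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=temperature,
        do_sample=True,
    )

    # Decode only the newly generated tokens, dropping the prompt.
    return tokenizer.decode(
        output_ids[0][inputs["input_ids"].shape[-1]:],
        skip_special_tokens=True,
    )

In the Blocks UI, the extra generate parameters (system_prompt, temperature, max_tokens) are presumably passed to gr.ChatInterface via additional_inputs=[sys_prompt, temp, tokens]; that trailing part of the file is not shown in this diff.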