Spaces:

Madras1
/

APILARGE

Running on Zero

App Files Files Community

Madras1 committed 12 days ago

Commit

bf7c25b

verified ·

1 Parent(s): d9cb3f7

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -25

app.py CHANGED Viewed

@@ -2,58 +2,57 @@ import gradio as gr
 import spaces
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
-from huggingface_hub import snapshot_download # <--- Importante para baixar antes
-# --- CONFIGURAÇÃO DOS TITÃS ---
 MODEL_ID = "Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4"
-print(f"🏗️ Berta: Configurando o ambiente para o Titã {MODEL_ID}...")
-# Variáveis Globais (Cache)
 model = None
 tokenizer = None
-# --- FUNÇÃO DE DOWNLOAD EXPLÍCITO ---
 def download_model_first():
-    print("⏳ Berta: Iniciando download preventivo dos pesos (Isso vai demorar, tenha fé!)...")
     try:
-        # Isso baixa os arquivos para o cache do Space SEM usar tempo de GPU
         snapshot_download(repo_id=MODEL_ID)
-        print("✅ Download concluído! Os arquivos estão em casa.")
     except Exception as e:
-        print(f"⚠️ Aviso: O download falhou ou já existe. Erro: {e}")
-def load_titan():
     global model, tokenizer
     if model is None:
-        print(f"🔥 Berta: Carregando o modelo na VRAM H200...")
         try:
             tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
-            # Aqui ele vai achar os arquivos já baixados, então será rápido!
             model = AutoModelForCausalLM.from_pretrained(
                 MODEL_ID,
                 device_map="auto",
                 trust_remote_code=True,
                 torch_dtype=torch.float16
             )
-            print("✅ O Titã Qwen 72B está pronto para a batalha!")
         except Exception as e:
-            print(f"❌ Erro catastrófico ao carregar o Titã: {e}")
             raise e
     return model, tokenizer
-# --- FUNÇÃO DE GERAÇÃO (ZEROGPU) ---
-# Aumentei para 300 segundos (5 minutos) para garantir que ele tenha tempo de pensar
 @spaces.GPU(duration=150)
 def generate(message, history, system_prompt, temperature, max_tokens):
-    model, tokenizer = load_titan()
     messages = []
     if system_prompt:
         messages.append({"role": "system", "content": system_prompt})
-    # Tratamento manual do histórico (Blindado contra erros de versão)
     for turn in history:
         if turn[0]: messages.append({"role": "user", "content": turn[0]})
         if turn[1]: messages.append({"role": "assistant", "content": turn[1]})
@@ -83,17 +82,26 @@ def generate(message, history, system_prompt, temperature, max_tokens):
 # --- INTERFACE ---
 with gr.Blocks() as demo:
-    gr.Markdown("# 🏛️ Templo dos Gigantes (Qwen 72B API)")
-    gr.Markdown(f"### Atendendo: Gabriel | Modelo Ativo: `{MODEL_ID}` (H200 Powered)")
-    with gr.Accordion("⚙️ Configurações do Titã", open=False):
         sys_prompt = gr.Textbox(
             label="System Prompt",
-            value="Você é um assistente de IA especialista, focado em soluções de código complexas e arquitetura de software.",
             lines=2
         )
-        temp = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, label="Temperatura")
-        tokens = gr.Slider(minimum=256, maximum=8192, value=4096, label="Máximo de Tokens")
     chat = gr.ChatInterface(
         fn=generate,
@@ -102,5 +110,4 @@ with gr.Blocks() as demo:
 if __name__ == "__main__":
     download_model_first()
     demo.launch()

 import spaces
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
+from huggingface_hub import snapshot_download
+# --- CONFIGURATION ---
 MODEL_ID = "Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4"
+print(f"⚙️ Setting up environment for {MODEL_ID}...")
+# Global Variables (Cache)
 model = None
 tokenizer = None
+# --- EXPLICIT DOWNLOAD FUNCTION ---
 def download_model_first():
+    print("⏳ Starting preventive weight download (This will take time)...")
     try:
+        # Downloads files to Space cache WITHOUT using GPU time
         snapshot_download(repo_id=MODEL_ID)
+        print("✅ Download complete! Files are cached.")
     except Exception as e:
+        print(f"⚠️ Warning: Download failed or already exists. Error: {e}")
+def load_model():
     global model, tokenizer
     if model is None:
+        print(f"🔥 Loading model into VRAM...")
         try:
             tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+            # Loads the previously downloaded files
             model = AutoModelForCausalLM.from_pretrained(
                 MODEL_ID,
                 device_map="auto",
                 trust_remote_code=True,
                 torch_dtype=torch.float16
             )
+            print("✅ Qwen 72B is ready!")
         except Exception as e:
+            print(f"❌ Critical error loading the model: {e}")
             raise e
     return model, tokenizer
+# --- GENERATION FUNCTION (ZEROGPU) ---
 @spaces.GPU(duration=150)
 def generate(message, history, system_prompt, temperature, max_tokens):
+    model, tokenizer = load_model()
     messages = []
     if system_prompt:
         messages.append({"role": "system", "content": system_prompt})
+    # Manual history handling
     for turn in history:
         if turn[0]: messages.append({"role": "user", "content": turn[0]})
         if turn[1]: messages.append({"role": "assistant", "content": turn[1]})
 # --- INTERFACE ---
 with gr.Blocks() as demo:
+    gr.Markdown(f"# Qwen 72B ZeroGPU Test")
+    # Aviso solicitado
+    gr.Markdown(
+        """
+        ### ⚠️ WARNING: Large Model Inference Test
+        **This model (Qwen 72B) is extremely large.**
+        *   **Loading time:** There may be a massive delay during the first initialization.
+        *   **Test Environment:** This is a stress test for running Qwen 72B inference on a single ZeroGPU Space.
+        """
+    )
+    with gr.Accordion("⚙️ Settings", open=False):
         sys_prompt = gr.Textbox(
             label="System Prompt",
+            value="You are an expert AI assistant focused on complex coding solutions and software architecture.",
             lines=2
         )
+        temp = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, label="Temperature")
+        tokens = gr.Slider(minimum=256, maximum=8192, value=4096, label="Max Tokens")
     chat = gr.ChatInterface(
         fn=generate,
 if __name__ == "__main__":
     download_model_first()
     demo.launch()