import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import snapshot_download

# --- CONFIGURATION ---
MODEL_ID = "Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4"

print(f"βš™οΈ Setting up environment for {MODEL_ID}...")

# Global Variables (Cache)
model = None
tokenizer = None

# --- EXPLICIT DOWNLOAD FUNCTION ---
def download_model_first():
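    """Pre-download the model weights into the Space's Hugging Face cache before any GPU is requested."""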
    print("⏳ Starting preventive weight download (This will take time)...")
    try:
        # Downloads files to Space cache WITHOUT using GPU time
        snapshot_download(repo_id=MODEL_ID)
        print("βœ… Download complete! Files are cached.")
    except Exception as e:
        print(f"⚠️ Warning: Download failed or already exists. Error: {e}")

def load_model():
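    """Load the tokenizer and model once, caching them in the module-level globals."""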
    global model, tokenizer
    if model is None:
        print(f"πŸ”₯ Loading model into VRAM...")
        try:
            tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
            
            # Loads the previously downloaded files
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_ID,
                device_map="auto", 
                trust_remote_code=True,
                torch_dtype=torch.float16
            )
            print("βœ… Qwen 72B is ready!")
        except Exception as e:
            print(f"❌ Critical error loading the model: {e}")
            raise e
    return model, tokenizer

# --- GENERATION FUNCTION (ZEROGPU) ---
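# @spaces.GPU requests a GPU on demand for each call; duration=150 reserves up to ~150 seconds of GPU time per request.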
@spaces.GPU(duration=150) 
def generate(message, history, system_prompt, temperature, max_tokens):
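    """Run a single chat completion on the ZeroGPU-allocated device and return the assistant reply."""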
    model, tokenizer = load_model()
    
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
        
    # Manual history handling
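    # Assumes the default tuple-style history: a list of [user_message, assistant_message] pairs.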
    for turn in history:
        if turn[0]: messages.append({"role": "user", "content": turn[0]})
        if turn[1]: messages.append({"role": "assistant", "content": turn[1]})
    
    messages.append({"role": "user", "content": message})

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    
    inputs = tokenizer([text], return_tensors="pt").to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=temperature,
        do_sample=True,
        top_p=0.95,
        top_k=40,
        repetition_penalty=1.1
    )
    
    # Decode only the newly generated tokens (skip the prompt portion of the output)
    response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return response

# --- INTERFACE ---
with gr.Blocks() as demo:
    gr.Markdown(f"# Qwen 72B ZeroGPU Test")
    
    # Requested warning notice
    gr.Markdown(
        """
        ### ⚠️ WARNING: Large Model Inference Test
        **This model (Qwen 72B) is extremely large.** 
        *   **Loading time:** There may be a massive delay during the first initialization.
        *   **Test Environment:** This is a stress test for running Qwen 72B inference on a single ZeroGPU Space.
        """
    )
    
    with gr.Accordion("βš™οΈ Settings", open=False):
        sys_prompt = gr.Textbox(
            label="System Prompt", 
            value="You are an expert AI assistant focused on complex coding solutions and software architecture.",
            lines=2
        )
        temp = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, label="Temperature")
        tokens = gr.Slider(minimum=256, maximum=8192, value=4096, label="Max Tokens")

    # additional_inputs are passed to generate() after (message, history), in the order listed below
    chat = gr.ChatInterface(
        fn=generate,
        additional_inputs=[sys_prompt, temp, tokens]
    )

if __name__ == "__main__":
    download_model_first()
    demo.launch()