import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import snapshot_download
# --- CONFIGURATION ---
MODEL_ID = "Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4"
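# NOTE (assumption): loading a GPTQ-Int4 checkpoint through transformers generally needs a
# GPTQ backend installed in the Space environment (e.g. `optimum` plus `gptqmodel` or
# `auto-gptq` in requirements.txt); adjust to whichever quantization backend you actually use.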
print(f"βοΈ Setting up environment for {MODEL_ID}...")
# Global Variables (Cache)
model = None
tokenizer = None
# --- EXPLICIT DOWNLOAD FUNCTION ---
def download_model_first():
    print("⏳ Starting preventive weight download (this will take time)...")
    try:
        # Downloads files to the Space cache WITHOUT using GPU time
        snapshot_download(repo_id=MODEL_ID)
        print("✅ Download complete! Files are cached.")
    except Exception as e:
        print(f"⚠️ Warning: download failed or files already exist. Error: {e}")
def load_model():
    global model, tokenizer
    if model is None:
        print("📥 Loading model into VRAM...")
        try:
            tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
            # Loads the previously downloaded (cached) files
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_ID,
                device_map="auto",
                trust_remote_code=True,
                torch_dtype=torch.float16,
            )
            print("✅ Qwen 72B is ready!")
        except Exception as e:
            print(f"❌ Critical error loading the model: {e}")
            raise
    return model, tokenizer
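# (Assumption) With a GPTQ-Int4 checkpoint the quantized weights stay 4-bit regardless of
# torch_dtype; float16 here mainly affects the non-quantized modules and the activations.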
# --- GENERATION FUNCTION (ZEROGPU) ---
@spaces.GPU(duration=150)
def generate(message, history, system_prompt, temperature, max_tokens):
    model, tokenizer = load_model()
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    # Manual history handling (assumes the tuple-style [user, assistant] history format)
    for turn in history:
        if turn[0]:
            messages.append({"role": "user", "content": turn[0]})
        if turn[1]:
            messages.append({"role": "assistant", "content": turn[1]})
    messages.append({"role": "user", "content": message})
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = tokenizer([text], return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=temperature,
        do_sample=True,
        top_p=0.95,
        top_k=40,
        repetition_penalty=1.1,
    )
    # Decode only the newly generated tokens, skipping the prompt portion
    response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return response
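# --- OPTIONAL: STREAMING VARIANT (sketch, not wired into the UI) ---
# A minimal sketch of how the same call could stream partial text back to gr.ChatInterface,
# which accepts generator functions. It assumes the setup above; the name `generate_stream`
# is illustrative only. To try it, point ChatInterface at fn=generate_stream instead.
from threading import Thread
from transformers import TextIteratorStreamer

@spaces.GPU(duration=150)
def generate_stream(message, history, system_prompt, temperature, max_tokens):
    model, tokenizer = load_model()
    messages = [{"role": "system", "content": system_prompt}] if system_prompt else []
    for turn in history:
        if turn[0]:
            messages.append({"role": "user", "content": turn[0]})
        if turn[1]:
            messages.append({"role": "assistant", "content": turn[1]})
    messages.append({"role": "user", "content": message})
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer([text], return_tensors="pt").to(model.device)
    # The streamer yields decoded text chunks while generate() runs in a worker thread
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    Thread(
        target=model.generate,
        kwargs=dict(
            **inputs,
            streamer=streamer,
            max_new_tokens=max_tokens,
            temperature=temperature,
            do_sample=True,
        ),
    ).start()
    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial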
# --- INTERFACE ---
with gr.Blocks() as demo:
    gr.Markdown("# Qwen 72B ZeroGPU Test")
    # Requested warning notice
    gr.Markdown(
        """
### ⚠️ WARNING: Large Model Inference Test
**This model (Qwen 72B) is extremely large.**
* **Loading time:** There may be a massive delay during the first initialization.
* **Test Environment:** This is a stress test for running Qwen 72B inference on a single ZeroGPU Space.
"""
    )
    with gr.Accordion("⚙️ Settings", open=False):
        sys_prompt = gr.Textbox(
            label="System Prompt",
            value="You are an expert AI assistant focused on complex coding solutions and software architecture.",
            lines=2,
        )
        temp = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, label="Temperature")
        tokens = gr.Slider(minimum=256, maximum=8192, value=4096, label="Max Tokens")
    chat = gr.ChatInterface(
        fn=generate,
        additional_inputs=[sys_prompt, temp, tokens],
    )
if __name__ == "__main__":
    download_model_first()
    demo.launch()