import spaces
import torch
import gradio as gr
from PIL import Image
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from functools import lru_cache

MODEL_ID = "unsloth/Qwen2.5-VL-3B-Instruct"


@lru_cache(maxsize=1)
def _load_model():
    """Load and cache the model and processor inside the GPU worker."""
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_ID, torch_dtype=torch.bfloat16
    ).to("cuda")
    adapter_path = "thangvip/qwen-2.5-vl-3b-lora-brainrot-new"
    model.load_adapter(adapter_path)
    processor = AutoProcessor.from_pretrained(MODEL_ID)
    return model, processor


@spaces.GPU
def gpu_inference(image_path: str, prompt: str) -> str:
    """Perform inference entirely in the GPU subprocess."""
    model, processor = _load_model()

    # Load and preprocess the image; downscale wide images to limit memory use
    image = Image.open(image_path).convert("RGB")
    if image.width > 512:
        ratio = image.height / image.width
        image = image.resize((512, int(512 * ratio)), Image.Resampling.LANCZOS)

    # Build the conversation in the Qwen2.5-VL chat format
    system_msg = "You are BrainRot Bot.\n"
    conversation = [
        {"role": "system", "content": [{"type": "text", "text": system_msg}]},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt},
            ],
        },
    ]

    # Tokenize, generate, decode
    chat_input = processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        text=[chat_input], images=[image], return_tensors="pt"
    ).to("cuda")
    output_ids = model.generate(**inputs, max_new_tokens=1024)
    decoded = processor.batch_decode(
        output_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    # Extract the assistant portion of the decoded transcript
    return decoded.split("assistant", 1)[-1].strip().lstrip(":").strip()


# Message handling
def add_message(history, user_input):
    if history is None:
        history = []
    for f in user_input.get("files", []):
        history.append({"role": "user", "content": (f,)})
    text = user_input.get("text", "")
    if text:
        history.append({"role": "user", "content": text})
    return history, gr.MultimodalTextbox(value=None)


def inference_interface(history):
    if not history:
        return history, gr.MultimodalTextbox(value=None)

    # Most recent user text message
    user_text = next(
        (
            m["content"]
            for m in reversed(history)
            if m["role"] == "user" and isinstance(m["content"], str)
        ),
        None,
    )
    if user_text is None:
        return history, gr.MultimodalTextbox(value=None)

    # Most recent user image
    image_path = next(
        (
            m["content"][0]
            for m in reversed(history)
            if m["role"] == "user" and isinstance(m["content"], tuple)
        ),
        None,
    )
    if image_path is None:
        return history, gr.MultimodalTextbox(value=None)

    # GPU inference
    reply = gpu_inference(image_path, user_text)
    history.append({"role": "assistant", "content": reply})
    return history, gr.MultimodalTextbox(value=None)


def build_demo():
    with gr.Blocks() as demo:
        gr.Markdown("# qwen-2.5-vl-3b-lora-brr\nAsk me anything about brainrot memes.")
        chatbot = gr.Chatbot([], type="messages", label="Conversation")
        chat_input = gr.MultimodalTextbox(
            interactive=True,
            file_types=["image"],
            placeholder="Enter text and upload an image.",
            show_label=True,
        )

        submit_evt = chat_input.submit(
            add_message, [chatbot, chat_input], [chatbot, chat_input]
        )
        submit_evt.then(inference_interface, [chatbot], [chatbot, chat_input])

        with gr.Row():
            send_btn = gr.Button("Send")
            clear_btn = gr.ClearButton([chatbot, chat_input])

        send_click = send_btn.click(
            add_message, [chatbot, chat_input], [chatbot, chat_input]
        )
        send_click.then(inference_interface, [chatbot], [chatbot, chat_input])

    return demo


if __name__ == "__main__":
    demo = build_demo()
    demo.launch(share=True)