# gradio_blip3o_next_min.py
"""Minimal Gradio text-to-image demo for BLIP3o-NEXT-GRPO-Geneval-3B."""

import time
from dataclasses import dataclass

import torch
from PIL import Image
from transformers import AutoTokenizer
from blip3o.model import *  # provides blip3oQwenForInferenceLM
import gradio as gr
from huggingface_hub import snapshot_download


# -----------------------------
# Minimal config and runner
# -----------------------------
@dataclass
class T2IConfig:
    device: str = "cuda:0"
    dtype: torch.dtype = torch.bfloat16
    # fixed generation config (no UI controls)
    scale: int = 0  # unused in this minimal demo
    seq_len: int = 729
    top_p: float = 0.95
    top_k: int = 1200


class TextToImageInference:
    def __init__(self, config: T2IConfig):
        self.config = config
        self.device = torch.device(config.device)
        self._load_models()

    def _load_models(self):
        # Download (or reuse the locally cached) model snapshot from the Hub.
        model_path = snapshot_download(repo_id="BLIP3o/BLIP3o-NEXT-GRPO-Geneval-3B")
        self.model = blip3oQwenForInferenceLM.from_pretrained(
            model_path, torch_dtype=self.config.dtype
        ).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        if hasattr(self.tokenizer, "padding_side"):
            self.tokenizer.padding_side = "left"

    @torch.inference_mode()
    def generate_image(self, prompt: str) -> Image.Image:
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": f"Please generate image based on the following caption: {prompt}",
            },
        ]
        # Render the chat template to plain text, then tokenize.
        input_text = self.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = self.tokenizer(
            [input_text], return_tensors="pt", padding=True, truncation=True
        )
        _, images = self.model.generate_images(
            inputs.input_ids.to(self.device),
            inputs.attention_mask.to(self.device),
            max_new_tokens=self.config.seq_len,
            do_sample=True,
            top_p=self.config.top_p,
            top_k=self.config.top_k,
        )
        return images[0]


# Try loading once at startup for simplicity.
LOAD_ERROR = None
inference = None
try:
    inference = TextToImageInference(T2IConfig())
except Exception as e:
    LOAD_ERROR = f"❌ Failed to load model: {e}"


def run_generate(prompt, progress=gr.Progress(track_tqdm=True)):
    t0 = time.time()
    if LOAD_ERROR:
        return None, LOAD_ERROR
    if not prompt or not prompt.strip():
        return None, "⚠️ Please enter a prompt."
    try:
        img = inference.generate_image(prompt.strip())
        return img, f"✅ Done in {time.time() - t0:.2f}s."
    except torch.cuda.OutOfMemoryError:
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return None, "❌ CUDA OOM. Try reducing other GPU workloads."
    except Exception as e:
        return None, f"❌ Error: {e}"


with gr.Blocks(title="BLIP3o-NEXT-GRPO-Geneval — Text ➜ Image") as demo:
    gr.Markdown("# BLIP3o-NEXT-GRPO-Geneval — Text ➜ Image")
    with gr.Row():
        with gr.Column(scale=3):
            prompt = gr.Textbox(
                label="Prompt",
                placeholder="Describe the image you want to generate...",
                lines=4,
            )
            run_btn = gr.Button("Generate", variant="primary")
        with gr.Column(scale=4):
            out_img = gr.Image(label="Generated Image", format="png")
            status = gr.Markdown("")

    run_btn.click(
        fn=run_generate,
        inputs=[prompt],
        outputs=[out_img, status],
        queue=True,
        api_name="generate",
    )


if __name__ == "__main__":
    demo.queue().launch(share=True)
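
# ---------------------------------------------------------------------------
# Client-side usage sketch (an assumption-laden illustration, not part of the
# app itself): the click handler above exposes the endpoint api_name
# "generate", which can be queried programmatically with gradio_client. This
# assumes gradio_client is installed and the demo is reachable at the default
# local port http://127.0.0.1:7860 (with share=True, substitute the printed
# *.gradio.live URL). The Image output comes back as a local file path and
# the Markdown status as a string.
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860/")
#   image_path, status = client.predict(
#       "a photo of a red cube next to a blue sphere",
#       api_name="/generate",
#   )
#   print(status, image_path)
# ---------------------------------------------------------------------------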