import spaces
import gradio as gr
import os
import sys
from typing import List

# sys.path.append(os.getcwd())

import numpy as np
from PIL import Image

import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from gradio_imageslider import ImageSlider

print(f'torch version:{torch.__version__}')

import torch.utils.checkpoint
from pytorch_lightning import seed_everything
from diffusers import AutoencoderKL, DDIMScheduler
from diffusers.utils import check_min_version
from diffusers.utils.import_utils import is_xformers_available
from transformers import CLIPTextModel, CLIPTokenizer, CLIPImageProcessor
from huggingface_hub import hf_hub_download, snapshot_download

from pipelines.pipeline_seesr import StableDiffusionControlNetPipeline

from utils.wavelet_color_fix import wavelet_color_fix, adain_color_fix

from ram.models.ram_lora import ram
from ram import inference_ram as inference
from torchvision import transforms
from models.controlnet import ControlNetModel
from models.unet_2d_condition import UNet2DConditionModel

# The VLM is currently disabled; `_generate_vlm_prompt` below is kept for when
# it is re-enabled.
# VLM_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"

# vlm_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
#     VLM_NAME,
#     torch_dtype="auto",
#     device_map="auto"  # immediately dispatches layers onto available GPUs
# )
# vlm_processor = AutoProcessor.from_pretrained(VLM_NAME)


def _generate_vlm_prompt(
    vlm_model: Qwen2_5_VLForConditionalGeneration,
    vlm_processor: AutoProcessor,
    process_vision_info,
    pil_image: Image.Image,
    device: str = "cuda"
) -> str:
    """Generate a text description of a single image with the vision-language model.

    A chat is built from a system instruction plus one user turn containing
    only the image, rendered through the processor's chat template, and passed
    to ``vlm_model.generate``.

    Args:
        vlm_model: Model whose ``generate`` method produces the description.
        vlm_processor: Processor used for the chat template, tensor
            preparation and decoding.
        process_vision_info: Callable that extracts the image and video inputs
            from the chat messages (passed in explicitly by the caller).
        pil_image: The image to describe.
        device: Device the prepared model inputs are moved to.

    Returns:
        The decoded text of the first (only) generated sequence, with the
        prompt tokens removed and surrounding whitespace stripped.
    """
    # The trailing space keeps the two sentences from being glued together by
    # implicit string-literal concatenation ("...this image.describe each...").
    message_text = (
        "The give a detailed description of this image. "
        "describe each element with fine details."
    )

    messages = [
        {"role": "system", "content": message_text},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": pil_image},
            ],
        },
    ]

    # Render the chat to a prompt string; tokenization happens in the
    # processor call below together with the vision inputs.
    text = vlm_processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = vlm_processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(device)

    generated = vlm_model.generate(**inputs, max_new_tokens=128)

    # Drop the leading prompt tokens from each sequence so only the newly
    # generated tokens are decoded.
    trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated)
    ]
    out_text = vlm_processor.batch_decode(
        trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]

    return out_text.strip()


# PIL image -> tensor conversion applied to the low-quality input.
tensor_transforms = transforms.Compose([
    transforms.ToTensor(),
])

# Preprocessing for the tagging model. There is no ToTensor step here, so this
# is applied to an already-converted tensor: resize to 384x384 (matching the
# tag model's image_size) and normalize per channel.
ram_transforms = transforms.Compose([
    transforms.Resize((384, 384)),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Fetch all required checkpoints into the local `preset/models` tree at
# import time.
snapshot_download(
    repo_id="alexnasa/SEESR",
    local_dir="preset/models"
)

snapshot_download(
    repo_id="stabilityai/stable-diffusion-2-1-base",
    local_dir="preset/models/stable-diffusion-2-1-base"
)

snapshot_download(
    repo_id="xinyu1205/recognize_anything_model",
    local_dir="preset/models/"
)

# Load scheduler, tokenizer and models.
# Local checkpoint directories for the base Stable Diffusion weights and the
# SeeSR-specific UNet / ControlNet weights.
pretrained_model_path = 'preset/models/stable-diffusion-2-1-base'
seesr_model_path = 'preset/models/seesr'

# Components taken from the base Stable Diffusion checkpoint.
scheduler = DDIMScheduler.from_pretrained(pretrained_model_path, subfolder="scheduler")
text_encoder = CLIPTextModel.from_pretrained(pretrained_model_path, subfolder="text_encoder")
tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_path, subfolder="tokenizer")
vae = AutoencoderKL.from_pretrained(pretrained_model_path, subfolder="vae")
# Loaded here, but the pipeline below is built with feature_extractor=None.
feature_extractor = CLIPImageProcessor.from_pretrained(f"{pretrained_model_path}/feature_extractor")

# Components taken from the SeeSR checkpoint.
unet = UNet2DConditionModel.from_pretrained(seesr_model_path, subfolder="unet")
controlnet = ControlNetModel.from_pretrained(seesr_model_path, subfolder="controlnet")

# Inference only: disable gradients on vae, text_encoder, unet and controlnet.
for _frozen_module in (vae, text_encoder, unet, controlnet):
    _frozen_module.requires_grad_(False)

# unet.to("cuda")
# controlnet.to("cuda")
# unet.enable_xformers_memory_efficient_attention()
# controlnet.enable_xformers_memory_efficient_attention()

# Assemble the validation pipeline (no feature extractor, no safety checker).
validation_pipeline = StableDiffusionControlNetPipeline(
    vae=vae,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    feature_extractor=None,
    unet=unet,
    controlnet=controlnet,
    scheduler=scheduler,
    safety_checker=None,
    requires_safety_checker=False,
)

validation_pipeline._init_tiled_vae(encoder_tile_size=1024, decoder_tile_size=224)

weight_dtype = torch.float16
device = "cuda"

# Move text_encoder, vae, unet and controlnet to the GPU and cast them to
# weight_dtype.
for _gpu_module in (text_encoder, vae, unet, controlnet):
    _gpu_module.to(device, dtype=weight_dtype)

# Tagging model, put in eval mode and placed on the same device / dtype as
# the diffusion components.
tag_model = ram(
    pretrained='preset/models/ram_swin_large_14m.pth',
    pretrained_condition='preset/models/DAPE.pth',
    image_size=384,
    vit='swin_l',
)
tag_model.eval()
tag_model.to(device, dtype=weight_dtype)


def preprocess_image(input_image: Image.Image) -> Image.Image:
    """Return a copy of ``input_image`` shrunk to fit within 512x512.

    The caller's image is left untouched; the thumbnail is made on a copy
    using bilinear resampling.
    """
    thumbnail = input_image.copy()
    thumbnail.thumbnail((512, 512), Image.Resampling.BILINEAR)
    return thumbnail

@spaces.GPU(duration=130)
def preprocess_n_magnify(input_image: Image.Image, progress=gr.Progress(track_tqdm=True),):
    """
    Preprocess the input image and perform a single-step 4× magnification using the SeeSR pipeline.

    This function first resizes the input to fit within a 512×512 thumbnail, then applies the full
    magnification through ControlNet-guided diffusion to produce a high-resolution, 4× upscaled image.

    Args:
        input_image (PIL.Image.Image): The source image to preprocess and magnify.

    Returns:
        tuple: A (before, after) pair:
            - The thumbnail of the input after plain resizing to the working
              resolution, as a PIL image (the image that was fed to the pipeline).
            - The magnified output, as a numpy array.
    """
    processed_img = preprocess_image(input_image)

    img, magnified_img = magnify(processed_img, progress=progress)

    return (img, magnified_img)


@spaces.GPU()
def magnify(
    input_image: Image.Image,
    user_prompt = "",
    positive_prompt = "clean, high-resolution, 8k, best quality, masterpiece",
    negative_prompt = "dotted, noise, blur, lowres, oversmooth, longbody, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality",
    num_inference_steps = 50,
    scale_factor = 4,
    cfg_scale = 7.5,
    seed = 123,
    latent_tiled_size = 320,
    latent_tiled_overlap = 4,
    sample_times = 1,
    progress=gr.Progress(track_tqdm=True),
):
    """Upscale ``input_image`` by ``scale_factor`` with the SeeSR pipeline.

    The image is tagged by the tag model, the tags are combined with the
    positive (and optional user) prompt, the image is resized to the working
    resolution, and the diffusion pipeline is run on it. The result is
    colour-corrected against the resized input and resized to
    ``original size * scale_factor``.

    Args:
        input_image: Low-resolution source image.
        user_prompt: Optional text prepended to the generated prompt.
        positive_prompt: Text appended after the detected tags.
        negative_prompt: Negative prompt passed to the pipeline.
        num_inference_steps: Number of diffusion steps.
        scale_factor: Upscaling factor (may be a float).
        cfg_scale: Guidance scale passed to the pipeline.
        seed: Seed passed to ``seed_everything``.
        latent_tiled_size: Latent tile size passed to the pipeline.
        latent_tiled_overlap: Latent tile overlap passed to the pipeline.
        sample_times: Number of samples to generate; only the first one is
            returned.
        progress: Gradio progress tracker (tracks tqdm bars).

    Returns:
        tuple: ``(resized_input, result)`` where ``resized_input`` is the PIL
        image fed to the pipeline and ``result`` is the first generated sample
        as a numpy array. If the pipeline raises, ``result`` is a black
        512x512 image instead.
    """
    # Local import: only needed for reporting failures below.
    import traceback

    process_size = 512
    resize_preproc = transforms.Compose([
        transforms.Resize(process_size, interpolation=transforms.InterpolationMode.BILINEAR),
    ])

    # user_prompt = _generate_vlm_prompt(
    #     vlm_model=vlm_model,
    #     vlm_processor=vlm_processor,
    #     process_vision_info=process_vision_info,
    #     pil_image=input_image,
    #     device=device,
    # )

    # with torch.no_grad():
    seed_everything(seed)
    # NOTE(review): this generator is never explicitly seeded with `seed`;
    # only the global RNGs are (via seed_everything). Confirm whether
    # generator.manual_seed(seed) was intended.
    generator = torch.Generator(device=device)

    # Tag the low-quality image and compute the tag model's image embeddings,
    # which the pipeline receives as extra conditioning.
    lq = tensor_transforms(input_image).unsqueeze(0).to(device).half()
    lq = ram_transforms(lq)
    res = inference(lq, tag_model)
    ram_encoder_hidden_states = tag_model.generate_image_embeds(lq)

    validation_prompt = f"{res[0]}, {positive_prompt},"
    if user_prompt != '':
        validation_prompt = f"{user_prompt}, {validation_prompt}"

    ori_width, ori_height = input_image.size
    rscale = scale_factor

    # Final output size. Cast to int because scale_factor may be a float and
    # PIL's resize needs integer dimensions (the upscale below already casts).
    output_size = (int(ori_width * rscale), int(ori_height * rscale))

    input_image = input_image.resize((int(input_image.size[0] * rscale), int(input_image.size[1] * rscale)))

    # Make sure the shorter side is at least process_size.
    if min(input_image.size) < process_size:
        input_image = resize_preproc(input_image)

    # Snap both sides down to a multiple of 8 before running the pipeline.
    input_image = input_image.resize((input_image.size[0] // 8 * 8, input_image.size[1] // 8 * 8))
    width, height = input_image.size

    images = []
    for _ in range(sample_times):
        try:
            with torch.autocast("cuda"):
                image = validation_pipeline(
                    validation_prompt, input_image, negative_prompt=negative_prompt,
                    num_inference_steps=num_inference_steps, generator=generator,
                    height=height, width=width,
                    guidance_scale=cfg_scale, conditioning_scale=1,
                    start_point='lr', start_steps=999, ram_encoder_hidden_states=ram_encoder_hidden_states,
                    latent_tiled_size=latent_tiled_size, latent_tiled_overlap=latent_tiled_overlap,
                ).images[0]

            # Colour-correct the result against the resized input.
            image = wavelet_color_fix(image, input_image)

            # The working size may differ from the requested output size
            # (minimum-side and multiple-of-8 adjustments above), so always
            # resize back.
            image = image.resize(output_size)
        except Exception:
            # Best-effort: keep the UI responsive with a black placeholder,
            # but log the full traceback so the failure can be diagnosed.
            traceback.print_exc()
            image = Image.new(mode="RGB", size=(512, 512))

        images.append(np.array(image))

    # Only the first sample is returned, even when sample_times > 1.
    return input_image, images[0]


css = """
#col-container {
    margin: 0 auto;
    max-width: 1024px;
}
"""

theme = gr.themes.Ocean()

with gr.Blocks(css=css, theme=theme) as demo:
    with gr.Column(elem_id="col-container"):
        with gr.Row():
            gr.HTML(
                """

🖼️ Super-Resolution

"""
            )

        with gr.Row():
            with gr.Column():
                input_image = gr.Image(type="pil", height=512)
                run_button = gr.Button("🔎 Magnify 4x", variant="primary")
                duration_time = gr.Text(label="duration time", value=60, visible=False)
                # Hidden advanced options. They are not wired to any event
                # handler below, so `magnify` runs with its own defaults.
                with gr.Accordion("Options", visible=False):
                    user_prompt = gr.Textbox(label="User Prompt", value="")
                    positive_prompt = gr.Textbox(label="Positive Prompt", value="clean, high-resolution, 8k, best quality, masterpiece")
                    negative_prompt = gr.Textbox(
                        label="Negative Prompt",
                        value="dotted, noise, blur, lowres, oversmooth, longbody, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality"
                    )
                    cfg_scale = gr.Slider(label="Classifier Free Guidance Scale (Set to 1.0 in sd-turbo)", minimum=1, maximum=10, value=7.5, step=0)
                    num_inference_steps = gr.Slider(label="Inference Steps", minimum=2, maximum=100, value=50, step=1)
                    seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, value=231)
                    sample_times = gr.Slider(label="Sample Times", minimum=1, maximum=10, step=1, value=1)
                    latent_tiled_size = gr.Slider(label="Diffusion Tile Size", minimum=128, maximum=480, value=320, step=1)
                    latent_tiled_overlap = gr.Slider(label="Diffusion Tile Overlap", minimum=4, maximum=16, value=4, step=1)
                    scale_factor = gr.Number(label="SR Scale", value=4)
            with gr.Column():
                # Before/after slider fed by the (input, output) pair returned
                # from preprocess_n_magnify.
                result_gallery = ImageSlider(
                    interactive=False,
                    label="Magnified",
                    position=0.5
                )

        examples = gr.Examples(
            examples=[
                [
                    "preset/datasets/test_datasets/179.png",
                ],
                [
                    "preset/datasets/test_datasets/cinema.png",
                ],
                [
                    "preset/datasets/test_datasets/cartoon.png",
                ],
            ],
            inputs=[
                input_image,
            ],
            outputs=[result_gallery],
            fn=preprocess_n_magnify,
            cache_examples=True,
        )

    inputs = [
        input_image,
    ]
    run_button.click(fn=preprocess_n_magnify, inputs=input_image, outputs=[result_gallery])
    # Shrink uploads to the 512x512 working size as soon as they arrive.
    input_image.upload(fn=preprocess_image,inputs=input_image, outputs=input_image, show_api=False)

demo.launch(share=True, mcp_server=True)