Spaces:

alexnasa
/

SuperResolution

Running on Zero

App Files Files Community

SuperResolution / app.py

alexnasa

Update app.py

c18d8a2 verified 20 days ago

raw

history blame contribute delete

13.1 kB

	import spaces
	import gradio as gr
	import os
	import sys
	from typing import List
	# sys.path.append(os.getcwd())

	import numpy as np
	from PIL import Image

	import torch
	from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
	from qwen_vl_utils import process_vision_info
	from gradio_imageslider import ImageSlider

	print(f'torch version:{torch.__version__}')


	import torch.utils.checkpoint
	from pytorch_lightning import seed_everything
	from diffusers import AutoencoderKL, DDIMScheduler
	from diffusers.utils import check_min_version
	from diffusers.utils.import_utils import is_xformers_available
	from transformers import CLIPTextModel, CLIPTokenizer, CLIPImageProcessor
	from huggingface_hub import hf_hub_download, snapshot_download

	from pipelines.pipeline_seesr import StableDiffusionControlNetPipeline

	from utils.wavelet_color_fix import wavelet_color_fix, adain_color_fix

	from ram.models.ram_lora import ram
	from ram import inference_ram as inference
	from torchvision import transforms
	from models.controlnet import ControlNetModel
	from models.unet_2d_condition import UNet2DConditionModel

	# VLM_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"

	# vlm_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
	# VLM_NAME,
	# torch_dtype="auto",
	# device_map="auto" # immediately dispatches layers onto available GPUs
	# )
	# vlm_processor = AutoProcessor.from_pretrained(VLM_NAME)

	def _generate_vlm_prompt(
	vlm_model: Qwen2_5_VLForConditionalGeneration,
	vlm_processor: AutoProcessor,
	process_vision_info,
	pil_image: Image.Image,
	device: str = "cuda"
	) -> str:
	"""
	Given two PIL.Image inputs:
	- prev_pil: the “full” image at the previous recursion.
	- zoomed_pil: the cropped+resized (zoom) image for this step.
	Returns a single “recursive_multiscale” prompt string.
	"""

	message_text = (
	"The give a detailed description of this image."
	"describe each element with fine details."
	)

	messages = [
	{"role": "system", "content": message_text},
	{
	"role": "user",
	"content": [
	{"type": "image", "image": pil_image},
	],
	},
	]

	text = vlm_processor.apply_chat_template(
	messages,
	tokenize=False,
	add_generation_prompt=True
	)
	image_inputs, video_inputs = process_vision_info(messages)

	inputs = vlm_processor(
	text=[text],
	images=image_inputs,
	videos=video_inputs,
	padding=True,
	return_tensors="pt",
	).to(device)

	generated = vlm_model.generate(**inputs, max_new_tokens=128)
	trimmed = [
	out_ids[len(in_ids):]
	for in_ids, out_ids in zip(inputs.input_ids, generated)
	]
	out_text = vlm_processor.batch_decode(
	trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
	)[0]

	return out_text.strip()

	# PIL image -> tensor conversion applied to the input before RAM tagging.
	tensor_transforms = transforms.Compose([transforms.ToTensor()])

	# Per-channel mean / std handed to Normalize for the RAM tagger input.
	_RAM_NORM_MEAN = [0.485, 0.456, 0.406]
	_RAM_NORM_STD = [0.229, 0.224, 0.225]

	# Resize to the 384x384 tagger resolution, then normalise the channels.
	ram_transforms = transforms.Compose(
	    [
	        transforms.Resize((384, 384)),
	        transforms.Normalize(mean=_RAM_NORM_MEAN, std=_RAM_NORM_STD),
	    ]
	)

	# Hub repositories mirrored to local folders at start-up, as
	# (repo_id, local_dir) pairs. They are fetched in the order listed.
	_MODEL_SNAPSHOTS = (
	    ("alexnasa/SEESR", "preset/models"),
	    ("stabilityai/stable-diffusion-2-1-base", "preset/models/stable-diffusion-2-1-base"),
	    ("xinyu1205/recognize_anything_model", "preset/models/"),
	)

	for _repo_id, _local_dir in _MODEL_SNAPSHOTS:
	    snapshot_download(repo_id=_repo_id, local_dir=_local_dir)


	# Load scheduler, tokenizer and models.
	# The Stable Diffusion base components come from `pretrained_model_path`;
	# the UNet and ControlNet weights come from `seesr_model_path`.
	pretrained_model_path = 'preset/models/stable-diffusion-2-1-base'
	seesr_model_path = 'preset/models/seesr'

	scheduler = DDIMScheduler.from_pretrained(pretrained_model_path, subfolder="scheduler")
	text_encoder = CLIPTextModel.from_pretrained(pretrained_model_path, subfolder="text_encoder")
	tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_path, subfolder="tokenizer")
	vae = AutoencoderKL.from_pretrained(pretrained_model_path, subfolder="vae")
	# NOTE(review): `feature_extractor` is loaded here, but the pipeline below is
	# constructed with feature_extractor=None, so this object is not used by it.
	feature_extractor = CLIPImageProcessor.from_pretrained(f"{pretrained_model_path}/feature_extractor")
	unet = UNet2DConditionModel.from_pretrained(seesr_model_path, subfolder="unet")
	controlnet = ControlNetModel.from_pretrained(seesr_model_path, subfolder="controlnet")

	# Freeze vae, text_encoder, unet and controlnet (no gradients are needed).
	vae.requires_grad_(False)
	text_encoder.requires_grad_(False)
	unet.requires_grad_(False)
	controlnet.requires_grad_(False)

	# unet.to("cuda")
	# controlnet.to("cuda")
	# unet.enable_xformers_memory_efficient_attention()
	# controlnet.enable_xformers_memory_efficient_attention()

	# Get the validation pipeline (no safety checker, no feature extractor).
	validation_pipeline = StableDiffusionControlNetPipeline(
	vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, feature_extractor=None,
	unet=unet, controlnet=controlnet, scheduler=scheduler, safety_checker=None, requires_safety_checker=False,
	)

	# Configure the pipeline's tiled VAE with separate encoder/decoder tile sizes.
	validation_pipeline._init_tiled_vae(encoder_tile_size=1024,
	decoder_tile_size=224)
	# Precision and device shared by every model below and by `magnify`.
	weight_dtype = torch.float16
	device = "cuda"


	# Move text_encoder, vae, unet and controlnet to the GPU and cast to weight_dtype
	text_encoder.to(device, dtype=weight_dtype)
	vae.to(device, dtype=weight_dtype)
	unet.to(device, dtype=weight_dtype)
	controlnet.to(device, dtype=weight_dtype)


	# RAM tagging model: built from the RAM checkpoint plus the DAPE condition
	# weights, put in eval mode and moved to the same device / dtype.
	# NOTE(review): unlike the models above, requires_grad_(False) is not called
	# on tag_model here.
	tag_model = ram(pretrained='preset/models/ram_swin_large_14m.pth',
	pretrained_condition='preset/models/DAPE.pth',
	image_size=384,
	vit='swin_l')
	tag_model.eval()
	tag_model.to(device, dtype=weight_dtype)

	def preprocess_image(input_image: Image.Image) -> Image.Image:
	    """Return a copy of ``input_image`` reduced to fit inside a 512x512 box.

	    The caller's image is not modified: ``thumbnail`` works in place, so it is
	    applied to a copy, using bilinear resampling.
	    """
	    max_side = 512
	    thumb = input_image.copy()
	    thumb.thumbnail((max_side, max_side), Image.Resampling.BILINEAR)
	    return thumb

	@spaces.GPU(duration=130)
	def preprocess_n_magnify(input_image: Image.Image, progress=gr.Progress(track_tqdm=True),):
	    """
	    Shrink the input image to a 512x512 thumbnail and magnify it 4x with the SeeSR pipeline.

	    The input is first reduced with ``preprocess_image`` and the result is handed
	    to ``magnify`` with its default settings (scale factor 4, 50 inference steps).

	    Args:
	        input_image (PIL.Image.Image): The source image to preprocess and magnify.

	    Returns:
	        tuple:
	            - The enlarged input image that was fed to the diffusion pipeline
	              (first value returned by ``magnify``; a PIL image).
	            - The magnified output (second value returned by ``magnify``;
	              a NumPy array).

	    Raises:
	        gr.Error: If no image was provided.
	    """
	    # Clicking the button with an empty image component delivers None; report
	    # that to the user instead of failing with an AttributeError on `.copy()`.
	    if input_image is None:
	        raise gr.Error("Please provide an image to magnify.")

	    processed_img = preprocess_image(input_image)

	    img, magnified_img = magnify(processed_img, progress=progress)

	    return (img, magnified_img)

	@spaces.GPU()
	def magnify(
	    input_image: Image.Image,
	    user_prompt="",
	    positive_prompt="clean, high-resolution, 8k, best quality, masterpiece",
	    negative_prompt="dotted, noise, blur, lowres, oversmooth, longbody, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality",
	    num_inference_steps=50,
	    scale_factor=4,
	    cfg_scale=7.5,
	    seed=123,
	    latent_tiled_size=320,
	    latent_tiled_overlap=4,
	    sample_times=1,
	    progress=gr.Progress(track_tqdm=True),
	):
	    """
	    Upscale ``input_image`` by ``scale_factor`` with the SeeSR ControlNet pipeline.

	    Steps performed:
	      1. Tag the image with the RAM model and build the text prompt from the
	         tags, ``positive_prompt`` and (if non-empty) ``user_prompt``.
	      2. Enlarge the image by ``scale_factor`` with a plain resize, make sure
	         its shorter side is at least 512 px and round both sides down to a
	         multiple of 8.
	      3. Run ``validation_pipeline`` on the enlarged image, apply
	         ``wavelet_color_fix`` against it and resize the result to
	         ``original size * scale_factor``.

	    Args:
	        input_image: Low-resolution PIL image.
	        user_prompt: Optional text prepended to the generated prompt.
	        positive_prompt: Text appended after the RAM tags.
	        negative_prompt: Negative prompt passed to the pipeline.
	        num_inference_steps: Number of diffusion steps.
	        scale_factor: Magnification factor (may be a float).
	        cfg_scale: Classifier-free guidance scale.
	        seed: Random seed; a negative value selects a random seed.
	        latent_tiled_size: Latent tile size passed to the pipeline.
	        latent_tiled_overlap: Latent tile overlap passed to the pipeline.
	        sample_times: Number of samples to generate (at least one is always
	            generated); only the first sample is returned.
	        progress: Gradio progress tracker (tracks tqdm bars).

	    Returns:
	        tuple: ``(enlarged_input, result)`` where ``enlarged_input`` is the
	        PIL image that was fed to the pipeline and ``result`` is the first
	        generated sample as a NumPy array. If the pipeline raises, the error
	        is printed and ``result`` is a black 512x512 image instead.
	    """
	    process_size = 512
	    resize_preproc = transforms.Compose([
	        transforms.Resize(process_size, interpolation=transforms.InterpolationMode.BILINEAR),
	    ])
	    # user_prompt = _generate_vlm_prompt(
	    #     vlm_model=vlm_model,
	    #     vlm_processor=vlm_processor,
	    #     process_vision_info=process_vision_info,
	    #     pil_image=input_image,
	    #     device=device,
	    # )

	    # UI widgets can deliver floats; normalise the integer-valued settings.
	    seed = int(seed)
	    if seed < 0:
	        # Negative means "random": draw from the OS so the value does not
	        # depend on RNG state left behind by an earlier seed_everything call.
	        seed = int.from_bytes(os.urandom(4), "big") & 0x7FFFFFFF
	    num_inference_steps = int(num_inference_steps)
	    sample_times = max(1, int(sample_times))

	    seed_everything(seed)
	    # The generator handed to the pipeline has its own state, so it must be
	    # seeded explicitly for `seed` to reach it.
	    generator = torch.Generator(device=device)
	    generator.manual_seed(seed)

	    # ram_transforms normalises with 3-channel statistics.
	    if input_image.mode != "RGB":
	        input_image = input_image.convert("RGB")

	    # Tagging is inference only; tag_model's parameters are not frozen, so
	    # disable autograd explicitly to avoid building a graph.
	    with torch.no_grad():
	        lq = tensor_transforms(input_image).unsqueeze(0).to(device).half()
	        lq = ram_transforms(lq)
	        res = inference(lq, tag_model)
	        ram_encoder_hidden_states = tag_model.generate_image_embeds(lq)

	    validation_prompt = f"{res[0]}, {positive_prompt},"
	    if user_prompt != '':
	        validation_prompt = f"{user_prompt}, {validation_prompt}"

	    ori_width, ori_height = input_image.size
	    rscale = scale_factor
	    # Integer target size, reused for the final resize so that a float
	    # scale factor works there as well.
	    target_size = (int(ori_width * rscale), int(ori_height * rscale))
	    input_image = input_image.resize(target_size)

	    if min(input_image.size) < process_size:
	        input_image = resize_preproc(input_image)

	    # Round both sides down to a multiple of 8 before running the pipeline.
	    input_image = input_image.resize((input_image.size[0] // 8 * 8, input_image.size[1] // 8 * 8))
	    width, height = input_image.size

	    images = []
	    for _ in range(sample_times):
	        try:
	            with torch.autocast("cuda"):
	                image = validation_pipeline(
	                    validation_prompt, input_image, negative_prompt=negative_prompt,
	                    num_inference_steps=num_inference_steps, generator=generator,
	                    height=height, width=width,
	                    guidance_scale=cfg_scale, conditioning_scale=1,
	                    start_point='lr', start_steps=999, ram_encoder_hidden_states=ram_encoder_hidden_states,
	                    latent_tiled_size=latent_tiled_size, latent_tiled_overlap=latent_tiled_overlap,
	                ).images[0]

	            image = wavelet_color_fix(image, input_image)
	            image = image.resize(target_size)
	        except Exception as e:
	            # Best-effort fallback kept from the original: log the error and
	            # return a black placeholder instead of failing the request.
	            print(e)
	            image = Image.new(mode="RGB", size=(512, 512))
	        images.append(np.array(image))
	    return input_image, images[0]


	# Custom CSS passed to gr.Blocks: centres the element with id
	# "col-container" and caps its width at 1024px.
	css = """
	#col-container {
	margin: 0 auto;
	max-width: 1024px;
	}
	"""
	# Gradio theme used for the whole demo.
	theme = gr.themes.Ocean()

	# Build the Gradio UI: a title, an input image with a "Magnify 4x" button,
	# an image slider for the result, and three example images.
	with gr.Blocks(css=css, theme=theme) as demo:

	with gr.Column(elem_id="col-container"):

	# Title banner.
	with gr.Row():
	gr.HTML(
	"""
	<div style="text-align: center;">
	<p style="font-size:16px; display: inline; margin: 0;">
	<strong>🖼️ Super-Resolution</strong>
	</p>
	</div>
	"""
	)
	with gr.Row():
	with gr.Column():
	# Input image is delivered to the handlers as a PIL image.
	input_image = gr.Image(type="pil", height=512)
	run_button = gr.Button("🔎 Magnify 4x", variant="primary")
	# NOTE(review): hidden and not referenced by any handler in this file.
	duration_time = gr.Text(label="duration time", value=60, visible=False)
	# NOTE(review): the accordion is created with visible=False and none of
	# the controls below are passed as inputs to the click/upload handlers,
	# so `magnify` always runs with its own default arguments.
	with gr.Accordion("Options", visible=False):
	user_prompt = gr.Textbox(label="User Prompt", value="")
	positive_prompt = gr.Textbox(label="Positive Prompt", value="clean, high-resolution, 8k, best quality, masterpiece")
	negative_prompt = gr.Textbox(
	label="Negative Prompt",
	value="dotted, noise, blur, lowres, oversmooth, longbody, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality"
	)
	# NOTE(review): step=0 looks unintended for a slider whose default is 7.5;
	# confirm the desired increment (e.g. 0.1) before exposing this control.
	cfg_scale = gr.Slider(label="Classifier Free Guidance Scale (Set to 1.0 in sd-turbo)", minimum=1, maximum=10, value=7.5, step=0)
	num_inference_steps = gr.Slider(label="Inference Steps", minimum=2, maximum=100, value=50, step=1)
	# NOTE(review): the slider default (231) differs from magnify's default seed (123).
	seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, value=231)
	sample_times = gr.Slider(label="Sample Times", minimum=1, maximum=10, step=1, value=1)
	latent_tiled_size = gr.Slider(label="Diffusion Tile Size", minimum=128, maximum=480, value=320, step=1)
	latent_tiled_overlap = gr.Slider(label="Diffusion Tile Overlap", minimum=4, maximum=16, value=4, step=1)
	scale_factor = gr.Number(label="SR Scale", value=4)
	with gr.Column():
	# Output component; it receives the pair returned by preprocess_n_magnify.
	result_gallery = ImageSlider(
	interactive=False,
	label="Magnified",
	position=0.5
	)
	# Example inputs; cache_examples=True asks Gradio to cache the outputs of
	# preprocess_n_magnify for these images.
	examples = gr.Examples(
	examples=[
	[
	"preset/datasets/test_datasets/179.png",
	],
	[
	"preset/datasets/test_datasets/cinema.png",
	],
	[
	"preset/datasets/test_datasets/cartoon.png",
	],

	],
	inputs=[
	input_image,
	],
	outputs=[result_gallery],
	fn=preprocess_n_magnify,
	cache_examples=True,
	)
	# NOTE(review): `inputs` is not used by the handlers below, which pass
	# `input_image` directly.
	inputs = [
	input_image,
	]
	# Button click: preprocess and magnify the current image.
	run_button.click(fn=preprocess_n_magnify, inputs=input_image, outputs=[result_gallery])
	# On upload: replace the uploaded image with its 512x512 thumbnail
	# (this handler is registered with show_api=False).
	input_image.upload(fn=preprocess_image,inputs=input_image, outputs=input_image, show_api=False)

	# Start the Gradio server; `share=True` requests a public share link and
	# `mcp_server=True` asks Gradio to also expose the app as an MCP server.
	demo.launch(share=True, mcp_server=True)