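"""ImageReward scoring module: a BLIP backbone with an MLP head that rates how
well an image matches a text prompt."""
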
from typing import List, Union

import torch
from PIL import Image
from safetensors.torch import load_file
from torchvision.transforms import (
    CenterCrop,
    Compose,
    InterpolationMode,
    Normalize,
    Resize,
    ToTensor,
)

from .BLIP.blip_pretrain import BLIP_Pretrain
from .config import MODEL_PATHS

BICUBIC = InterpolationMode.BICUBIC


def _convert_image_to_rgb(image):
    return image.convert("RGB")


def _transform(n_px):
    """Build the CLIP-style preprocessing pipeline for n_px x n_px inputs."""
    return Compose([
        Resize(n_px, interpolation=BICUBIC),
        CenterCrop(n_px),
        _convert_image_to_rgb,
        ToTensor(),
        # Per-channel normalization statistics from CLIP.
        Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
    ])
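

# Illustrative use of the pipeline (the filename is a placeholder):
#
#   preprocess = _transform(224)
#   tensor = preprocess(Image.open("example.png"))  # float tensor, shape (3, 224, 224)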


class MLP(torch.nn.Module):
    """Regression head: maps a 768-d BLIP text feature to a scalar reward."""

    def __init__(self, input_size):
        super().__init__()
        self.input_size = input_size
        self.layers = torch.nn.Sequential(
            torch.nn.Linear(self.input_size, 1024),
            # torch.nn.ReLU(),
            torch.nn.Dropout(0.2),
            torch.nn.Linear(1024, 128),
            # torch.nn.ReLU(),
            torch.nn.Dropout(0.2),
            torch.nn.Linear(128, 64),
            # torch.nn.ReLU(),
            torch.nn.Dropout(0.1),
            torch.nn.Linear(64, 16),
            # torch.nn.ReLU(),
            torch.nn.Linear(16, 1),
        )
        # Initialize the MLP parameters.
        for name, param in self.layers.named_parameters():
            if 'weight' in name:
                torch.nn.init.normal_(param, mean=0.0, std=1.0 / (self.input_size + 1))
            if 'bias' in name:
                torch.nn.init.constant_(param, val=0)

    def forward(self, x):
        return self.layers(x)
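

# Shape sketch (illustrative): the head consumes a batch of 768-d features and
# emits one scalar per item.
#
#   mlp = MLP(768)
#   features = torch.randn(4, 768)      # hypothetical batch of 4 features
#   scores = mlp(features)              # shape (4, 1)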


class ImageReward(torch.nn.Module):
    def __init__(self, med_config, device='cpu', bert_model_path=""):
        super().__init__()
        self.device = device
        self.blip = BLIP_Pretrain(image_size=224, vit='large', med_config=med_config, bert_model_path=bert_model_path)
        self.preprocess = _transform(224)
        self.mlp = MLP(768)
        # Statistics used to standardize raw MLP outputs into reward scores.
        self.mean = 0.16717362830052426
        self.std = 1.0333394966054072

    def score_grad(self, prompt_ids, prompt_attention_mask, image):
        """Calculate the score, retaining gradients, for an image and a prompt.

        Args:
            prompt_ids (torch.Tensor): Tokenized prompt IDs.
            prompt_attention_mask (torch.Tensor): Attention mask for the prompt.
            image (torch.Tensor): The preprocessed image tensor.

        Returns:
            torch.Tensor: The reward score.
        """
        image_embeds = self.blip.visual_encoder(image)
        image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(self.device)
        text_output = self.blip.text_encoder(
            prompt_ids,
            attention_mask=prompt_attention_mask,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_atts,
            return_dict=True,
        )
        # The [CLS] token of the cross-attended text encoder is the joint feature.
        txt_features = text_output.last_hidden_state[:, 0, :]
        rewards = self.mlp(txt_features)
        rewards = (rewards - self.mean) / self.std
        return rewards
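
    # Preparing inputs for score_grad (a sketch; mirrors the tokenizer call in
    # _calculate_score below, and "a corgi" is an illustrative prompt):
    #
    #   text_input = model.blip.tokenizer("a corgi", padding='max_length',
    #                                     truncation=True, max_length=35,
    #                                     return_tensors="pt").to(model.device)
    #   rewards = model.score_grad(text_input.input_ids,
    #                              text_input.attention_mask, image_tensor)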

    def score(self, images: Union[str, List[str], Image.Image, List[Image.Image]], prompt: str = "") -> List[float]:
        """Score the images against the prompt.

        Args:
            images (Union[str, List[str], Image.Image, List[Image.Image]]): Path(s) to the image(s) or PIL image(s).
            prompt (str): The prompt text.

        Returns:
            List[float]: List of scores, one per image.
        """
        if isinstance(images, (str, Image.Image)):
            # Single image.
            if isinstance(images, str):
                pil_image = Image.open(images)
            else:
                pil_image = images
            image = self.preprocess(pil_image).unsqueeze(0).to(self.device)
            return [self._calculate_score(prompt, image).item()]
        elif isinstance(images, list):
            # Multiple images.
            scores = []
            for one_image in images:
                if isinstance(one_image, str):
                    pil_image = Image.open(one_image)
                elif isinstance(one_image, Image.Image):
                    pil_image = one_image
                else:
                    raise TypeError("images must contain file paths (str) or PIL.Image instances.")
                image = self.preprocess(pil_image).unsqueeze(0).to(self.device)
                scores.append(self._calculate_score(prompt, image).item())
            return scores
        else:
            raise TypeError("images must be a str path, a PIL.Image, or a list of either.")

    def _calculate_score(self, prompt: str, image: torch.Tensor) -> torch.Tensor:
        """Calculate the score for a single preprocessed image and a prompt.

        Args:
            prompt (str): The prompt text.
            image (torch.Tensor): The preprocessed image tensor.

        Returns:
            torch.Tensor: The reward score.
        """
        text_input = self.blip.tokenizer(prompt, padding='max_length', truncation=True, max_length=35, return_tensors="pt").to(self.device)
        image_embeds = self.blip.visual_encoder(image)
        image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(self.device)
        text_output = self.blip.text_encoder(
            text_input.input_ids,
            attention_mask=text_input.attention_mask,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_atts,
            return_dict=True,
        )
        txt_features = text_output.last_hidden_state[:, 0, :].float()
        rewards = self.mlp(txt_features)
        rewards = (rewards - self.mean) / self.std
        return rewards

    def inference_rank(self, prompt: str, generations_list: List[Union[str, Image.Image]]) -> tuple:
        """Rank the images by how well they match the prompt.

        Args:
            prompt (str): The prompt text.
            generations_list (List[Union[str, Image.Image]]): List of image paths or PIL images.

        Returns:
            tuple: (indices, rewards), where indices[i] is the 1-based rank of
            the i-th image (1 = best) and rewards are the corresponding scores.
        """
        text_input = self.blip.tokenizer(prompt, padding='max_length', truncation=True, max_length=35, return_tensors="pt").to(self.device)
        txt_set = []
        for generation in generations_list:
            if isinstance(generation, str):
                pil_image = Image.open(generation)
            elif isinstance(generation, Image.Image):
                pil_image = generation
            else:
                raise TypeError("generations_list must contain file paths (str) or PIL.Image instances.")
            image = self.preprocess(pil_image).unsqueeze(0).to(self.device)
            image_embeds = self.blip.visual_encoder(image)
            image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(self.device)
            text_output = self.blip.text_encoder(
                text_input.input_ids,
                attention_mask=text_input.attention_mask,
                encoder_hidden_states=image_embeds,
                encoder_attention_mask=image_atts,
                return_dict=True,
            )
            txt_set.append(text_output.last_hidden_state[:, 0, :])
        txt_features = torch.cat(txt_set, 0).float()
        rewards = self.mlp(txt_features)
        rewards = (rewards - self.mean) / self.std
        rewards = torch.squeeze(rewards)
        # Sort scores descending, then invert the permutation to obtain each
        # image's 1-based rank.
        _, rank = torch.sort(rewards, dim=0, descending=True)
        _, indices = torch.sort(rank, dim=0)
        indices = indices + 1
        return indices.detach().cpu().numpy().tolist(), rewards.detach().cpu().numpy().tolist()
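
    # Ranking semantics (illustrative numbers): for rewards [0.2, 1.5, -0.3]
    # the descending order is image 2, image 1, image 3, so the returned
    # indices are [2, 1, 3] and image 2 is ranked first.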


class ImageRewardScore(torch.nn.Module):
    def __init__(self, device: Union[str, torch.device], path: dict = MODEL_PATHS):
        super().__init__()
        self.device = device if isinstance(device, torch.device) else torch.device(device)
        model_path = path.get("imagereward")
        med_config = path.get("med_config")
        state_dict = load_file(model_path)
        self.model = ImageReward(device=self.device, med_config=med_config, bert_model_path=path.get("bert_model_path")).to(self.device)
        self.model.load_state_dict(state_dict, strict=False)
        self.model.eval()

    def score(self, images: Union[str, List[str], Image.Image, List[Image.Image]], prompt: str) -> List[float]:
        """Score the images against the prompt.

        Args:
            images (Union[str, List[str], Image.Image, List[Image.Image]]): Path(s) to the image(s) or PIL image(s).
            prompt (str): The prompt text.

        Returns:
            List[float]: List of scores, one per image.
        """
        return self.model.score(images, prompt)
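

if __name__ == "__main__":
    # Minimal usage sketch. Assumes MODEL_PATHS in .config points at valid
    # checkpoint files and that "example.png" exists; both are illustrative.
    # Because of the relative imports, run this as a module, e.g.
    # `python -m <package>.<this_module>`.
    scorer = ImageRewardScore(device="cuda" if torch.cuda.is_available() else "cpu")
    with torch.no_grad():
        print(scorer.score("example.png", "a photo of a corgi wearing a hat"))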