# PinPoint/Finetuning/src/open_clip/extract_features.py
import os
# import clip  # needed only if the OpenAI CLIP branches ("ViT-*" / "RN*") below are enabled
import torch
import open_clip
import numpy as np
from sklearn.linear_model import LogisticRegression
from torchvision.datasets import CIFAR100
from tqdm import tqdm
from joblib import dump, load
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
import torchvision.transforms as transforms
import torchvision
import pandas as pd
from pathlib import Path
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import pickle
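
# `update_keys` is referenced further below when loading the mmselfsup-format
# ResNet-50 checkpoints but is not defined in this file. The sketch here is an
# assumption: it presumes the checkpoint stores backbone weights under a
# 'backbone.' prefix and that any head weights can be discarded; adapt it to
# the actual checkpoint layout if it differs.
def update_keys(state_dict):
    """Remap an mmselfsup-style state_dict to torchvision ResNet-50 key names (sketch)."""
    remapped = {}
    for key, value in state_dict.items():
        if key.startswith('backbone.'):
            remapped[key[len('backbone.'):]] = value
        elif not key.startswith(('head.', 'fc.')):
            remapped[key] = value
    return remapped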
class PHASE(Dataset):
"""PHASE dataset."""
def __init__(self, csv_file, root_dir, transform=None, resolution=224):
"""
Arguments:
csv_file (string): Path to the csv file with annotations.
root_dir (string): Directory with all the images.
transform (callable, optional): Optional transform to be applied
on a sample.
"""
self.annotations = pd.read_csv(csv_file, sep=' ', header=None)
# print(self.annotations)
self.root_dir = root_dir
self.transform = transform
self.base_transforms = Compose([
Resize((resolution, resolution), interpolation=Image.BICUBIC)
])
def __len__(self):
return len(self.annotations)
def __getitem__(self, idx):
if torch.is_tensor(idx):
idx = idx.tolist()
img_name = os.path.join(self.root_dir,
self.annotations.iloc[idx, 0])
image = Image.open(img_name).convert('RGB')
label = self.annotations.iloc[idx, 1]
image = self.base_transforms(image)
if self.transform:
image = self.transform(image)
        return image, label
class FACET(Dataset):
"""Face Landmarks dataset."""
def __init__(self, csv_file, root_dir, transform=None):
"""
Arguments:
csv_file (string): Path to the csv file with annotations.
root_dir (string): Directory with all the images.
transform (callable, optional): Optional transform to be applied
on a sample.
"""
self.annotations = pd.read_csv(csv_file, sep=' ', header=None)
self.root_dir = root_dir
self.transform = transform
def __len__(self):
return len(self.annotations)
def __getitem__(self, idx):
if torch.is_tensor(idx):
idx = idx.tolist()
img_name = os.path.join(self.root_dir,
self.annotations.iloc[idx, 0])
image = Image.open(img_name).convert('RGB')
label = self.annotations.iloc[idx, 1]
base_transforms = Compose([
Resize((224, 224), interpolation=Image.BICUBIC)
])
image = base_transforms(image)
if self.transform:
image = self.transform(image)
return image, label
class MORPH(Dataset):
"""MORPH dataset."""
def __init__(self, csv_file, root_dir, transform=None):
"""
Arguments:
csv_file (string): Path to the csv file with annotations.
root_dir (string): Directory with all the images.
transform (callable, optional): Optional transform to be applied
on a sample.
"""
self.annotations = pd.read_csv(csv_file, sep=',', header=0)
self.root_dir = root_dir
self.transform = transform
def __len__(self):
return len(self.annotations)
def __getitem__(self, idx):
if torch.is_tensor(idx):
idx = idx.tolist()
        # NOTE: the "filepath" column is assumed to already hold a full path,
        # so self.root_dir is not joined in here.
        img_name = self.annotations.iloc[idx]["filepath"]
        image = Image.open(img_name).convert('RGB')
label = self.annotations.iloc[idx]["gender"]
base_transforms = Compose([
Resize((224, 224), interpolation=Image.BICUBIC)
])
image = base_transforms(image)
if self.transform:
image = self.transform(image)
return image, label
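
# The rest of the script pairs each backbone name in `models` with a checkpoint
# path (or hub identifier) in `weights`, extracts frozen image features for the
# selected dataset, caches them as pickles, and fits logistic-regression probes
# over a small sweep of C values.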
# Load the model
device = "cuda" if torch.cuda.is_available() else "cpu"
device = "cuda" if torch.cuda.is_available() else "cpu"
resnet_model = torchvision.models.resnet50(pretrained=False)
resnet_model.fc = torch.nn.Identity()
resnet_model.eval()
features_root = "features_facet_training_set"
cls_root = "classifiers_facet"
models = (
# # Supervised
# "resnet18",
# "resnet34",
# "resnet50",
# "resnet101",
# "resnet152",
# "vit_b_16",
# "vit_b_32",
# "vit_l_16",
# "vit_l_32",
# # Self-Supervised RN50
# 'swav',
# 'simclr',
# 'moco_v2',
# 'npid',
# 'deepcluster_v2',
# 'jigsaw',
# 'odc',
# # DINO v1
# 'dino_v1_cnn',
# 'dino_v1_vit_b_16',
# "dino_v1_vit_s_16",
# # DINO v2
# "dino_v2_vit_s_14",
# 'dino_v2_vit_b_14',
# "dino_v2_vit_l_14",
# "dino_v2_vit_g_14",
# # CLIP OpenAI
# "ViT-B/16",
# "ViT-B/32",
# "ViT-L/14",
# "ViT-L/14@336px",
# "RN50",
# "RN101",
# # CLIP OpenCLIP
#"vit_b_16_400m",
# "vit_b_16_2b",
# "vit_l_14_400m",
# "vit_l_14_2b",
# "vit_b_32_400m",
# "vit_b_32_2b",
# OpenCLIP CC3M - ours
# "vit_b_16_cc3m_50_28ep",
"vit_b_16_cc3m_50_30ep",
# "vit_b_16_cc3m_50",
"vit_b_16_cc3m_original",
"vit_b_16_cc3m_50_30ep_difficult_batches",
# OpenCLIP CC3M - full regeneration
"rn50_cc3m_mix_000",
"rn50_cc3m_mix_100",
)
weights = (
# # ResNet's
# "supervised_torch_hub",
# "supervised_torch_hub",
# "supervised_torch_hub",
# "supervised_torch_hub",
# "supervised_torch_hub",
# # ViT's
# "supervised_torch_hub",
# "supervised_torch_hub",
# "supervised_torch_hub",
# "supervised_torch_hub",
# # SSL
# "/home/kis/Desktop/rhome/kis/code/mmselfsup/pretrained_models/official_weights/mmselfsup_format/swav_backbone.pth",
# "/home/kis/Desktop/rhome/kis/code/mmselfsup/pretrained_models/official_weights/mmselfsup_format/simclr_backbone.pth",
# "/home/kis/Desktop/rhome/kis/code/mmselfsup/pretrained_models/official_weights/mmselfsup_format/moco_v2_backbone.pth",
# "/home/kis/Desktop/rhome/kis/code/mmselfsup/pretrained_models/official_weights/mmselfsup_format/npid_backbone.pth",
# "/home/kis/Desktop/rhome/kis/code/mmselfsup/pretrained_models/official_weights/mmselfsup_format/deepcluster_v2_backbone.pth",
# "/home/kis/Desktop/rhome/kis/code/mmselfsup/pretrained_models/official_weights/mmselfsup_format/jigsaw_backbone.pth",
# "/home/kis/Desktop/rhome/kis/code/mmselfsup/pretrained_models/official_weights/mmselfsup_format/odc_r50_v1-5af5dd0c.pth",
# # DINO-v1
# "dino_facebook_hub",
# "dino_facebook_hub",
# "dino_facebook_hub",
# # DINO-v2
# "dino_facebook_hub",
# "dino_facebook_hub",
# "dino_facebook_hub",
# "dino_facebook_hub",
# # CLIP OpenAI
# "OpenAI hub",
# "OpenAI hub",
# "OpenAI hub",
# "OpenAI hub",
# "OpenAI hub",
# "OpenAI hub",
# # CLIP OpenCLIP
#"OpenCLIP hub",
# "OpenCLIP hub",
# "OpenCLIP hub",
# "OpenCLIP hub",
# "OpenCLIP hub",
# "OpenCLIP hub",
# OpenCLIP CC3M - ours
# "/home/kis/Desktop/rhome/kis/code/open_clip_latest/open_clip/logs/2024_08_27-11_48_49-model_ViT-B-16-lr_0.001-b_410-j_8-p_amp/checkpoints/epoch_28.pt",
"/home/kis/Desktop/rhome/kis/code/open_clip_latest/open_clip/logs/2024_08_27-11_48_49-model_ViT-B-16-lr_0.001-b_410-j_8-p_amp/checkpoints/epoch_30.pt",
# "/home/kis/Desktop/rhome/kis/code/open_clip_latest/open_clip/logs/2024_08_27-11_48_49-model_ViT-B-16-lr_0.001-b_410-j_8-p_amp/checkpoints/epoch_16.pt",
"/home/kis/Desktop/rhome/kis/code/open_clip/logs/2024_07_12-19_17_23-model_ViT-B-16-lr_0.001-b_410-j_4-p_amp/checkpoints/epoch_30.pt",
"/home/kis/Desktop/rhome/kis/code/open_clip_latest/open_clip/logs/2024_09_15-14_07_26-model_ViT-B-16-lr_0.001-b_410-j_8-p_amp/checkpoints/epoch_30.pt",
# OpenCLIP CC3M - full regeneration
"/home/kis/code/models/models/cc3m_mix_000/epoch_50.pt",
"/home/kis/code/models/models/cc3m_mix_100/epoch_50.pt",
)
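# `models` and `weights` are consumed pairwise via zip() below, so the two
# tuples must stay index-aligned; the assert is a small added sanity check.
assert len(models) == len(weights), "models and weights must have the same length"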
model_idx = 0
for model_name, weight in zip(models, weights):
    print("\n\n", model_name)
    preprocess = None
    clip_like = False
    model_type = 'transformer'  # reset per model; the CNN branches below override this
if model_name == 'dino_v1_cnn':
model = torch.hub.load('facebookresearch/dino:main', 'dino_resnet50')
model.fc = torch.nn.Identity()
model.eval()
model_type = 'cnn'
elif model_name == "vit_b_16_cc3m_50":
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-16', pretrained=weight)
elif model_name == "vit_b_16_cc3m_50_28ep":
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-16', pretrained=weight)
elif model_name == "vit_b_16_cc3m_50_30ep_difficult_batches":
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-16', pretrained=weight)
elif model_name == "vit_b_16_cc3m_50_30ep":
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-16', pretrained=weight)
elif model_name == "rn50_cc3m_mix_000":
model, _, preprocess = open_clip.create_model_and_transforms('RN50', pretrained=weight)
elif model_name == "vit_b_16_cc3m_future_models":
model, _, preprocess = open_clip.create_model_and_transforms('RN50', pretrained=weight)
elif model_name == "vit_b_16_cc3m_original":
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-16', pretrained=weight)
elif model_name == 'dino_v1_vit_s_16':
model = torch.hub.load('facebookresearch/dino:main', 'dino_vits16')
elif model_name == 'dino_v1_vit_b_16':
model = torch.hub.load('facebookresearch/dino:main', 'dino_vitb16')
elif model_name == 'dino_v2_vit_s_14':
model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14')
elif model_name == 'dino_v2_vit_b_14':
model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitb14')
elif model_name == 'dino_v2_vit_l_14':
model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14')
elif model_name == 'dino_v2_vit_g_14':
model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitg14')
elif model_name == "vit_b_16_400m":
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-16', pretrained='laion400m_e32')
clip_like = True
elif model_name == "vit_b_16_2b":
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-16', pretrained='laion2b_s34b_b88k')
clip_like = True
elif model_name == "vit_b_32_400m":
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion400m_e32')
clip_like = True
elif model_name == "vit_b_32_2b":
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')
clip_like = True
elif model_name == "vit_l_14_400m":
model, _, preprocess = open_clip.create_model_and_transforms('ViT-L-14', pretrained='laion400m_e32')
clip_like = True
elif model_name == "vit_l_14_2b":
model, _, preprocess = open_clip.create_model_and_transforms('ViT-L-14', pretrained='laion2b_s32b_b82k')
clip_like = True
elif "resnet" in model_name:
model = torch.hub.load('pytorch/vision:v0.10.0', model_name, pretrained=True)
model.fc = torch.nn.Identity()
model.eval()
model_type = 'cnn'
elif "vit" in model_name:
model = torch.hub.load('pytorch/vision', model_name, weights='IMAGENET1K_V1')
model.heads = torch.nn.Identity()
elif "ViT" in model_name:
model, preprocess = clip.load(model_name, device)
clip_like = True
elif "RN" in model_name:
model, preprocess = clip.load(model_name, device)
model.visual.attnpool = torch.nn.AdaptiveAvgPool2d((1,1)) # replace Attention pool with Avgpool
clip_like = True
model_type = 'cnn'
    else:
        # Self-supervised ResNet-50 checkpoints (mmselfsup format); update_keys
        # remaps the checkpoint keys to torchvision names (a sketch is provided
        # near the imports).
        w = torch.load(weight, map_location='cpu')
        model = torchvision.models.resnet50(pretrained=False)
        model.fc = torch.nn.Identity()
        model.eval()
        model.load_state_dict(update_keys(w['state_dict']), strict=True)
        model_type = 'cnn'
    model.to(device)
if 'simclr' in model_name:
img_norm_cfg = dict(mean=[0., 0., 0.], std=[1., 1., 1.])
else:
img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
transform_test = transforms.Compose([
transforms.CenterCrop((224, 224)),
transforms.ToTensor(),
transforms.Normalize(**img_norm_cfg)
])
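    # NOTE (assumption): transform_test applies ImageNet normalisation to every
    # model. CLIP/OpenCLIP checkpoints were trained with their own statistics;
    # to match their pre-training pipeline, the `preprocess` returned by
    # open_clip.create_model_and_transforms (or clip.load) could be passed to
    # the datasets instead, e.g.:
    #     eval_transform = preprocess if preprocess is not None else transform_test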
# # model_name = 'CLIP_RN50'
# features_root = "../../features/MORPH/"
# # features_root = "../../features/FACET/"
# train_dataset = MORPH(csv_file=f'/home/kis/Desktop/rhome/kis/datasets/morph/Dataset/Index/Train.csv',
# root_dir='/home/kis/Desktop/rhome/kis/datasets/morph/Dataset/',
# transform=transform_test
# )
# val_dataset = MORPH(csv_file=f'/home/kis/Desktop/rhome/kis/datasets/morph/Dataset/Index/Validation.csv',
# root_dir='/home/kis/Desktop/rhome/kis/datasets/morph/Dataset/',
# transform=transform_test
# )
# def get_features(dataset):
# all_features = []
# all_labels = []
# with torch.no_grad():
# for images, labels in tqdm(DataLoader(dataset, batch_size=512)):
# if model_type == 'cnn':
# features = model(images.to(device))
# else:
# features = model.encode_image(images.to(device))
# print(features.shape)
# all_features.append(features)
# all_labels.append(labels)
# return torch.cat(all_features).cpu().numpy(), torch.cat(all_labels).cpu().numpy()
# # Calculate the image features
# feat_path = Path(f"{features_root}/{model_name}_features.pkl")
# if feat_path.exists():
# print("Already extracted!")
# with open(f"{features_root}/{model_name}_features.pkl", 'rb') as f:
# features = pickle.load(f)
# with open(f"{features_root}/{model_name}_labels.pkl", 'rb') as f:
# labels = pickle.load(f)
# with open(f"{features_root}/{model_name}_features_val.pkl", 'rb') as f:
# features_val = pickle.load(f)
# with open(f"{features_root}/{model_name}_labels_val.pkl", 'rb') as f:
# labels_val = pickle.load(f)
# else:
# features, labels = get_features(train_dataset)
# print(labels)
# with open(f"{features_root}/{model_name}_features.pkl", 'wb') as f:
# pickle.dump(features, f)
# with open(f"{features_root}/{model_name}_labels.pkl", 'wb') as f:
# pickle.dump(labels, f)
# features_val, labels_val = get_features(val_dataset)
# with open(f"{features_root}/{model_name}_features_val.pkl", 'wb') as f:
# pickle.dump(features_val, f)
# with open(f"{features_root}/{model_name}_labels_val.pkl", 'wb') as f:
# pickle.dump(labels_val, f)
# print("Done!")
# for i in range(1, 10):
# c = i * 0.1
# classifier = LogisticRegression(random_state=0, C=c, max_iter=10000, verbose=0)
# classifier.fit(features, labels)
# # classifier = load(f'{model_name}_logistic_regression_classifier_c_{c}.joblib')
# predictions = classifier.predict(features_val)
# dump(classifier, f'{features_root}/{model_name}_logistic_regression_classifier_c_{c}.joblib')
# with open(f'{features_root}/{model_name}_predictions_c_{c}.pkl', 'wb') as f:
# pickle.dump(predictions, f)
# pd.DataFrame(predictions).to_csv(f"{features_root}/{model_name}_predictions_c_{c}.txt")
# accuracy = np.mean((labels_val == predictions).astype(float)) * 100.
# print(f"C={c}, Accuracy = {accuracy:.3f}")
# features_root = "../../features/FACET/"
# train_dataset = FACET(csv_file=f'/home/kis/Desktop/rhome/kis/datasets/facet/train_seed_0.csv',
# root_dir='/home/kis/Desktop/rhome/kis/datasets/facet/images_bb/',
# transform=preprocess)
# val_dataset = FACET(csv_file=f'/home/kis/Desktop/rhome/kis/datasets/facet/test_val_seed_0.csv',
# root_dir='/home/kis/Desktop/rhome/kis/datasets/facet/images_bb/',
# transform=preprocess)
# def get_features(dataset):
# all_features = []
# all_labels = []
# with torch.no_grad():
# for images, labels in tqdm(DataLoader(dataset, batch_size=512)):
# if model_type == 'cnn':
# features = model(images.to(device))
# else:
# features = model.encode_image(images.to(device))
# print(features.shape)
# all_features.append(features)
# all_labels.append(labels)
# return torch.cat(all_features).cpu().numpy(), torch.cat(all_labels).cpu().numpy()
# # Calculate the image features
# feat_path = Path(f"{features_root}/{model_name}_features.pkl")
# if feat_path.exists():
# print("Already extracted!")
# with open(f"{features_root}/{model_name}_features.pkl", 'rb') as f:
# features = pickle.load(f)
# with open(f"{features_root}/{model_name}_labels.pkl", 'rb') as f:
# labels = pickle.load(f)
# with open(f"{features_root}/{model_name}_features_val.pkl", 'rb') as f:
# features_val = pickle.load(f)
# with open(f"{features_root}/{model_name}_labels_val.pkl", 'rb') as f:
# labels_val = pickle.load(f)
# else:
# features, labels = get_features(train_dataset)
# with open(f"{features_root}/{model_name}_features.pkl", 'wb') as f:
# pickle.dump(features, f)
# with open(f"{features_root}/{model_name}_labels.pkl", 'wb') as f:
# pickle.dump(labels, f)
# features_val, labels_val = get_features(val_dataset)
# with open(f"{features_root}/{model_name}_features_val.pkl", 'wb') as f:
# pickle.dump(features_val, f)
# with open(f"{features_root}/{model_name}_labels_val.pkl", 'wb') as f:
# pickle.dump(labels_val, f)
# print("Done!")
# for i in range(1, 10):
# c = i * 0.1
# classifier = LogisticRegression(random_state=0, C=c, max_iter=10000, verbose=0)
# classifier.fit(features, labels)
# # classifier = load(f'{model_name}_logistic_regression_classifier_c_{c}.joblib')
# predictions = classifier.predict(features_val)
# dump(classifier, f'{features_root}/{model_name}_logistic_regression_classifier_c_{c}.joblib')
# with open(f'{features_root}/{model_name}_predictions_c_{c}.pkl', 'wb') as f:
# pickle.dump(predictions, f)
# pd.DataFrame(predictions).to_csv(f"{features_root}/{model_name}_predictions_c_{c}.txt")
# accuracy = np.mean((labels_val == predictions).astype(float)) * 100.
# print(f"C={c}, Accuracy = {accuracy:.3f}")
features_root = "../../features/PHASE_EMOTIONS/"
train_dataset = PHASE(csv_file=f'/home/kis/Desktop/rhome/kis/datasets/phase/phase_annotations/train_annotations_emotion.txt',
root_dir='/home/kis/Desktop/rhome/kis/datasets/phase/images/train_bb/',
transform=transform_test
)
val_dataset = PHASE(csv_file=f'/home/kis/Desktop/rhome/kis/datasets/phase/phase_annotations/val_annotations_emotion.txt',
root_dir='/home/kis/Desktop/rhome/kis/datasets/phase/images/val_bb/',
transform=transform_test
)
    def get_features(dataset):
        """Run the current model over `dataset` and return (features, labels) as numpy arrays."""
        all_features = []
        all_labels = []
with torch.no_grad():
for images, labels in tqdm(DataLoader(dataset, batch_size=512)):
if model_type == 'cnn':
features = model(images.to(device))
else:
features = model.encode_image(images.to(device))
print(features.shape)
all_features.append(features)
all_labels.append(labels)
return torch.cat(all_features).cpu().numpy(), torch.cat(all_labels).cpu().numpy()
# Calculate the image features
feat_path = Path(f"{features_root}/{model_name}_features.pkl")
if feat_path.exists():
print("Already extracted!")
with open(f"{features_root}/{model_name}_features.pkl", 'rb') as f:
features = pickle.load(f)
with open(f"{features_root}/{model_name}_labels.pkl", 'rb') as f:
labels = pickle.load(f)
with open(f"{features_root}/{model_name}_features_val.pkl", 'rb') as f:
features_val = pickle.load(f)
with open(f"{features_root}/{model_name}_labels_val.pkl", 'rb') as f:
labels_val = pickle.load(f)
else:
features, labels = get_features(train_dataset)
with open(f"{features_root}/{model_name}_features.pkl", 'wb') as f:
pickle.dump(features, f)
with open(f"{features_root}/{model_name}_labels.pkl", 'wb') as f:
pickle.dump(labels, f)
features_val, labels_val = get_features(val_dataset)
with open(f"{features_root}/{model_name}_features_val.pkl", 'wb') as f:
pickle.dump(features_val, f)
with open(f"{features_root}/{model_name}_labels_val.pkl", 'wb') as f:
pickle.dump(labels_val, f)
print("Done!")
for i in range(1, 10):
c = i * 0.1
classifier = LogisticRegression(random_state=0, C=c, max_iter=10000, verbose=0, class_weight="balanced")
classifier.fit(features, labels)
# classifier = load(f'{model_name}_logistic_regression_classifier_c_{c}.joblib')
predictions = classifier.predict(features_val)
dump(classifier, f'{features_root}/{model_name}_logistic_regression_classifier_c_{c}.joblib')
with open(f'{features_root}/{model_name}_predictions_c_{c}.pkl', 'wb') as f:
pickle.dump(predictions, f)
pd.DataFrame(predictions).to_csv(f"{features_root}/{model_name}_predictions_c_{c}.txt")
accuracy = np.mean((labels_val == predictions).astype(float)) * 100.
print(f"C={c}, Accuracy = {accuracy:.3f}")