# PinPoint/Finetuning/src/open_clip/extract_features.py
import os
# import clip  # needed only if the OpenAI CLIP branches ("ViT-*" / "RN*") below are enabled
import torch
import open_clip
import numpy as np
from sklearn.linear_model import LogisticRegression
from torchvision.datasets import CIFAR100
from tqdm import tqdm
from joblib import dump, load
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
import torchvision.transforms as transforms
import torchvision
import pandas as pd
from pathlib import Path
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import pickle
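
# `update_keys` is referenced further below when loading the mmselfsup-format
# ResNet-50 checkpoints but is not defined in this file. The sketch here is an
# assumption: it presumes the checkpoint stores backbone weights under a
# 'backbone.' prefix and that any head weights can be discarded; adapt it to
# the actual checkpoint layout if it differs.
def update_keys(state_dict):
    """Remap an mmselfsup-style state_dict to torchvision ResNet-50 key names (sketch)."""
    remapped = {}
    for key, value in state_dict.items():
        if key.startswith('backbone.'):
            remapped[key[len('backbone.'):]] = value
        elif not key.startswith(('head.', 'fc.')):
            remapped[key] = value
    return remapped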
class PHASE(Dataset):
"""PHASE dataset."""
def __init__(self, csv_file, root_dir, transform=None, resolution=224):
"""
Arguments:
csv_file (string): Path to the csv file with annotations.
root_dir (string): Directory with all the images.
transform (callable, optional): Optional transform to be applied
on a sample.
"""
self.annotations = pd.read_csv(csv_file, sep=' ', header=None)
# print(self.annotations)
self.root_dir = root_dir
self.transform = transform
self.base_transforms = Compose([
Resize((resolution, resolution), interpolation=Image.BICUBIC)
])
def __len__(self):
return len(self.annotations)
def __getitem__(self, idx):
if torch.is_tensor(idx):
idx = idx.tolist()
img_name = os.path.join(self.root_dir,
self.annotations.iloc[idx, 0])
image = Image.open(img_name).convert('RGB')
label = self.annotations.iloc[idx, 1]
image = self.base_transforms(image)
if self.transform:
image = self.transform(image)
        return image, label
class FACET(Dataset):
"""Face Landmarks dataset."""
def __init__(self, csv_file, root_dir, transform=None):
"""
Arguments:
csv_file (string): Path to the csv file with annotations.
root_dir (string): Directory with all the images.
transform (callable, optional): Optional transform to be applied
on a sample.
"""
self.annotations = pd.read_csv(csv_file, sep=' ', header=None)
self.root_dir = root_dir
self.transform = transform
def __len__(self):
return len(self.annotations)
def __getitem__(self, idx):
if torch.is_tensor(idx):
idx = idx.tolist()
img_name = os.path.join(self.root_dir,
self.annotations.iloc[idx, 0])
image = Image.open(img_name).convert('RGB')
label = self.annotations.iloc[idx, 1]
base_transforms = Compose([
Resize((224, 224), interpolation=Image.BICUBIC)
])
image = base_transforms(image)
if self.transform:
image = self.transform(image)
return image, label
class MORPH(Dataset):
"""MORPH dataset."""
def __init__(self, csv_file, root_dir, transform=None):
"""
Arguments:
csv_file (string): Path to the csv file with annotations.
root_dir (string): Directory with all the images.
transform (callable, optional): Optional transform to be applied
on a sample.
"""
self.annotations = pd.read_csv(csv_file, sep=',', header=0)
self.root_dir = root_dir
self.transform = transform
def __len__(self):
return len(self.annotations)
def __getitem__(self, idx):
if torch.is_tensor(idx):
idx = idx.tolist()
        # NOTE: the "filepath" column is assumed to already hold a full path,
        # so self.root_dir is not joined in here.
        img_name = self.annotations.iloc[idx]["filepath"]
        image = Image.open(img_name).convert('RGB')
label = self.annotations.iloc[idx]["gender"]
base_transforms = Compose([
Resize((224, 224), interpolation=Image.BICUBIC)
])
image = base_transforms(image)
if self.transform:
image = self.transform(image)
return image, label
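
# The rest of the script pairs each backbone name in `models` with a checkpoint
# path (or hub identifier) in `weights`, extracts frozen image features for the
# selected dataset, caches them as pickles, and fits logistic-regression probes
# over a small sweep of C values.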
# Load the model
device = "cuda" if torch.cuda.is_available() else "cpu"
device = "cuda" if torch.cuda.is_available() else "cpu"
resnet_model = torchvision.models.resnet50(pretrained=False)
resnet_model.fc = torch.nn.Identity()
resnet_model.eval()
features_root = "features_facet_training_set"
cls_root = "classifiers_facet"
models = (
# # Supervised
# "resnet18",
# "resnet34",
# "resnet50",
# "resnet101",
# "resnet152",
# "vit_b_16",
# "vit_b_32",
# "vit_l_16",
# "vit_l_32",
# # Self-Supervised RN50
# 'swav',
# 'simclr',
# 'moco_v2',
# 'npid',
# 'deepcluster_v2',
# 'jigsaw',
# 'odc',
# # DINO v1
# 'dino_v1_cnn',
# 'dino_v1_vit_b_16',
# "dino_v1_vit_s_16",
# # DINO v2
# "dino_v2_vit_s_14",
# 'dino_v2_vit_b_14',
# "dino_v2_vit_l_14",
# "dino_v2_vit_g_14",
# # CLIP OpenAI
# "ViT-B/16",
# "ViT-B/32",
# "ViT-L/14",
# "ViT-L/14@336px",
# "RN50",
# "RN101",
# # CLIP OpenCLIP
#"vit_b_16_400m",
# "vit_b_16_2b",
# "vit_l_14_400m",
# "vit_l_14_2b",
# "vit_b_32_400m",
# "vit_b_32_2b",
# OpenCLIP CC3M - ours
# "vit_b_16_cc3m_50_28ep",
"vit_b_16_cc3m_50_30ep",
# "vit_b_16_cc3m_50",
"vit_b_16_cc3m_original",
"vit_b_16_cc3m_50_30ep_difficult_batches",
# OpenCLIP CC3M - full regeneration
"rn50_cc3m_mix_000",
"rn50_cc3m_mix_100",
)
weights = (
# # ResNet's
# "supervised_torch_hub",
# "supervised_torch_hub",
# "supervised_torch_hub",
# "supervised_torch_hub",
# "supervised_torch_hub",
# # ViT's
# "supervised_torch_hub",
# "supervised_torch_hub",
# "supervised_torch_hub",
# "supervised_torch_hub",
# # SSL
# "/home/kis/Desktop/rhome/kis/code/mmselfsup/pretrained_models/official_weights/mmselfsup_format/swav_backbone.pth",
# "/home/kis/Desktop/rhome/kis/code/mmselfsup/pretrained_models/official_weights/mmselfsup_format/simclr_backbone.pth",
# "/home/kis/Desktop/rhome/kis/code/mmselfsup/pretrained_models/official_weights/mmselfsup_format/moco_v2_backbone.pth",
# "/home/kis/Desktop/rhome/kis/code/mmselfsup/pretrained_models/official_weights/mmselfsup_format/npid_backbone.pth",
# "/home/kis/Desktop/rhome/kis/code/mmselfsup/pretrained_models/official_weights/mmselfsup_format/deepcluster_v2_backbone.pth",
# "/home/kis/Desktop/rhome/kis/code/mmselfsup/pretrained_models/official_weights/mmselfsup_format/jigsaw_backbone.pth",
# "/home/kis/Desktop/rhome/kis/code/mmselfsup/pretrained_models/official_weights/mmselfsup_format/odc_r50_v1-5af5dd0c.pth",
# # DINO-v1
# "dino_facebook_hub",
# "dino_facebook_hub",
# "dino_facebook_hub",
# # DINO-v2
# "dino_facebook_hub",
# "dino_facebook_hub",
# "dino_facebook_hub",
# "dino_facebook_hub",
# # CLIP OpenAI
# "OpenAI hub",
# "OpenAI hub",
# "OpenAI hub",
# "OpenAI hub",
# "OpenAI hub",
# "OpenAI hub",
# # CLIP OpenCLIP
#"OpenCLIP hub",
# "OpenCLIP hub",
# "OpenCLIP hub",
# "OpenCLIP hub",
# "OpenCLIP hub",
# "OpenCLIP hub",
# OpenCLIP CC3M - ours
# "/home/kis/Desktop/rhome/kis/code/open_clip_latest/open_clip/logs/2024_08_27-11_48_49-model_ViT-B-16-lr_0.001-b_410-j_8-p_amp/checkpoints/epoch_28.pt",
"/home/kis/Desktop/rhome/kis/code/open_clip_latest/open_clip/logs/2024_08_27-11_48_49-model_ViT-B-16-lr_0.001-b_410-j_8-p_amp/checkpoints/epoch_30.pt",
# "/home/kis/Desktop/rhome/kis/code/open_clip_latest/open_clip/logs/2024_08_27-11_48_49-model_ViT-B-16-lr_0.001-b_410-j_8-p_amp/checkpoints/epoch_16.pt",
"/home/kis/Desktop/rhome/kis/code/open_clip/logs/2024_07_12-19_17_23-model_ViT-B-16-lr_0.001-b_410-j_4-p_amp/checkpoints/epoch_30.pt",
"/home/kis/Desktop/rhome/kis/code/open_clip_latest/open_clip/logs/2024_09_15-14_07_26-model_ViT-B-16-lr_0.001-b_410-j_8-p_amp/checkpoints/epoch_30.pt",
# OpenCLIP CC3M - full regeneration
"/home/kis/code/models/models/cc3m_mix_000/epoch_50.pt",
"/home/kis/code/models/models/cc3m_mix_100/epoch_50.pt",
)
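# `models` and `weights` are consumed pairwise via zip() below, so the two
# tuples must stay index-aligned; the assert is a small added sanity check.
assert len(models) == len(weights), "models and weights must have the same length"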
model_idx = 0
for model_name, weight in zip(models, weights):
    print("\n\n", model_name)
    preprocess = None
    clip_like = False
    model_type = 'transformer'  # reset per model; the CNN branches below override this
if model_name == 'dino_v1_cnn':
model = torch.hub.load('facebookresearch/dino:main', 'dino_resnet50')
model.fc = torch.nn.Identity()
model.eval()
model_type = 'cnn'
elif model_name == "vit_b_16_cc3m_50":
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-16', pretrained=weight)
elif model_name == "vit_b_16_cc3m_50_28ep":
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-16', pretrained=weight)
elif model_name == "vit_b_16_cc3m_50_30ep_difficult_batches":
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-16', pretrained=weight)
elif model_name == "vit_b_16_cc3m_50_30ep":
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-16', pretrained=weight)
elif model_name == "rn50_cc3m_mix_000":
model, _, preprocess = open_clip.create_model_and_transforms('RN50', pretrained=weight)
elif model_name == "vit_b_16_cc3m_future_models":
model, _, preprocess = open_clip.create_model_and_transforms('RN50', pretrained=weight)
elif model_name == "vit_b_16_cc3m_original":
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-16', pretrained=weight)
elif model_name == 'dino_v1_vit_s_16':
model = torch.hub.load('facebookresearch/dino:main', 'dino_vits16')
elif model_name == 'dino_v1_vit_b_16':
model = torch.hub.load('facebookresearch/dino:main', 'dino_vitb16')
elif model_name == 'dino_v2_vit_s_14':
model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14')
elif model_name == 'dino_v2_vit_b_14':
model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitb14')
elif model_name == 'dino_v2_vit_l_14':
model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14')
elif model_name == 'dino_v2_vit_g_14':
model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitg14')
elif model_name == "vit_b_16_400m":
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-16', pretrained='laion400m_e32')
clip_like = True
elif model_name == "vit_b_16_2b":
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-16', pretrained='laion2b_s34b_b88k')
clip_like = True
elif model_name == "vit_b_32_400m":
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion400m_e32')
clip_like = True
elif model_name == "vit_b_32_2b":
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')
clip_like = True
elif model_name == "vit_l_14_400m":
model, _, preprocess = open_clip.create_model_and_transforms('ViT-L-14', pretrained='laion400m_e32')
clip_like = True
elif model_name == "vit_l_14_2b":
model, _, preprocess = open_clip.create_model_and_transforms('ViT-L-14', pretrained='laion2b_s32b_b82k')
clip_like = True
elif "resnet" in model_name:
model = torch.hub.load('pytorch/vision:v0.10.0', model_name, pretrained=True)
model.fc = torch.nn.Identity()
model.eval()
model_type = 'cnn'
elif "vit" in model_name:
model = torch.hub.load('pytorch/vision', model_name, weights='IMAGENET1K_V1')
model.heads = torch.nn.Identity()
elif "ViT" in model_name:
model, preprocess = clip.load(model_name, device)
clip_like = True
elif "RN" in model_name:
model, preprocess = clip.load(model_name, device)
model.visual.attnpool = torch.nn.AdaptiveAvgPool2d((1,1)) # replace Attention pool with Avgpool
clip_like = True
model_type = 'cnn'
    else:
        # Self-supervised ResNet-50 checkpoints (mmselfsup format); update_keys
        # remaps the checkpoint keys to torchvision names (a sketch is provided
        # near the imports).
        w = torch.load(weight, map_location='cpu')
        model = torchvision.models.resnet50(pretrained=False)
        model.fc = torch.nn.Identity()
        model.eval()
        model.load_state_dict(update_keys(w['state_dict']), strict=True)
        model_type = 'cnn'
    model.to(device)
if 'simclr' in model_name:
img_norm_cfg = dict(mean=[0., 0., 0.], std=[1., 1., 1.])
else:
img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
transform_test = transforms.Compose([
transforms.CenterCrop((224, 224)),
transforms.ToTensor(),
transforms.Normalize(**img_norm_cfg)
])
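    # NOTE (assumption): transform_test applies ImageNet normalisation to every
    # model. CLIP/OpenCLIP checkpoints were trained with their own statistics;
    # to match their pre-training pipeline, the `preprocess` returned by
    # open_clip.create_model_and_transforms (or clip.load) could be passed to
    # the datasets instead, e.g.:
    #     eval_transform = preprocess if preprocess is not None else transform_test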
# # model_name = 'CLIP_RN50'
# features_root = "../../features/MORPH/"
# # features_root = "../../features/FACET/"
# train_dataset = MORPH(csv_file=f'/home/kis/Desktop/rhome/kis/datasets/morph/Dataset/Index/Train.csv',
# root_dir='/home/kis/Desktop/rhome/kis/datasets/morph/Dataset/',
# transform=transform_test
# )
# val_dataset = MORPH(csv_file=f'/home/kis/Desktop/rhome/kis/datasets/morph/Dataset/Index/Validation.csv',
# root_dir='/home/kis/Desktop/rhome/kis/datasets/morph/Dataset/',
# transform=transform_test
# )
# def get_features(dataset):
# all_features = []
# all_labels = []
# with torch.no_grad():
# for images, labels in tqdm(DataLoader(dataset, batch_size=512)):
# if model_type == 'cnn':
# features = model(images.to(device))
# else:
# features = model.encode_image(images.to(device))
# print(features.shape)
# all_features.append(features)
# all_labels.append(labels)
# return torch.cat(all_features).cpu().numpy(), torch.cat(all_labels).cpu().numpy()
# # Calculate the image features
# feat_path = Path(f"{features_root}/{model_name}_features.pkl")
# if feat_path.exists():
# print("Already extracted!")
# with open(f"{features_root}/{model_name}_features.pkl", 'rb') as f:
# features = pickle.load(f)
# with open(f"{features_root}/{model_name}_labels.pkl", 'rb') as f:
# labels = pickle.load(f)
# with open(f"{features_root}/{model_name}_features_val.pkl", 'rb') as f:
# features_val = pickle.load(f)
# with open(f"{features_root}/{model_name}_labels_val.pkl", 'rb') as f:
# labels_val = pickle.load(f)
# else:
# features, labels = get_features(train_dataset)
# print(labels)
# with open(f"{features_root}/{model_name}_features.pkl", 'wb') as f:
# pickle.dump(features, f)
# with open(f"{features_root}/{model_name}_labels.pkl", 'wb') as f:
# pickle.dump(labels, f)
# features_val, labels_val = get_features(val_dataset)
# with open(f"{features_root}/{model_name}_features_val.pkl", 'wb') as f:
# pickle.dump(features_val, f)
# with open(f"{features_root}/{model_name}_labels_val.pkl", 'wb') as f:
# pickle.dump(labels_val, f)
# print("Done!")
# for i in range(1, 10):
# c = i * 0.1
# classifier = LogisticRegression(random_state=0, C=c, max_iter=10000, verbose=0)
# classifier.fit(features, labels)
# # classifier = load(f'{model_name}_logistic_regression_classifier_c_{c}.joblib')
# predictions = classifier.predict(features_val)
# dump(classifier, f'{features_root}/{model_name}_logistic_regression_classifier_c_{c}.joblib')
# with open(f'{features_root}/{model_name}_predictions_c_{c}.pkl', 'wb') as f:
# pickle.dump(predictions, f)
# pd.DataFrame(predictions).to_csv(f"{features_root}/{model_name}_predictions_c_{c}.txt")
# accuracy = np.mean((labels_val == predictions).astype(float)) * 100.
# print(f"C={c}, Accuracy = {accuracy:.3f}")
# features_root = "../../features/FACET/"
# train_dataset = FACET(csv_file=f'/home/kis/Desktop/rhome/kis/datasets/facet/train_seed_0.csv',
# root_dir='/home/kis/Desktop/rhome/kis/datasets/facet/images_bb/',
# transform=preprocess)
# val_dataset = FACET(csv_file=f'/home/kis/Desktop/rhome/kis/datasets/facet/test_val_seed_0.csv',
# root_dir='/home/kis/Desktop/rhome/kis/datasets/facet/images_bb/',
# transform=preprocess)
# def get_features(dataset):
# all_features = []
# all_labels = []
# with torch.no_grad():
# for images, labels in tqdm(DataLoader(dataset, batch_size=512)):
# if model_type == 'cnn':
# features = model(images.to(device))
# else:
# features = model.encode_image(images.to(device))
# print(features.shape)
# all_features.append(features)
# all_labels.append(labels)
# return torch.cat(all_features).cpu().numpy(), torch.cat(all_labels).cpu().numpy()
# # Calculate the image features
# feat_path = Path(f"{features_root}/{model_name}_features.pkl")
# if feat_path.exists():
# print("Already extracted!")
# with open(f"{features_root}/{model_name}_features.pkl", 'rb') as f:
# features = pickle.load(f)
# with open(f"{features_root}/{model_name}_labels.pkl", 'rb') as f:
# labels = pickle.load(f)
# with open(f"{features_root}/{model_name}_features_val.pkl", 'rb') as f:
# features_val = pickle.load(f)
# with open(f"{features_root}/{model_name}_labels_val.pkl", 'rb') as f:
# labels_val = pickle.load(f)
# else:
# features, labels = get_features(train_dataset)
# with open(f"{features_root}/{model_name}_features.pkl", 'wb') as f:
# pickle.dump(features, f)
# with open(f"{features_root}/{model_name}_labels.pkl", 'wb') as f:
# pickle.dump(labels, f)
# features_val, labels_val = get_features(val_dataset)
# with open(f"{features_root}/{model_name}_features_val.pkl", 'wb') as f:
# pickle.dump(features_val, f)
# with open(f"{features_root}/{model_name}_labels_val.pkl", 'wb') as f:
# pickle.dump(labels_val, f)
# print("Done!")
# for i in range(1, 10):
# c = i * 0.1
# classifier = LogisticRegression(random_state=0, C=c, max_iter=10000, verbose=0)
# classifier.fit(features, labels)
# # classifier = load(f'{model_name}_logistic_regression_classifier_c_{c}.joblib')
# predictions = classifier.predict(features_val)
# dump(classifier, f'{features_root}/{model_name}_logistic_regression_classifier_c_{c}.joblib')
# with open(f'{features_root}/{model_name}_predictions_c_{c}.pkl', 'wb') as f:
# pickle.dump(predictions, f)
# pd.DataFrame(predictions).to_csv(f"{features_root}/{model_name}_predictions_c_{c}.txt")
# accuracy = np.mean((labels_val == predictions).astype(float)) * 100.
# print(f"C={c}, Accuracy = {accuracy:.3f}")
features_root = "../../features/PHASE_EMOTIONS/"
train_dataset = PHASE(csv_file=f'/home/kis/Desktop/rhome/kis/datasets/phase/phase_annotations/train_annotations_emotion.txt',
root_dir='/home/kis/Desktop/rhome/kis/datasets/phase/images/train_bb/',
transform=transform_test
)
val_dataset = PHASE(csv_file=f'/home/kis/Desktop/rhome/kis/datasets/phase/phase_annotations/val_annotations_emotion.txt',
root_dir='/home/kis/Desktop/rhome/kis/datasets/phase/images/val_bb/',
transform=transform_test
)
    def get_features(dataset):
        """Run the current model over `dataset` and return (features, labels) as numpy arrays."""
        all_features = []
        all_labels = []
with torch.no_grad():
for images, labels in tqdm(DataLoader(dataset, batch_size=512)):
if model_type == 'cnn':
features = model(images.to(device))
else:
features = model.encode_image(images.to(device))
print(features.shape)
all_features.append(features)
all_labels.append(labels)
return torch.cat(all_features).cpu().numpy(), torch.cat(all_labels).cpu().numpy()
# Calculate the image features
feat_path = Path(f"{features_root}/{model_name}_features.pkl")
if feat_path.exists():
print("Already extracted!")
with open(f"{features_root}/{model_name}_features.pkl", 'rb') as f:
features = pickle.load(f)
with open(f"{features_root}/{model_name}_labels.pkl", 'rb') as f:
labels = pickle.load(f)
with open(f"{features_root}/{model_name}_features_val.pkl", 'rb') as f:
features_val = pickle.load(f)
with open(f"{features_root}/{model_name}_labels_val.pkl", 'rb') as f:
labels_val = pickle.load(f)
else:
features, labels = get_features(train_dataset)
with open(f"{features_root}/{model_name}_features.pkl", 'wb') as f:
pickle.dump(features, f)
with open(f"{features_root}/{model_name}_labels.pkl", 'wb') as f:
pickle.dump(labels, f)
features_val, labels_val = get_features(val_dataset)
with open(f"{features_root}/{model_name}_features_val.pkl", 'wb') as f:
pickle.dump(features_val, f)
with open(f"{features_root}/{model_name}_labels_val.pkl", 'wb') as f:
pickle.dump(labels_val, f)
print("Done!")
for i in range(1, 10):
c = i * 0.1
classifier = LogisticRegression(random_state=0, C=c, max_iter=10000, verbose=0, class_weight="balanced")
classifier.fit(features, labels)
# classifier = load(f'{model_name}_logistic_regression_classifier_c_{c}.joblib')
predictions = classifier.predict(features_val)
dump(classifier, f'{features_root}/{model_name}_logistic_regression_classifier_c_{c}.joblib')
with open(f'{features_root}/{model_name}_predictions_c_{c}.pkl', 'wb') as f:
pickle.dump(predictions, f)
pd.DataFrame(predictions).to_csv(f"{features_root}/{model_name}_predictions_c_{c}.txt")
accuracy = np.mean((labels_val == predictions).astype(float)) * 100.
print(f"C={c}, Accuracy = {accuracy:.3f}")