| from argparse import Namespace |
| import json |
| from pathlib import Path |
|
|
| import numpy as np |
| import random |
| from torchvision import transforms |
| import torch |
| import cv2 |
| import torchvision.transforms.functional as TF |
| import scipy.ndimage as ndimage |
| from PIL import Image |
| import argparse |
| import imgaug.augmenters as iaa |
| from imgaug.augmentables import Keypoint, KeypointsOnImage |
|
|
# Side length (in pixels) handed to the Resize*Image transforms and to the
# RandomResizedCrop of the pre-training pipeline defined in this module.
MAX_HW = 384
# Per-channel mean / std consumed by the module-level `Normalize` pipeline.
# NOTE(review): these are the values commonly quoted as the ImageNet RGB
# statistics - confirm they match the backbone's pre-training.
IM_NORM_MEAN = [0.485, 0.456, 0.406]
IM_NORM_STD = [0.229, 0.224, 0.225]
|
|
def _str2bool(value):
    """Convert a command-line token into a real boolean.

    ``argparse`` calls ``type(value)`` on the raw string, so ``type=bool``
    turns every non-empty string (including ``"False"`` and ``"0"``) into
    ``True``.  This converter interprets the usual spellings instead.

    Args:
        value: the raw option value (a string, or already a bool).

    Returns:
        The parsed boolean.

    Raises:
        argparse.ArgumentTypeError: if the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1', 'on'):
        return True
    # The empty string stays falsy, as it was under ``type=bool``.
    if text in ('false', 'f', 'no', 'n', '0', 'off', ''):
        return False
    raise argparse.ArgumentTypeError('boolean value expected, got {!r}'.format(value))


def get_args_parser():
    """Build the argument parser holding every option used by this module.

    The parser is created with ``add_help=False``, so it does not register
    ``-h/--help`` itself.

    Returns:
        argparse.ArgumentParser: parser pre-populated with training, optimizer,
        dataset, distributed-training and logging options.
    """
    parser = argparse.ArgumentParser('MAE pre-training', add_help=False)
    parser.add_argument('--batch_size', default=8, type=int,
                        help='Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus')
    parser.add_argument('--epochs', default=200, type=int)
    parser.add_argument('--accum_iter', default=1, type=int,
                        help='Accumulate gradient iterations (for increasing the effective batch size under memory constraints)')

    # Model parameters
    parser.add_argument('--model', default='mae_vit_base_patch16', type=str, metavar='MODEL',
                        help='Name of model to train')

    parser.add_argument('--mask_ratio', default=0.5, type=float,
                        help='Masking ratio (percentage of removed patches).')

    parser.add_argument('--norm_pix_loss', action='store_true',
                        help='Use (per-patch) normalized pixels as targets for computing loss')
    parser.set_defaults(norm_pix_loss=False)

    # Optimizer parameters
    parser.add_argument('--weight_decay', type=float, default=0.05,
                        help='weight decay (default: 0.05)')
    parser.add_argument('--lr', type=float, default=None, metavar='LR',
                        help='learning rate (absolute lr)')
    parser.add_argument('--blr', type=float, default=1e-3, metavar='LR',
                        help='base learning rate: absolute_lr = base_lr * total_batch_size / 256')
    parser.add_argument('--min_lr', type=float, default=0., metavar='LR',
                        help='lower lr bound for cyclic schedulers that hit 0')
    parser.add_argument('--warmup_epochs', type=int, default=10, metavar='N',
                        help='epochs to warmup LR')

    # Dataset parameters
    parser.add_argument('--data_path', default='./data/FSC147/', type=str,
                        help='dataset path')
    parser.add_argument('--anno_file', default='annotation_FSC147_384.json', type=str,
                        help='annotation json file')
    parser.add_argument('--data_split_file', default='Train_Test_Val_FSC_147.json', type=str,
                        help='data split json file')
    parser.add_argument('--im_dir', default='images_384_VarV2', type=str,
                        help='images directory')
    parser.add_argument('--gt_dir', default='./data/FSC147/gt_density_map_adaptive_384_VarV2', type=str,
                        help='ground truth directory')
    parser.add_argument('--output_dir', default='./data/out/pre_4_dir',
                        help='path where to save, empty for no saving')
    parser.add_argument('--device', default='cuda',
                        help='device to use for training / testing')
    parser.add_argument('--seed', default=0, type=int)
    parser.add_argument('--resume', default='./weights/mae_pretrain_vit_base_full.pth',
                        help='resume from checkpoint')

    # Training / data-loader parameters
    parser.add_argument('--start_epoch', default=0, type=int, metavar='N',
                        help='start epoch')
    parser.add_argument('--num_workers', default=10, type=int)
    parser.add_argument('--pin_mem', action='store_true',
                        help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.')
    parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem')
    parser.set_defaults(pin_mem=True)

    # Distributed training parameters
    parser.add_argument('--world_size', default=1, type=int,
                        help='number of distributed processes')
    parser.add_argument('--local_rank', default=-1, type=int)
    parser.add_argument('--dist_on_itp', action='store_true')
    parser.add_argument('--dist_url', default='env://',
                        help='url used to set up distributed training')

    # Logging parameters
    parser.add_argument('--log_dir', default='./logs/pre_4_dir',
                        help='path where to tensorboard log')
    parser.add_argument("--title", default="CounTR_pretraining", type=str)
    parser.add_argument("--wandb", default="counting", type=str)
    parser.add_argument("--team", default="wsense", type=str)
    parser.add_argument("--wandb_id", default=None, type=str)
    # type=bool would turn "--do_aug False" into True; parse the text instead.
    parser.add_argument("--do_aug", default=True, type=_str2bool)
    parser.add_argument('--class_file', default='./data/FSC147/ImageClasses_FSC147.txt', type=str,
                        help='class json file')
    return parser
|
|
# Module-level namespace parsed from the process command line.
# NOTE: this runs at import time, so importing this module reads sys.argv.
args = get_args_parser().parse_args()
|
|
class ResizeSomeImage(object):
    """Base class of the resize transforms; loads the metadata they share.

    Attributes set by the constructor:
        data_path:   ``Path`` of the dataset root.
        im_dir:      ``data_path / args.im_dir``.
        annotations: parsed content of the annotation JSON file.
        train_set:   value stored under the ``'train'`` key of the split JSON.
        class_dict:  maps the first whitespace-separated token of each line of
                     the class file to the list of remaining tokens; left
                     empty when ``do_aug`` is falsy.
    """

    # Options this class reads from the namespace it is given.
    _OPTION_NAMES = ('data_path', 'im_dir', 'anno_file', 'data_split_file',
                     'do_aug', 'class_file')

    def __init__(self, args):
        """Load annotations, the data split and (optionally) the class file.

        Args:
            args: namespace providing the options named in ``_OPTION_NAMES``.
                Options missing from it fall back to the defaults of
                ``get_args_parser()``.  Passing ``None`` parses the process
                command line instead.

        Raises:
            OSError: if one of the metadata files cannot be opened.
            KeyError: if the split file has no ``'train'`` entry.
        """
        # The namespace handed in by the caller is authoritative; re-parsing
        # sys.argv here would silently discard whatever the caller configured.
        if args is None:
            args = get_args_parser().parse_args()
        opts = self._resolve_options(args)

        self.data_path = Path(opts['data_path'])
        self.im_dir = self.data_path / opts['im_dir']
        anno_file = self.data_path / opts['anno_file']
        data_split_file = self.data_path / opts['data_split_file']

        with open(anno_file) as f:
            self.annotations = json.load(f)

        with open(data_split_file) as f:
            data_split = json.load(f)

        self.train_set = data_split['train']

        self.class_dict = {}
        if opts['do_aug']:
            with open(opts['class_file']) as f:
                for line in f:
                    fields = line.split()
                    if not fields:
                        # Blank / whitespace-only lines carry no entry.
                        continue
                    self.class_dict[fields[0]] = fields[1:]

    @classmethod
    def _resolve_options(cls, args):
        """Return a dict of the needed options, using parser defaults for gaps."""
        missing = [name for name in cls._OPTION_NAMES if not hasattr(args, name)]
        # Build the defaults only when needed; parse_args([]) ignores sys.argv.
        defaults = get_args_parser().parse_args([]) if missing else None
        return {
            name: getattr(defaults if name in missing else args, name)
            for name in cls._OPTION_NAMES
        }
|
|
|
|
class ResizePreTrainImage(ResizeSomeImage):
    """
    Pre-training transform.

    The image and its density map are first resized so that height and width
    are multiples of 16 (rounded down); the density map is rescaled so its sum
    is unchanged.  The image then goes through ``PreTrainNormalize`` (random
    resized crop + random horizontal flip), which is NOT applied to the
    density map or the boxes, so they no longer line up with the returned
    image.
    """

    def __init__(self, args, MAX_HW=384):
        super().__init__(args)
        self.max_hw = MAX_HW

    def __call__(self, sample):
        """Transform one sample.

        Args:
            sample: dict with keys 'image', 'lines_boxes' and 'gt_density'.

        Returns:
            dict with keys 'image', 'boxes' and 'gt_density'.
        """
        image = sample['image']
        lines_boxes = sample['lines_boxes']
        density = sample['gt_density']

        width, height = image.size
        new_H = 16 * int(height / 16)
        new_W = 16 * int(width / 16)

        resized_image = transforms.Resize((new_H, new_W))(image)
        resized_density = cv2.resize(density, (new_W, new_H))

        # Keep the integral of the density map equal to the original count.
        count_before = np.sum(density)
        count_after = np.sum(resized_density)
        if count_after > 0:
            resized_density = resized_density * (count_before / count_after)

        # Each box becomes [0, c0, c1, c2, c3] with integer coordinates.
        box_rows = []
        for raw_box in lines_boxes:
            coords = [int(value) for value in raw_box]
            box_rows.append([0, coords[0], coords[1], coords[2], coords[3]])
        boxes = torch.Tensor(box_rows).unsqueeze(0)

        resized_image = PreTrainNormalize(resized_image)
        density_tensor = torch.from_numpy(resized_density).unsqueeze(0).unsqueeze(0)
        return {'image': resized_image, 'boxes': boxes, 'gt_density': density_tensor}
|
|
|
|
class ResizeTrainImage(ResizeSomeImage):
    """
    Training transform producing a 384 * 384 image, its density map and exemplar crops.

    Steps performed by ``__call__``:
        1. Height and width are rounded down to multiples of 16 and the image is resized.
        2. With ``do_aug``: Gaussian noise, colour jitter + Gaussian blur, random affine,
           random horizontal flip, then either a 4-tile mosaic (probability 0.25) or a
           random 384-wide crop.
           Without ``do_aug``: a random ``max_hw``-wide crop only.
        3. The dot map is smoothed with a Gaussian filter (sigma 1) and multiplied by 60.
        4. Up to three positive and three negative boxes are cropped from the
           un-augmented resized image and resized to 64 * 64.

    Exemplar boxes may be outside the cropped area.
    NOTE(review): the augmented branch hard-codes 384 while the non-augmented branch
    uses ``self.max_hw`` - they agree only for the default ``MAX_HW``.
    """

    def __init__(self, args, MAX_HW=384, do_aug=True):
        super().__init__(args)
        self.max_hw = MAX_HW
        self.do_aug = do_aug

    def __call__(self, sample):
        """Transform one training sample.

        Args:
            sample: dict with keys 'image', 'lines_boxes', 'neg_lines_boxes',
                'dots', 'id' and 'm_flag'.

        Returns:
            dict with keys 'image', 'boxes', 'neg_boxes', 'pos', 'gt_density'
            and 'm_flag'.  'pos' is an empty tensor when augmentation is on.
        """
        image, lines_boxes, neg_lines_boxes, dots, im_id, m_flag = sample['image'], sample['lines_boxes'], sample['neg_lines_boxes'], \
            sample['dots'], sample['id'], sample['m_flag']

        W, H = image.size

        # Round both sides down to a multiple of 16.
        new_H = 16 * int(H / 16)
        new_W = 16 * int(W / 16)
        scale_factor_h = float(new_H) / H
        scale_factor_w = float(new_W) / W
        resized_image = transforms.Resize((new_H, new_W))(image)
        resized_image = TTensor(resized_image)
        resized_density = np.zeros((new_H, new_W), dtype='float32')

        # Augmentation switches; the mosaic draw happens even when do_aug is off.
        aug_flag = self.do_aug
        mosaic_flag = random.random() < 0.25

        if aug_flag:
            # Additive Gaussian noise (std 0.1), clamped back to [0, 1].
            # NOTE(review): np.random.normal yields float64, so the sum is
            # presumably promoted to float64 - confirm downstream dtype handling.
            noise = np.random.normal(0, 0.1, resized_image.size())
            noise = torch.from_numpy(noise)
            re_image = resized_image + noise
            re_image = torch.clamp(re_image, 0, 1)

            # Colour jitter and Gaussian blur (module-level `Augmentation`).
            re_image = Augmentation(re_image)

            # Random affine via imgaug: move dim 0 to the end for imgaug and
            # carry the dot annotations along as keypoints.
            re1_image = re_image.transpose(0, 1).transpose(1, 2).numpy()
            keypoints = []
            for i in range(dots.shape[0]):
                keypoints.append(Keypoint(x=min(new_W - 1, int(dots[i][0] * scale_factor_w)), y=min(new_H - 1, int(dots[i][1] * scale_factor_h))))
            kps = KeypointsOnImage(keypoints, re1_image.shape)

            seq = iaa.Sequential([
                iaa.Affine(
                    rotate=(-15, 15),
                    scale=(0.8, 1.2),
                    shear=(-10, 10),
                    translate_percent={"x": (-0.2, 0.2), "y": (-0.2, 0.2)}
                )
            ])
            re1_image, kps_aug = seq(image=re1_image, keypoints=kps)

            # Rebuild the dot map from the transformed keypoints, dropping
            # those that ended up outside the image.
            resized_density = np.zeros((resized_density.shape[0], resized_density.shape[1]), dtype='float32')
            for i in range(len(kps.keypoints)):
                if (int(kps_aug.keypoints[i].y) <= new_H - 1 and int(kps_aug.keypoints[i].x) <= new_W - 1) and not \
                        kps_aug.keypoints[i].is_out_of_image(re1_image):
                    resized_density[int(kps_aug.keypoints[i].y)][int(kps_aug.keypoints[i].x)] = 1
            resized_density = torch.from_numpy(resized_density)

            re_image = TTensor(re1_image)

            # Random horizontal flip (applied to image and dot map together).
            flip_p = random.random()
            if flip_p > 0.5:
                re_image = TF.hflip(re_image)
                resized_density = TF.hflip(resized_density)

            # Mosaic: four tiles of side resize_l are stitched with a blended
            # seam.  Tiles are cut from `resized_image` (the un-augmented
            # tensor), not from `re_image`.
            if mosaic_flag:
                image_array = []
                map_array = []
                blending_l = random.randint(10, 20)
                resize_l = 192 + 2 * blending_l
                if dots.shape[0] >= 70:
                    # Enough objects: all four tiles are random square crops
                    # of the current image.
                    for i in range(4):
                        length = random.randint(150, 384)
                        start_W = random.randint(0, new_W - length)
                        start_H = random.randint(0, new_H - length)
                        reresized_image1 = TF.crop(resized_image, start_H, start_W, length, length)
                        reresized_image1 = transforms.Resize((resize_l, resize_l))(reresized_image1)
                        reresized_density1 = np.zeros((resize_l, resize_l), dtype='float32')
                        # NOTE: the inner loop reuses the name `i`; the outer
                        # range iterator is unaffected.
                        for i in range(dots.shape[0]):
                            if start_H <= min(new_H - 1, int(dots[i][1] * scale_factor_h)) < start_H + length and start_W <= min(new_W - 1, int(dots[i][0] * scale_factor_w)) < start_W + length:
                                reresized_density1[min(resize_l-1,int((min(new_H-1,int(dots[i][1] * scale_factor_h))-start_H)*resize_l/length))][min(resize_l-1,int((min(new_W-1,int(dots[i][0] * scale_factor_w))-start_W)*resize_l/length))]=1
                        reresized_density1 = torch.from_numpy(reresized_density1)
                        image_array.append(reresized_image1)
                        map_array.append(reresized_density1)
                else:
                    # Few objects: mix the current image with random training
                    # images and flag the sample with m_flag = 1.
                    m_flag = 1
                    prob = random.random()
                    # random.randint is inclusive, so gt_pos may be 4, in which
                    # case no tile comes from the current image.
                    if prob > 0.25:
                        gt_pos = random.randint(0, 3)
                    else:
                        gt_pos = random.randint(0, 4)
                    for i in range(4):
                        if i == gt_pos:
                            Tim_id = im_id
                            r_image = resized_image
                            Tdots = dots
                            new_TH = new_H
                            new_TW = new_W
                            Tscale_factor_w = scale_factor_w
                            Tscale_factor_h = scale_factor_h
                        else:
                            Tim_id = self.train_set[random.randint(0, len(self.train_set) - 1)]
                            Tdots = np.array(self.annotations[Tim_id]['points'])
                            Timage = Image.open('{}/{}'.format(self.im_dir, Tim_id))
                            Timage.load()
                            new_TW = 16 * int(Timage.size[0] / 16)
                            new_TH = 16 * int(Timage.size[1] / 16)
                            Tscale_factor_w = float(new_TW) / Timage.size[0]
                            Tscale_factor_h = float(new_TH) / Timage.size[1]
                            r_image = TTensor(transforms.Resize((new_TH, new_TW))(Timage))

                        length = random.randint(250, 384)
                        start_W = random.randint(0, new_TW - length)
                        start_H = random.randint(0, new_TH - length)
                        r_image1 = TF.crop(r_image, start_H, start_W, length, length)
                        r_image1 = transforms.Resize((resize_l, resize_l))(r_image1)
                        r_density1 = np.zeros((resize_l, resize_l), dtype='float32')
                        # Dots are marked only when the tile's image has the
                        # same class_dict entry as the current image; otherwise
                        # the tile's map stays all zeros.
                        if self.class_dict[im_id] == self.class_dict[Tim_id]:
                            for i in range(Tdots.shape[0]):
                                if start_H <= min(new_TH - 1, int(Tdots[i][1] * Tscale_factor_h)) < start_H + length and start_W <= min(new_TW - 1, int(Tdots[i][0] * Tscale_factor_w)) < start_W + length:
                                    r_density1[min(resize_l-1,int((min(new_TH-1, int(Tdots[i][1] * Tscale_factor_h))-start_H)*resize_l/length))][min(resize_l-1,int((min(new_TW-1,int(Tdots[i][0] * Tscale_factor_w))-start_W)*resize_l/length))]=1
                        r_density1 = torch.from_numpy(r_density1)
                        image_array.append(r_image1)
                        map_array.append(r_density1)

                # Stitch tiles 0 and 1 along dim 1 after trimming blending_l
                # from each side, then blend blending_l lines on either side
                # of the seam (index 192) with weights that sum to 1.
                # Density maps are concatenated without blending.
                reresized_image5 = torch.cat((image_array[0][:, blending_l:resize_l-blending_l], image_array[1][:, blending_l: resize_l-blending_l]), 1)
                reresized_density5 = torch.cat((map_array[0][blending_l:resize_l-blending_l], map_array[1][blending_l: resize_l-blending_l]), 0)
                for i in range(blending_l):
                    reresized_image5[:, 192+i] = image_array[0][:, resize_l-1-blending_l+i] * (blending_l-i)/(2 * blending_l) + reresized_image5[:, 192+i] * (i+blending_l)/(2*blending_l)
                    reresized_image5[:, 191-i] = image_array[1][:, blending_l-i] * (blending_l-i)/(2*blending_l) + reresized_image5[:, 191-i] * (i+blending_l)/(2*blending_l)
                reresized_image5 = torch.clamp(reresized_image5, 0, 1)

                # Same for tiles 2 and 3.
                reresized_image6 = torch.cat((image_array[2][:, blending_l:resize_l-blending_l], image_array[3][:, blending_l: resize_l-blending_l]), 1)
                reresized_density6 = torch.cat((map_array[2][blending_l:resize_l-blending_l], map_array[3][blending_l:resize_l-blending_l]), 0)
                for i in range(blending_l):
                    reresized_image6[:, 192+i] = image_array[2][:, resize_l-1-blending_l+i] * (blending_l-i)/(2*blending_l) + reresized_image6[:, 192+i] * (i+blending_l)/(2*blending_l)
                    reresized_image6[:, 191-i] = image_array[3][:, blending_l-i] * (blending_l-i)/(2*blending_l) + reresized_image6[:, 191-i] * (i+blending_l)/(2*blending_l)
                reresized_image6 = torch.clamp(reresized_image6, 0, 1)

                # Join the two halves along dim 2 and blend that seam as well.
                reresized_image = torch.cat((reresized_image5[:, :, blending_l:resize_l-blending_l], reresized_image6[:, :, blending_l:resize_l-blending_l]), 2)
                reresized_density = torch.cat((reresized_density5[:, blending_l:resize_l-blending_l], reresized_density6[:, blending_l:resize_l-blending_l]), 1)
                for i in range(blending_l):
                    reresized_image[:, :, 192+i] = reresized_image5[:, :, resize_l-1-blending_l+i] * (blending_l-i)/(2*blending_l) + reresized_image[:, :, 192+i] * (i+blending_l)/(2*blending_l)
                    reresized_image[:, :, 191-i] = reresized_image6[:, :, blending_l-i] * (blending_l-i)/(2*blending_l) + reresized_image[:, :, 191-i] * (i+blending_l)/(2*blending_l)
                reresized_image = torch.clamp(reresized_image, 0, 1)

            else:
                # No mosaic: random 384-wide window of the augmented image.
                # The density map is sliced along its columns only.
                start = random.randint(0, new_W - 1 - 383)
                reresized_image = TF.crop(re_image, 0, start, 384, 384)
                reresized_density = resized_density[:, start:start + 384]

        else:
            # No augmentation: mark the dots directly, then take a random
            # max_hw-wide window of the resized image and dot map.
            for i in range(dots.shape[0]):
                resized_density[min(new_H - 1, int(dots[i][1] * scale_factor_h))] \
                    [min(new_W - 1, int(dots[i][0] * scale_factor_w))] = 1
            resized_density = torch.from_numpy(resized_density)
            start = random.randint(0, new_W - self.max_hw)
            reresized_image = TF.crop(resized_image, 0, start, self.max_hw, self.max_hw)
            reresized_density = resized_density[0:self.max_hw, start:start + self.max_hw]

        # Smooth the dot map into a density map.
        reresized_density = ndimage.gaussian_filter(reresized_density.numpy(), sigma=(1, 1), order=0)

        # Scale the density map by a constant factor of 60.
        reresized_density = reresized_density * 60
        reresized_density = torch.from_numpy(reresized_density)

        # Crop at most three exemplar boxes from the un-augmented resized
        # image and resize each to 64 * 64.
        # presumably each box is laid out (y1, x1, y2, x2) - verify against caller.
        boxes = list()
        rects = list()
        cnt = 0
        for box in lines_boxes:
            cnt += 1
            if cnt > 3:
                break
            box2 = [int(k) for k in box]
            y1 = int(box2[0] * scale_factor_h)
            x1 = int(box2[1] * scale_factor_w)
            y2 = int(box2[2] * scale_factor_h)
            x2 = int(box2[3] * scale_factor_w)
            # Box positions are recorded only without augmentation, shifted by
            # the crop offset `start` and clamped to the crop width.
            if not aug_flag:
                rects.append(torch.tensor([y1, max(0, x1-start), y2, min(self.max_hw, x2-start)]))
            bbox = resized_image[:, y1:y2 + 1, x1:x2 + 1]
            bbox = transforms.Resize((64, 64))(bbox)
            boxes.append(bbox)
        # NOTE(review): torch.stack needs a non-empty list, so a sample
        # without boxes fails here - confirm callers always provide some.
        boxes = torch.stack(boxes)
        # Same procedure for the negative exemplar boxes.
        neg_boxes = list()
        neg_rects = list()
        cnt = 0
        for box in neg_lines_boxes:
            cnt += 1
            if cnt > 3:
                break
            box2 = [int(k) for k in box]
            y1 = int(box2[0] * scale_factor_h)
            x1 = int(box2[1] * scale_factor_w)
            y2 = int(box2[2] * scale_factor_h)
            x2 = int(box2[3] * scale_factor_w)
            # neg_rects is filled here but is not part of the returned sample.
            if not aug_flag:
                neg_rects.append(torch.tensor([y1, max(0, x1-start), y2, min(self.max_hw, x2-start)]))
            neg_bbox = resized_image[:, y1:y2 + 1, x1:x2 + 1]
            neg_bbox = transforms.Resize((64, 64))(neg_bbox)
            neg_boxes.append(neg_bbox)
        neg_boxes = torch.stack(neg_boxes)

        # Box positions are meaningless after augmentation, so 'pos' is empty.
        if aug_flag:
            pos = torch.tensor([])
        else:
            pos = torch.stack(rects)

        sample = {'image': reresized_image, 'boxes': boxes, 'neg_boxes': neg_boxes, 'pos': pos, 'gt_density': reresized_density, 'm_flag': m_flag}

        return sample
|
|
|
|
class ResizeValImage(ResizeSomeImage):
    """
    Validation transform without any randomness.

    The image is resized to ``max_hw * max_hw`` (aspect ratio is not kept),
    the dots are rasterised into a map that is Gaussian-smoothed (sigma 4) and
    multiplied by 60, and up to three positive and three negative boxes are
    cropped from the resized image and resized to 64 * 64.
    """

    def __init__(self, args, MAX_HW=384):
        super().__init__(args)
        self.max_hw = MAX_HW

    def _crop_exemplars(self, image_tensor, raw_boxes, scale_h, scale_w):
        """Return (64*64 crops, scaled box tensors) for the first three boxes."""
        to_patch = transforms.Resize((64, 64))
        crops = []
        rects = []
        for index, raw_box in enumerate(raw_boxes):
            if index >= 3:
                break
            coords = [int(value) for value in raw_box]
            y1 = int(coords[0] * scale_h)
            x1 = int(coords[1] * scale_w)
            y2 = int(coords[2] * scale_h)
            x2 = int(coords[3] * scale_w)
            rects.append(torch.tensor([y1, x1, y2, x2]))
            crops.append(to_patch(image_tensor[:, y1:y2 + 1, x1:x2 + 1]))
        return crops, rects

    def __call__(self, sample):
        """Transform one validation sample.

        Args:
            sample: dict with keys 'image', 'dots', 'm_flag', 'lines_boxes'
                and 'neg_lines_boxes'.

        Returns:
            dict with keys 'image', 'boxes', 'neg_boxes', 'pos', 'gt_density'
            and 'm_flag'.
        """
        image = sample['image']
        dots = sample['dots']
        m_flag = sample['m_flag']
        lines_boxes = sample['lines_boxes']
        neg_lines_boxes = sample['neg_lines_boxes']

        W, H = image.size

        side = self.max_hw
        scale_factor_h = float(side) / H
        scale_factor_w = float(side) / W
        resized_image = TTensor(transforms.Resize((side, side))(image))

        # Rasterise the dot annotations, clamping to the last row / column.
        dot_map = np.zeros((side, side), dtype='float32')
        last = side - 1
        for row in range(dots.shape[0]):
            dot = dots[row]
            y = min(last, int(dot[1] * scale_factor_h))
            x = min(last, int(dot[0] * scale_factor_w))
            dot_map[y][x] = 1

        smoothed = ndimage.gaussian_filter(dot_map, sigma=4, order=0)
        resized_density = torch.from_numpy(smoothed) * 60

        # Positive exemplars: crops plus their scaled coordinates.
        pos_crops, pos_rects = self._crop_exemplars(
            resized_image, lines_boxes, scale_factor_h, scale_factor_w)
        boxes = torch.stack(pos_crops)
        pos = torch.stack(pos_rects)

        # Negative exemplars: only the crops are returned.
        neg_crops, _ = self._crop_exemplars(
            resized_image, neg_lines_boxes, scale_factor_h, scale_factor_w)
        neg_boxes = torch.stack(neg_crops)

        return {'image': resized_image, 'boxes': boxes, 'neg_boxes': neg_boxes, 'pos': pos, 'gt_density': resized_density, 'm_flag': m_flag}
|
|
|
|
# Pre-training pipeline: random resized crop to MAX_HW, random horizontal
# flip, then conversion to a tensor.  Despite the name, no mean/std
# normalisation is applied here.
# NOTE(review): interpolation=3 is presumably PIL's bicubic constant - confirm
# against the installed torchvision version.
PreTrainNormalize = transforms.Compose([
    transforms.RandomResizedCrop(MAX_HW, scale=(0.2, 1.0), interpolation=3),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),

])

# Plain conversion to a tensor, with no other processing.
TTensor = transforms.Compose([
    transforms.ToTensor(),
])

# Photometric augmentation: colour jitter followed by Gaussian blur.
Augmentation = transforms.Compose([
    transforms.ColorJitter(brightness=0.25, contrast=0.15, saturation=0.15, hue=0.15),
    transforms.GaussianBlur(kernel_size=(7, 9))
])

# Tensor conversion followed by per-channel normalisation with
# IM_NORM_MEAN / IM_NORM_STD.
Normalize = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=IM_NORM_MEAN, std=IM_NORM_STD)
])
|
|
|
|
def transform_train(args: Namespace, do_aug=True):
    """Return a Compose holding a single ResizeTrainImage built from `args`.

    `do_aug` is forwarded to ResizeTrainImage and switches its augmentations.
    """
    train_step = ResizeTrainImage(args, MAX_HW, do_aug)
    return transforms.Compose([train_step])
|
|
def transform_val(args: Namespace):
    """Return a Compose holding a single ResizeValImage built from `args`."""
    val_step = ResizeValImage(args, MAX_HW)
    return transforms.Compose([val_step])
|
|
def transform_pre_train(args: Namespace):
    """Return a Compose holding a single ResizePreTrainImage built from `args`."""
    pre_train_step = ResizePreTrainImage(args, MAX_HW)
    return transforms.Compose([pre_train_step])
|
|