Max Meyer
committed on
Fix example, load weights safely and remove extra whitespace (#2)
- Fix example, load weights safely, remove whitespace (e840827775a894eaf8241a70535c2921b36bd430)
- README.md +11 -11
- image2.png +0 -0
- model.py +27 -27
README.md
CHANGED

@@ -13,34 +13,34 @@ tags:

 # BEN - Background Erase Network (Beta Base Model)

-BEN is a deep learning model designed to automatically remove backgrounds from images, producing both a mask and a foreground image.
+BEN is a deep learning model designed to automatically remove backgrounds from images, producing both a mask and a foreground image.

 - MADE IN AMERICA

 ## Quick Start Code
+
 ```python
-
+import model
 from PIL import Image
 import torch


 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

-file = "./image2.
+file = "./image2.png" # input image

-model = model.BEN_Base().to(device).eval() #init pipeline
+model = model.BEN_Base().to(device).eval() #init pipeline

-model.loadcheckpoints("./
+model.loadcheckpoints("./BEN_Base.pth")
 image = Image.open(file)
-
+with torch.no_grad():
+    mask, foreground = model.inference(image)

 mask.save("./mask.png")
 foreground.save("./foreground.png")
-
-
-
 ```
-
+
+# BEN SOA Benchmarks on Disk 5k Eval

 ![image/png](https://cdn-uploads.huggingface.co/production/uploads/6644bcb4dd9d5cd99d9aa98d/MzUj_kqsJGp8PCrRhMzBk.png)


@@ -84,4 +84,4 @@ foreground.save("./foreground.png")

 ## Installation
 1. Clone Repo
-2. Install requirements.txt
+2. Install requirements.txt
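With these fixes the README quick start runs end to end. A consolidated sketch of the updated example is shown below for reference; it assumes model.py, the BEN_Base.pth checkpoint, and an input image image2.png all sit in the working directory, and the instance is named net here only to avoid shadowing the imported model module.

```python
# Consolidated quick start from the updated README.
# Assumptions: model.py, BEN_Base.pth and image2.png are in the current directory.
import model
from PIL import Image
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

file = "./image2.png"  # input image

net = model.BEN_Base().to(device).eval()  # init pipeline
net.loadcheckpoints("./BEN_Base.pth")     # after this commit, loaded with weights_only=True

image = Image.open(file)
with torch.no_grad():                     # inference only, no gradients needed
    mask, foreground = net.inference(image)

mask.save("./mask.png")
foreground.save("./foreground.png")
```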
image2.png
ADDED
model.py
CHANGED

@@ -560,7 +560,7 @@ class SwinTransformer(nn.Module):
 # interpolate the position embedding to the corresponding size
 absolute_pos_embed = F.interpolate(self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
 x = (x + absolute_pos_embed) # B Wh*Ww C
-
+
 outs = [x.contiguous()]
 x = x.flatten(2).transpose(1, 2)
 x = self.pos_drop(x)

@@ -634,7 +634,7 @@ class PositionEmbeddingSine:
 scale = 2 * math.pi
 self.scale = scale
 self.dim_t = torch.arange(0, self.num_pos_feats, dtype=torch.float32)
-
+
 def __call__(self, b, h, w):
 device = self.dim_t.device
 mask = torch.zeros([b, h, w], dtype=torch.bool, device=device)

@@ -646,18 +646,18 @@ class PositionEmbeddingSine:
 eps = 1e-6
 y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale
 x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale
-
+
 dim_t = self.temperature ** (2 * (self.dim_t.to(device) // 2) / self.num_pos_feats)
 pos_x = x_embed[:, :, :, None] / dim_t
 pos_y = y_embed[:, :, :, None] / dim_t
-
+
 pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
 pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
-
-return torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
+return torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)


-class MCLM(nn.Module):
+
+class MCLM(nn.Module):
 def __init__(self, d_model, num_heads, pool_ratios=[1, 4, 8]):
 super(MCLM, self).__init__()
 self.attention = nn.ModuleList([

@@ -688,10 +688,10 @@ class MCLM(nn.Module):
 l: 4,c,h,w
 g: 1,c,h,w
 """
-b, c, h, w = l.size()
+b, c, h, w = l.size()
 # 4,c,h,w -> 1,c,2h,2w
 concated_locs = rearrange(l, '(hg wg b) c h w -> b c (hg h) (wg w)', hg=2, wg=2)
-
+
 pools = []
 for pool_ratio in self.pool_ratios:
 # b,c,h,w

@@ -734,7 +734,7 @@ class MCLM(nn.Module):
 l_hw_b_c = l_hw_b_c + self.dropout1(outputs_re)
 l_hw_b_c = self.norm1(l_hw_b_c)
 l_hw_b_c = l_hw_b_c + self.dropout2(self.linear4(self.dropout(self.activation(self.linear3(l_hw_b_c)).clone())))
-l_hw_b_c = self.norm2(l_hw_b_c)
+l_hw_b_c = self.norm2(l_hw_b_c)

 l = torch.cat((l_hw_b_c, g_hw_b_c), 1) # hw,b(5),c
 return rearrange(l, "(h w) b c -> b c h w", h=h, w=w) ## (5,c,h*w)

@@ -770,42 +770,42 @@ class MCRM(nn.Module):

 def forward(self, x):
 device = x.device
-b, c, h, w = x.size()
+b, c, h, w = x.size()
 loc, glb = x.split([4, 1], dim=0) # 4,c,h,w; 1,c,h,w
-
+
 patched_glb = rearrange(glb, 'b c (hg h) (wg w) -> (hg wg b) c h w', hg=2, wg=2)
-
+
 token_attention_map = self.sigmoid(self.sal_conv(glb))
 token_attention_map = F.interpolate(token_attention_map, size=patches2image(loc).shape[-2:], mode='nearest')
 loc = loc * rearrange(token_attention_map, 'b c (hg h) (wg w) -> (hg wg b) c h w', hg=2, wg=2)
-
+
 pools = []
 for pool_ratio in self.pool_ratios:
 tgt_hw = (round(h / pool_ratio), round(w / pool_ratio))
 pool = F.adaptive_avg_pool2d(patched_glb, tgt_hw)
 pools.append(rearrange(pool, 'nl c h w -> nl c (h w)')) # nl(4),c,hw
-
+
 pools = rearrange(torch.cat(pools, 2), "nl c nphw -> nl nphw 1 c")
 loc_ = rearrange(loc, 'nl c h w -> nl (h w) 1 c')
-
+
 outputs = []
 for i, q in enumerate(loc_.unbind(dim=0)): # traverse all local patches
 v = pools[i]
 k = v
 outputs.append(self.attention[i](q, k, v)[0])
-
-outputs = torch.cat(outputs, 1)
+
+outputs = torch.cat(outputs, 1)
 src = loc.view(4, c, -1).permute(2, 0, 1) + self.dropout1(outputs)
 src = self.norm1(src)
 src = src + self.dropout2(self.linear4(self.dropout(self.activation(self.linear3(src)).clone())))
 src = self.norm2(src)
 src = src.permute(1, 2, 0).reshape(4, c, h, w) # freshed loc
 glb = glb + F.interpolate(patches2image(src), size=glb.shape[-2:], mode='nearest') # freshed glb
-
+
 return torch.cat((src, glb), 0), token_attention_map


-class BEN_Base(nn.Module):
+class BEN_Base(nn.Module):
 def __init__(self):
 super().__init__()


@@ -868,7 +868,7 @@ class BEN_Base(nn.Module):
 e5 = self.multifieldcrossatt(loc_e5, glb_e5) # (4,128,16,16)

 e4, tokenattmap4 = self.dec_blk4(e4 + resize_as(e5, e4))
-e4 = self.conv4(e4)
+e4 = self.conv4(e4)
 e3, tokenattmap3 = self.dec_blk3(e3 + resize_as(e4, e3))
 e3 = self.conv3(e3)
 e2, tokenattmap2 = self.dec_blk2(e2 + resize_as(e3, e2))

@@ -909,11 +909,11 @@ class BEN_Base(nn.Module):
 return blurred_mask, foreground

 def loadcheckpoints(self,model_path):
-model_dict = torch.load(model_path,map_location="cpu")
+model_dict = torch.load(model_path, map_location="cpu", weights_only=True)
 self.load_state_dict(model_dict['model_state_dict'], strict=True)
 del model_path

-
+


 def rgb_loader_refiner( original_image):

@@ -923,16 +923,16 @@ def rgb_loader_refiner( original_image):
 # Convert to RGB if necessary
 if image.mode != 'RGB':
 image = image.convert('RGB')
-
+
 # Resize the image
 image = image.resize((1024, 1024), resample=Image.LANCZOS)

-return image.convert('RGB'), h, w,original_image
-
+return image.convert('RGB'), h, w,original_image
+
 # Define the image transformation
 img_transform = transforms.Compose([
 transforms.ToTensor(),
-transforms.ConvertImageDtype(torch.float32),
+transforms.ConvertImageDtype(torch.float32),
 transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
 ])
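The one functional change in model.py is in loadcheckpoints: the checkpoint is now read with torch.load(..., weights_only=True), which restricts unpickling to tensors and primitive containers instead of arbitrary Python objects, so a tampered checkpoint file cannot run code at load time. A minimal sketch of the same pattern follows; the load_weights_safely name and checkpoint_path parameter are illustrative, while the 'model_state_dict' key matches the one this repository's checkpoint uses.

```python
import torch

def load_weights_safely(module: torch.nn.Module, checkpoint_path: str) -> None:
    # weights_only=True (available in recent PyTorch releases) refuses to
    # unpickle arbitrary objects, limiting the file to plain tensor data.
    checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
    # The BEN checkpoint stores its weights under 'model_state_dict'.
    module.load_state_dict(checkpoint["model_state_dict"], strict=True)
```

Keeping strict=True means load_state_dict still fails loudly if the checkpoint and the model architecture ever drift apart.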