Balancing between obj and noobj part of confidence loss. Computation …

…of mAP by class-wise average of APs.
mvcaro · Oct 1, 2018 · 38e2ecc · 38e2ecc
1 parent 959e0ff
commit 38e2ecc
Show file tree

Hide file tree

Showing 4 changed files with 370 additions and 238 deletions.
diff --git a/models.py b/models.py
@@ -15,109 +15,120 @@
 import matplotlib.pyplot as plt
 import matplotlib.patches as patches
 
+
 def create_modules(module_defs):
     """
     Constructs module list of layer blocks from module configuration in module_defs
     """
     hyperparams = module_defs.pop(0)
-    output_filters = [int(hyperparams['channels'])]
+    output_filters = [int(hyperparams["channels"])]
     module_list = nn.ModuleList()
     for i, module_def in enumerate(module_defs):
         modules = nn.Sequential()
 
-        if module_def['type'] == 'convolutional':
-            bn = int(module_def['batch_normalize'])
-            filters = int(module_def['filters'])
-            kernel_size = int(module_def['size'])
-            pad = (kernel_size - 1) // 2 if int(module_def['pad']) else 0
-            modules.add_module('conv_%d' % i, nn.Conv2d(in_channels=output_filters[-1],
-                                                        out_channels=filters,
-                                                        kernel_size=kernel_size,
-                                                        stride=int(module_def['stride']),
-                                                        padding=pad,
-                                                        bias=not bn))
+        if module_def["type"] == "convolutional":
+            bn = int(module_def["batch_normalize"])
+            filters = int(module_def["filters"])
+            kernel_size = int(module_def["size"])
+            pad = (kernel_size - 1) // 2 if int(module_def["pad"]) else 0
+            modules.add_module(
+                "conv_%d" % i,
+                nn.Conv2d(
+                    in_channels=output_filters[-1],
+                    out_channels=filters,
+                    kernel_size=kernel_size,
+                    stride=int(module_def["stride"]),
+                    padding=pad,
+                    bias=not bn,
+                ),
+            )
             if bn:
-                modules.add_module('batch_norm_%d' % i, nn.BatchNorm2d(filters))
-            if module_def['activation'] == 'leaky':
-                modules.add_module('leaky_%d' % i, nn.LeakyReLU(0.1))
+                modules.add_module("batch_norm_%d" % i, nn.BatchNorm2d(filters))
+            if module_def["activation"] == "leaky":
+                modules.add_module("leaky_%d" % i, nn.LeakyReLU(0.1))
 
-        elif module_def['type'] == 'upsample':
-            upsample = nn.Upsample( scale_factor=int(module_def['stride']),
-                                    mode='nearest')
-            modules.add_module('upsample_%d' % i, upsample)
+        elif module_def["type"] == "upsample":
+            upsample = nn.Upsample(scale_factor=int(module_def["stride"]), mode="nearest")
+            modules.add_module("upsample_%d" % i, upsample)
 
-        elif module_def['type'] == 'route':
-            layers = [int(x) for x in module_def["layers"].split(',')]
+        elif module_def["type"] == "route":
+            layers = [int(x) for x in module_def["layers"].split(",")]
             filters = sum([output_filters[layer_i] for layer_i in layers])
-            modules.add_module('route_%d' % i, EmptyLayer())
+            modules.add_module("route_%d" % i, EmptyLayer())
 
-        elif module_def['type'] == 'shortcut':
-            filters = output_filters[int(module_def['from'])]
+        elif module_def["type"] == "shortcut":
+            filters = output_filters[int(module_def["from"])]
             modules.add_module("shortcut_%d" % i, EmptyLayer())
 
         elif module_def["type"] == "yolo":
             anchor_idxs = [int(x) for x in module_def["mask"].split(",")]
             # Extract anchors
             anchors = [int(x) for x in module_def["anchors"].split(",")]
-            anchors = [(anchors[i], anchors[i+1]) for i in range(0, len(anchors),2)]
+            anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)]
             anchors = [anchors[i] for i in anchor_idxs]
-            num_classes = int(module_def['classes'])
-            img_height = int(hyperparams['height'])
+            num_classes = int(module_def["classes"])
+            img_height = int(hyperparams["height"])
             # Define detection layer
             yolo_layer = YOLOLayer(anchors, num_classes, img_height)
-            modules.add_module('yolo_%d' % i, yolo_layer)
+            modules.add_module("yolo_%d" % i, yolo_layer)
         # Register module list and number of output filters
         module_list.append(modules)
         output_filters.append(filters)
 
     return hyperparams, module_list
 
+
 class EmptyLayer(nn.Module):
     """Placeholder for 'route' and 'shortcut' layers"""
+
     def __init__(self):
         super(EmptyLayer, self).__init__()
 
+
 class YOLOLayer(nn.Module):
     """Detection layer"""
+
     def __init__(self, anchors, num_classes, img_dim):
         super(YOLOLayer, self).__init__()
         self.anchors = anchors
         self.num_anchors = len(anchors)
         self.num_classes = num_classes
         self.bbox_attrs = 5 + num_classes
-        self.img_dim = img_dim
+        self.image_dim = img_dim
         self.ignore_thres = 0.5
         self.lambda_coord = 1
 
-        self.mse_loss = nn.MSELoss()
-        self.bce_loss = nn.BCELoss()
+        self.mse_loss = nn.MSELoss(size_average=True)  # Coordinate loss
+        self.bce_loss = nn.BCELoss(size_average=True)  # Confidence loss
+        self.ce_loss = nn.CrossEntropyLoss()  # Class loss
 
     def forward(self, x, targets=None):
-        bs = x.size(0)
-        g_dim = x.size(2)
-        stride =  self.img_dim / g_dim
+        nA = self.num_anchors
+        nB = x.size(0)
+        nG = x.size(2)
+        stride = self.image_dim / nG
+
         # Tensors for cuda support
         FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
         LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor
+        ByteTensor = torch.cuda.ByteTensor if x.is_cuda else torch.ByteTensor
 
-        prediction = x.view(bs,  self.num_anchors, self.bbox_attrs, g_dim, g_dim).permute(0, 1, 3, 4, 2).contiguous()
+        prediction = x.view(nB, nA, self.bbox_attrs, nG, nG).permute(0, 1, 3, 4, 2).contiguous()
 
         # Get outputs
-        x = torch.sigmoid(prediction[..., 0])          # Center x
-        y = torch.sigmoid(prediction[..., 1])          # Center y
-        w = prediction[..., 2]                         # Width
-        h = prediction[..., 3]                         # Height
-        conf = torch.sigmoid(prediction[..., 4])       # Conf
+        x = torch.sigmoid(prediction[..., 0])  # Center x
+        y = torch.sigmoid(prediction[..., 1])  # Center y
+        w = prediction[..., 2]  # Width
+        h = prediction[..., 3]  # Height
+        pred_conf = torch.sigmoid(prediction[..., 4])  # Conf
         pred_cls = torch.sigmoid(prediction[..., 5:])  # Cls pred.
 
         # Calculate offsets for each grid
-        grid_x = torch.linspace(0, g_dim-1, g_dim).repeat(g_dim,1).repeat(bs*self.num_anchors, 1, 1).view(x.shape).type(FloatTensor)
-        grid_y = torch.linspace(0, g_dim-1, g_dim).repeat(g_dim,1).t().repeat(bs*self.num_anchors, 1, 1).view(y.shape).type(FloatTensor)
-        scaled_anchors = [(a_w / stride, a_h / stride) for a_w, a_h in self.anchors]
-        anchor_w = FloatTensor(scaled_anchors).index_select(1, LongTensor([0]))
-        anchor_h = FloatTensor(scaled_anchors).index_select(1, LongTensor([1]))
-        anchor_w = anchor_w.repeat(bs, 1).repeat(1, 1, g_dim*g_dim).view(w.shape)
-        anchor_h = anchor_h.repeat(bs, 1).repeat(1, 1, g_dim*g_dim).view(h.shape)
+        grid_x = torch.arange(nG).repeat(nG, 1).view([1, 1, nG, nG]).type(FloatTensor)
+        grid_y = torch.arange(nG).repeat(nG, 1).t().view([1, 1, nG, nG]).type(FloatTensor)
+        scaled_anchors = FloatTensor([(a_w / stride, a_h / stride) for a_w, a_h in self.anchors])
+        anchor_w = scaled_anchors[:, 0:1].view((1, nA, 1, 1))
+        anchor_h = scaled_anchors[:, 1:2].view((1, nA, 1, 1))
 
         # Add offset and scale with anchors
         pred_boxes = FloatTensor(prediction[..., :4].shape)
@@ -133,74 +144,100 @@ def forward(self, x, targets=None):
                 self.mse_loss = self.mse_loss.cuda()
                 self.bce_loss = self.bce_loss.cuda()
 
-            nGT, nCorrect, mask, conf_mask, tx, ty, tw, th, tconf, tcls = build_targets(pred_boxes.cpu().data,
-                                                                            targets.cpu().data,
-                                                                            scaled_anchors,
-                                                                            self.num_anchors,
-                                                                            self.num_classes,
-                                                                            g_dim,
-                                                                            self.ignore_thres,
-                                                                            self.img_dim)
-
-            nProposals = int((conf > 0.25).sum().item())
+            nGT, nCorrect, mask, conf_mask, tx, ty, tw, th, tconf, tcls = build_targets(
+                pred_boxes=pred_boxes.cpu().data,
+                pred_conf=pred_conf.cpu().data,
+                pred_cls=pred_cls.cpu().data,
+                target=targets.cpu().data,
+                anchors=scaled_anchors.cpu().data,
+                num_anchors=nA,
+                num_classes=self.num_classes,
+                grid_size=nG,
+                ignore_thres=self.ignore_thres,
+                img_dim=self.image_dim,
+            )
+
+            nProposals = int((pred_conf > 0.25).sum().item())
             recall = float(nCorrect / nGT) if nGT else 1
 
             # Handle masks
-            mask = Variable(mask.type(FloatTensor))
-            cls_mask = Variable(mask.unsqueeze(-1).repeat(1, 1, 1, 1, self.num_classes).type(FloatTensor))
-            conf_mask = Variable(conf_mask.type(FloatTensor))
+            mask = Variable(mask.type(ByteTensor))
+            conf_mask = Variable(conf_mask.type(ByteTensor))
 
             # Handle target variables
-            tx    = Variable(tx.type(FloatTensor), requires_grad=False)
-            ty    = Variable(ty.type(FloatTensor), requires_grad=False)
-            tw    = Variable(tw.type(FloatTensor), requires_grad=False)
-            th    = Variable(th.type(FloatTensor), requires_grad=False)
+            tx = Variable(tx.type(FloatTensor), requires_grad=False)
+            ty = Variable(ty.type(FloatTensor), requires_grad=False)
+            tw = Variable(tw.type(FloatTensor), requires_grad=False)
+            th = Variable(th.type(FloatTensor), requires_grad=False)
             tconf = Variable(tconf.type(FloatTensor), requires_grad=False)
-            tcls  = Variable(tcls.type(FloatTensor), requires_grad=False)
+            tcls = Variable(tcls.type(LongTensor), requires_grad=False)
+
+            # Get conf mask where gt and where there is no gt
+            conf_mask_true = mask
+            conf_mask_false = conf_mask - mask
 
             # Mask outputs to ignore non-existing objects
-            loss_x = self.lambda_coord * self.bce_loss(x * mask, tx * mask)
-            loss_y = self.lambda_coord * self.bce_loss(y * mask, ty * mask)
-            loss_w = self.lambda_coord * self.mse_loss(w * mask, tw * mask) / 2
-            loss_h = self.lambda_coord * self.mse_loss(h * mask, th * mask) / 2
-            loss_conf = self.bce_loss(conf * conf_mask, tconf * conf_mask)
-            loss_cls = self.bce_loss(pred_cls * cls_mask, tcls * cls_mask)
+            loss_x = self.mse_loss(x[mask], tx[mask])
+            loss_y = self.mse_loss(y[mask], ty[mask])
+            loss_w = self.mse_loss(w[mask], tw[mask])
+            loss_h = self.mse_loss(h[mask], th[mask])
+            loss_conf = (1 / conf_mask_false.sum().float()) * self.bce_loss(
+                pred_conf[conf_mask_false], tconf[conf_mask_false]
+            ) + (1 / conf_mask_true.sum().float()) * self.bce_loss(pred_conf[conf_mask_true], tconf[conf_mask_true])
+            loss_cls = (1 / nB) * self.ce_loss(pred_cls[mask], torch.argmax(tcls[mask], 1))
             loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls
 
-            return loss, loss_x.item(), loss_y.item(), loss_w.item(), loss_h.item(), loss_conf.item(), loss_cls.item(), recall
+            return (
+                loss,
+                loss_x.item(),
+                loss_y.item(),
+                loss_w.item(),
+                loss_h.item(),
+                loss_conf.item(),
+                loss_cls.item(),
+                recall,
+            )
 
         else:
             # If not in training phase return predictions
-            output = torch.cat((pred_boxes.view(bs, -1, 4) * stride, conf.view(bs, -1, 1), pred_cls.view(bs, -1, self.num_classes)), -1)
-            return output.data
+            output = torch.cat(
+                (
+                    pred_boxes.view(nB, -1, 4) * stride,
+                    pred_conf.view(nB, -1, 1),
+                    pred_cls.view(nB, -1, self.num_classes),
+                ),
+                -1,
+            )
+            return output
 
 
 class Darknet(nn.Module):
     """YOLOv3 object detection model"""
+
     def __init__(self, config_path, img_size=416):
         super(Darknet, self).__init__()
         self.module_defs = parse_model_config(config_path)
         self.hyperparams, self.module_list = create_modules(self.module_defs)
         self.img_size = img_size
         self.seen = 0
         self.header_info = np.array([0, 0, 0, self.seen, 0])
-        self.loss_names = ['x', 'y', 'w', 'h', 'conf', 'cls', 'recall']
+        self.loss_names = ["x", "y", "w", "h", "conf", "cls", "recall"]
 
     def forward(self, x, targets=None):
         is_training = targets is not None
         output = []
         self.losses = defaultdict(float)
         layer_outputs = []
         for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)):
-            if module_def['type'] in ['convolutional', 'upsample']:
+            if module_def["type"] in ["convolutional", "upsample"]:
                 x = module(x)
-            elif module_def['type'] == 'route':
-                layer_i = [int(x) for x in module_def['layers'].split(',')]
+            elif module_def["type"] == "route":
+                layer_i = [int(x) for x in module_def["layers"].split(",")]
                 x = torch.cat([layer_outputs[i] for i in layer_i], 1)
-            elif module_def['type'] == 'shortcut':
-                layer_i = int(module_def['from'])
+            elif module_def["type"] == "shortcut":
+                layer_i = int(module_def["from"])
                 x = layer_outputs[-1] + layer_outputs[layer_i]
-            elif module_def['type'] == 'yolo':
+            elif module_def["type"] == "yolo":
                 # Train phase: get loss
                 if is_training:
                     x, *losses = module[0](x, targets)
@@ -212,76 +249,76 @@ def forward(self, x, targets=None):
                 output.append(x)
             layer_outputs.append(x)
 
-        self.losses['recall'] /= 3
+        self.losses["recall"] /= 3
         return sum(output) if is_training else torch.cat(output, 1)
 
-
     def load_weights(self, weights_path):
         """Parses and loads the weights stored in 'weights_path'"""
 
-        #Open the weights file
+        # Open the weights file
         fp = open(weights_path, "rb")
-        header = np.fromfile(fp, dtype=np.int32, count=5)   # First five are header values
+        header = np.fromfile(fp, dtype=np.int32, count=5)  # First five are header values
 
         # Needed to write header when saving weights
         self.header_info = header
 
         self.seen = header[3]
-        weights = np.fromfile(fp, dtype=np.float32)         # The rest are weights
+        weights = np.fromfile(fp, dtype=np.float32)  # The rest are weights
         fp.close()
 
         ptr = 0
         for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)):
-            if module_def['type'] == 'convolutional':
+            if module_def["type"] == "convolutional":
                 conv_layer = module[0]
-                if module_def['batch_normalize']:
+                if module_def["batch_normalize"]:
                     # Load BN bias, weights, running mean and running variance
                     bn_layer = module[1]
-                    num_b = bn_layer.bias.numel() # Number of biases
+                    num_b = bn_layer.bias.numel()  # Number of biases
                     # Bias
-                    bn_b = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(bn_layer.bias)
+                    bn_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.bias)
                     bn_layer.bias.data.copy_(bn_b)
                     ptr += num_b
                     # Weight
-                    bn_w = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(bn_layer.weight)
+                    bn_w = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.weight)
                     bn_layer.weight.data.copy_(bn_w)
                     ptr += num_b
                     # Running Mean
-                    bn_rm = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(bn_layer.running_mean)
+                    bn_rm = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_mean)
                     bn_layer.running_mean.data.copy_(bn_rm)
                     ptr += num_b
                     # Running Var
-                    bn_rv = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(bn_layer.running_var)
+                    bn_rv = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_var)
                     bn_layer.running_var.data.copy_(bn_rv)
                     ptr += num_b
                 else:
                     # Load conv. bias
                     num_b = conv_layer.bias.numel()
-                    conv_b = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(conv_layer.bias)
+                    conv_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(conv_layer.bias)
                     conv_layer.bias.data.copy_(conv_b)
                     ptr += num_b
                 # Load conv. weights
                 num_w = conv_layer.weight.numel()
-                conv_w = torch.from_numpy(weights[ptr:ptr + num_w]).view_as(conv_layer.weight)
+                conv_w = torch.from_numpy(weights[ptr : ptr + num_w]).view_as(conv_layer.weight)
                 conv_layer.weight.data.copy_(conv_w)
                 ptr += num_w
 
     """
         @:param path    - path of the new weights file
         @:param cutoff  - save layers between 0 and cutoff (cutoff = -1 -> all are saved)
     """
+
     def save_weights(self, path, cutoff=-1):
 
-        fp = open(path, 'wb')
+        fp = open(path, "wb")
         self.header_info[3] = self.seen
         self.header_info.tofile(fp)
 
         # Iterate through layers
         for i, (module_def, module) in enumerate(zip(self.module_defs[:cutoff], self.module_list[:cutoff])):
-            if module_def['type'] == 'convolutional':
+            if module_def["type"] == "convolutional":
                 conv_layer = module[0]
                 # If batch norm, load bn first
-                if module_def['batch_normalize']:
+                if module_def["batch_normalize"]:
                     bn_layer = module[1]
                     bn_layer.bias.data.cpu().numpy().tofile(fp)
                     bn_layer.weight.data.cpu().numpy().tofile(fp)