SSD Detection (Part 5)


c_conv10_2 = c_conv10_2.view(batch_size, -1, self.n_classes) # (N, 36, n_classes)
c_conv11_2 = self.cl_conv11_2(conv11_2_feats) # (N, 4 * n_classes, 1, 1)
c_conv11_2 = c_conv11_2.permute(0, 2, 3, 1).contiguous() # (N, 1, 1, 4 * n_classes)
c_conv11_2 = c_conv11_2.view(batch_size, -1, self.n_classes) # (N, 4, n_classes)
# A total of 8732 boxes
# Concatenate in this specific order
locs = torch.cat([l_conv4_3, l_conv7, l_conv8_2, l_conv9_2, l_conv10_2, l_conv11_2], dim=1) # (N, 8732, 4)
classes_scores = torch.cat([c_conv4_3, c_conv7, c_conv8_2, c_conv9_2, c_conv10_2, c_conv11_2], dim=1) # (N, 8732, n_classes)
return locs, classes_scores
This may look complicated, but it simply takes all of the feature maps obtained from the base VGG-16 and the auxiliary convolutions, and applies convolutional layers to each of them to predict class scores and bounding-box offsets.
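The total of 8,732 boxes follows directly from the six feature-map sizes and the number of priors per location; here is a quick stand-alone sanity check (the dictionary names are chosen here just for illustration):
import math

fmap_dims = {'conv4_3': 38, 'conv7': 19, 'conv8_2': 10, 'conv9_2': 5, 'conv10_2': 3, 'conv11_2': 1}
priors_per_location = {'conv4_3': 4, 'conv7': 6, 'conv8_2': 6, 'conv9_2': 6, 'conv10_2': 4, 'conv11_2': 4}

# Each location in a feature map predicts priors_per_location boxes
total = sum(fmap_dims[f] ** 2 * priors_per_location[f] for f in fmap_dims)
print(total)  # 5776 + 2166 + 600 + 150 + 36 + 4 = 8732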
Putting It All Together
Now let's put everything together and look at the final architecture, shown below.
class SSD300(nn.Module):
    """
    The SSD300 network – encapsulates the base VGG network, auxiliary, and prediction convolutions.
    """

    def __init__(self, n_classes, device):
        super(SSD300, self).__init__()
        self.n_classes = n_classes
        self.device = device
        self.base = VGGBase()
        self.aux_convs = AuxiliaryConvolutions()
        self.pred_convs = PredictionConvolutions(n_classes)

        # Since lower level features (conv4_3_feats) have considerably larger scales, we take the L2 norm and rescale
        # Rescale factor is initially set at 20, but is learned for each channel during back-prop
        self.rescale_factors = nn.Parameter(torch.FloatTensor(1, 512, 1, 1))  # there are 512 channels in conv4_3_feats
        nn.init.constant_(self.rescale_factors, 20)

        # Prior boxes
        self.priors_cxcy = self.create_prior_boxes()
        self.to(device)

    def forward(self, image):
        """
        Forward propagation.
        :param image: images, a tensor of dimensions (N, 3, 300, 300)
        :return: 8732 locations and class scores (i.e. w.r.t each prior box) for each image
        """
        # Run VGG base network convolutions
        conv4_3_feats, conv7_feats = self.base(image)  # (N, 512, 38, 38), (N, 1024, 19, 19)

        # Rescale conv4_3 after L2 norm
        norm = conv4_3_feats.pow(2).sum(dim=1, keepdim=True).sqrt()  # (N, 1, 38, 38)
        conv4_3_feats = conv4_3_feats / norm  # (N, 512, 38, 38)
        conv4_3_feats = conv4_3_feats * self.rescale_factors  # (N, 512, 38, 38)

        # Run auxiliary convolutions
        # (N, 512, 10, 10), (N, 256, 5, 5), (N, 256, 3, 3), (N, 256, 1, 1)
        conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats = self.aux_convs(conv7_feats)

        # Run prediction convolutions
        # (N, 8732, 4), (N, 8732, n_classes)
        locs, classes_scores = self.pred_convs(conv4_3_feats, conv7_feats, conv8_2_feats,
                                               conv9_2_feats, conv10_2_feats, conv11_2_feats)

        return locs, classes_scores
Note that the lower-level features (conv4_3_feats) have a considerably larger scale, so we take their L2 norm and rescale them. The rescale factor is initially set to 20, but is learned for each channel during back-propagation.
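As a quick shape check, we can push a dummy batch through the model. This is a minimal sketch; it assumes VGGBase, AuxiliaryConvolutions, PredictionConvolutions, and create_prior_boxes are defined as in the earlier parts of this series:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SSD300(n_classes=21, device=device)  # e.g. 20 PASCAL VOC classes + background
locs, classes_scores = model(torch.randn(2, 3, 300, 300).to(device))
print(locs.shape)            # torch.Size([2, 8732, 4])
print(classes_scores.shape)  # torch.Size([2, 8732, 21])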
Loss Function
The localization loss is the Smooth L1 loss, while the classification loss is the well-known cross-entropy loss.
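In the SSD paper the two terms are combined into a single MultiBox objective with a balancing weight α (set to 1 in the paper) and normalized by the number of matched (positive) priors. The sketch below shows this basic structure, assuming the matching step described next has already encoded a target box and class for every prior; hard negative mining is omitted here and sketched further below. The function name multibox_loss is illustrative, not the author's exact implementation:
import torch
import torch.nn.functional as F

def multibox_loss(predicted_locs, predicted_scores, true_locs, true_classes, alpha=1.0):
    """predicted_locs: (N, 8732, 4), predicted_scores: (N, 8732, n_classes),
    true_locs: (N, 8732, 4), true_classes: (N, 8732) with 0 = background."""
    positives = true_classes != 0  # (N, 8732) mask of non-background priors
    n_positives = positives.sum().clamp(min=1).float()

    # Localization loss: Smooth L1, computed over positive priors only
    loc_loss = F.smooth_l1_loss(predicted_locs[positives], true_locs[positives], reduction='sum')

    # Classification loss: cross-entropy over all priors (before hard negative mining)
    conf_loss = F.cross_entropy(predicted_scores.view(-1, predicted_scores.size(-1)),
                                true_classes.view(-1), reduction='sum')

    return (conf_loss + alpha * loc_loss) / n_positives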
Matching Strategy
During training, we need to determine which of the generated prior boxes should correspond to ground-truth boxes and be included in the loss computation. To that end, we match each ground-truth box to the prior box with the highest Jaccard overlap. In addition, we also match any prior whose overlap with a ground-truth box is at least 0.5, which allows the network to predict high scores for multiple overlapping priors.
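A minimal sketch of this matching step, using torchvision's box_iou for the Jaccard overlap (boxes in boundary (x_min, y_min, x_max, y_max) form; the function name match_priors and the return convention are illustrative):
import torch
from torchvision.ops import box_iou

def match_priors(gt_boxes, priors_xy, threshold=0.5):
    """gt_boxes: (n_objects, 4), priors_xy: (8732, 4).
    Returns, for each prior, the index of its matched object, or -1 for background."""
    overlap = box_iou(gt_boxes, priors_xy)  # (n_objects, 8732) Jaccard overlaps

    # For each prior, the object it overlaps most
    best_overlap_per_prior, best_object_per_prior = overlap.max(dim=0)  # both (8732,)

    # Force-match each object to its best prior, even if that overlap is below the threshold
    _, best_prior_per_object = overlap.max(dim=1)  # (n_objects,)
    best_object_per_prior[best_prior_per_object] = torch.arange(gt_boxes.size(0), device=gt_boxes.device)
    best_overlap_per_prior[best_prior_per_object] = 1.0

    # All remaining priors whose overlap is below the threshold are background
    matched = best_object_per_prior.clone()
    matched[best_overlap_per_prior < threshold] = -1
    return matched
In the actual training pipeline, the matched ground-truth boxes are then encoded as center-size offsets relative to their priors before the Smooth L1 loss is applied.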
After the matching step, most priors (default boxes) end up as negative samples. To avoid a severe imbalance between positive and negative samples, we keep at most a 3:1 ratio of negatives to positives, which leads to faster optimization and more stable learning. Once again, the localization loss is computed only over the positive (non-background) priors.
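This hard negative mining can be sketched as follows: given the per-prior classification losses, rank the negatives by how badly the model gets them wrong and keep only the hardest 3·N_pos of them. The double-sort rank trick below is a common way to build the mask in the original prior order; the function name is illustrative:
import torch

def hard_negative_mining(conf_loss_all, positives, neg_pos_ratio=3):
    """conf_loss_all: (N, 8732) per-prior cross-entropy losses;
    positives: (N, 8732) boolean mask of matched priors.
    Returns a boolean mask selecting the hardest negatives, at most neg_pos_ratio per positive."""
    n_hard_negatives = neg_pos_ratio * positives.sum(dim=1, keepdim=True)  # (N, 1)

    loss = conf_loss_all.clone()
    loss[positives] = float('-inf')  # positives can never be selected as negatives

    _, idx = loss.sort(dim=1, descending=True)  # priors ordered hardest-first
    _, rank = idx.sort(dim=1)                   # hardness rank of each prior
    return rank < n_hard_negatives              # (N, 8732) mask of kept negatives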
