
Object Detection and Segmentation

Object Detection Overview

The object detection task:
Input image → bounding boxes + class labels

IoU (Intersection over Union):
┌───────┐
│   A   │
│   ┌───┼───┐
└───┼───┘   │
    │   B   │
    └───────┘

IoU = Area(A∩B) / Area(A∪B)
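
A minimal IoU computation for two axis-aligned boxes in (x1, y1, x2, y2) format (a small illustrative helper, not tied to any library):

python
def iou(box_a, box_b):
    """IoU of two boxes given as (x1, y1, x2, y2)."""
    # Intersection rectangle
    x1 = max(box_a[0], box_b[0])
    y1 = max(box_a[1], box_b[1])
    x2 = min(box_a[2], box_b[2])
    y2 = min(box_a[3], box_b[3])
    inter = max(0, x2 - x1) * max(0, y2 - y1)
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    return inter / (area_a + area_b - inter + 1e-9)

# e.g. iou((0, 0, 2, 2), (1, 1, 3, 3)) == 1 / 7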

Two-Stage Detectors

The R-CNN Family

R-CNN: Selective Search → ~2000 region proposals → per-region CNN features → SVM classification

Fast R-CNN: one CNN pass over the whole image → ROI Pooling → fully connected layers → classification + box regression

Faster R-CNN: RPN (Region Proposal Network) replaces Selective Search → ROI Pooling → classification + box regression

Faster R-CNN

python
class FasterRCNN(nn.Module):
    def __init__(self, backbone, num_classes):
        super().__init__()
        self.backbone = backbone  # e.g. ResNet
        self.rpn = RPN()          # Region Proposal Network
        self.roi_pool = ROIAlign(pool_size=7, spatial_scale=1/16)
        self.head = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(),
            nn.Linear(4096, 4096),
            nn.ReLU()
        )
        self.cls_head = nn.Linear(4096, num_classes + 1)  # +1 for background
        self.reg_head = nn.Linear(4096, num_classes * 4)  # 4 coords per class

    def forward(self, x, targets=None):
        # Backbone features
        features = self.backbone(x)

        # RPN generates proposals (and its loss during training)
        proposals, rpn_loss = self.rpn(features, targets)

        # ROI pooling, then flatten for the fully connected head
        roi_features = self.roi_pool(features, proposals)
        head_feat = self.head(roi_features.flatten(1))

        # Classification and box regression share the head features
        cls_scores = self.cls_head(head_feat)
        bbox_pred = self.reg_head(head_feat)

        return cls_scores, bbox_pred, rpn_loss

RPN (Region Proposal Network)

python
class RPN(nn.Module):
    def __init__(self, in_channels=1024):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, 512, 3, 1, 1)
        self.cls_logits = nn.Conv2d(512, 18, 1)  # 9 anchors × 2 (object / background)
        self.bbox_pred = nn.Conv2d(512, 36, 1)   # 9 anchors × 4 (dx, dy, dw, dh)

    def forward(self, features, targets=None):
        x = F.relu(self.conv(features))

        # 9 anchors per location (3 scales × 3 aspect ratios)
        # scales: [128, 256, 512]
        # ratios: [0.5, 1.0, 2.0]

        objectness = self.cls_logits(x)  # (batch, 9*2, H, W)
        rpn_boxes = self.bbox_pred(x)    # (batch, 9*4, H, W)

        # Decode anchors + deltas into proposals (top-k by objectness, then NMS);
        # generate_proposals and the RPN loss are omitted from this sketch
        proposals = self.generate_proposals(rpn_boxes, objectness)
        rpn_loss = None  # computed against targets during training

        return proposals, rpn_loss
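
The 3-scale × 3-ratio anchor grid above can be materialized as follows; base_anchors is a hypothetical helper, and the w/h split per ratio follows one common area-preserving convention:

python
import itertools

def base_anchors(scales=(128, 256, 512), ratios=(0.5, 1.0, 2.0)):
    """Return the 9 anchors as (x1, y1, x2, y2), centered at (0, 0)."""
    anchors = []
    for s, r in itertools.product(scales, ratios):
        # Keep area ≈ s*s while varying the aspect ratio
        w = s * r ** 0.5
        h = s / r ** 0.5
        anchors.append((-w / 2, -h / 2, w / 2, h / 2))
    return anchors  # shifted to every feature-map location at runtime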

One-Stage Detectors

YOLO (You Only Look Once)

python
class YOLOv3(nn.Module):
    def __init__(self, num_classes=80):
        super().__init__()
        self.backbone = Darknet53()
        # 3 anchors per scale; finer strides are responsible for smaller objects
        self.yolo1 = YOLOLayer(256, 3, num_classes, stride=8)    # small objects
        self.yolo2 = YOLOLayer(512, 3, num_classes, stride=16)   # medium objects
        self.yolo3 = YOLOLayer(1024, 3, num_classes, stride=32)  # large objects

    def forward(self, x):
        # Darknet-53 yields feature maps at strides 8, 16 and 32
        feat8, feat16, feat32 = self.backbone(x)

        yolo1_out = self.yolo1(feat8)
        yolo2_out = self.yolo2(feat16)
        yolo3_out = self.yolo3(feat32)

        return [yolo1_out, yolo2_out, yolo3_out]


class YOLOLayer(nn.Module):
    def __init__(self, in_channels, num_anchors, num_classes, stride):
        super().__init__()
        self.num_anchors = num_anchors
        self.num_classes = num_classes
        self.stride = stride

        # Per anchor per cell: 5 + num_classes values
        # (tx, ty, tw, th, objectness, class probabilities)
        out_channels = num_anchors * (5 + num_classes)
        self.conv = nn.Conv2d(in_channels, out_channels, 1)

    def forward(self, x):
        # x: (batch, in_channels, H, W)
        out = self.conv(x)
        # out: (batch, num_anchors*(5+num_classes), H, W)

        bs, _, h, w = out.shape
        out = out.view(bs, self.num_anchors, -1, h, w)
        out = out.permute(0, 1, 3, 4, 2)  # (batch, anchors, H, W, 5+classes)

        return out

YOLO Output Format

Each grid cell predicts, per anchor:
┌──────────────────────────────────────┐
│ [tx, ty, tw, th, po, p1, p2, ...]    │
│  box center     object-  class       │
│  & size         ness     probs       │
└──────────────────────────────────────┘

Decoded bounding box (cx, cy = grid cell offset; pw, ph = anchor size):
bx = sigmoid(tx) + cx  # center x
by = sigmoid(ty) + cy  # center y
bw = pw * exp(tw)      # width
bh = ph * exp(th)      # height

Coordinates are then normalized to [0, 1] by the grid size.
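
A sketch of that decode for one scale, assuming out has the (batch, anchors, H, W, 5+classes) layout produced by YOLOLayer above and anchors holds (pw, ph) in grid units:

python
import torch

def decode_yolo(out, anchors):
    """out: (batch, A, H, W, 5+C); anchors: (A, 2) tensor of (pw, ph)."""
    bs, na, h, w, _ = out.shape
    # Grid of cell offsets (cy, cx)
    gy, gx = torch.meshgrid(torch.arange(h), torch.arange(w), indexing='ij')
    bx = (torch.sigmoid(out[..., 0]) + gx) / w  # normalized center x
    by = (torch.sigmoid(out[..., 1]) + gy) / h  # normalized center y
    bw = anchors[:, 0].view(1, na, 1, 1) * torch.exp(out[..., 2]) / w
    bh = anchors[:, 1].view(1, na, 1, 1) * torch.exp(out[..., 3]) / h
    obj = torch.sigmoid(out[..., 4])            # objectness
    cls = torch.sigmoid(out[..., 5:])           # class probabilities
    return bx, by, bw, bh, obj, cls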

SSD (Single Shot MultiBox Detector)

python
# Multi-scale feature maps + default boxes
class SSD300(nn.Module):
    def __init__(self, num_classes=21):
        super().__init__()
        self.base_network = VGG16()
        self.extras = ExtraLayers()  # extra feature layers after the backbone

        # Localization heads: 4 or 6 default boxes per location, 4 coords each
        self.loc = nn.ModuleList([
            nn.Conv2d(512, 4*4, 3, 1, 1),    # 38×38 feature map
            nn.Conv2d(1024, 6*4, 3, 1, 1),   # 19×19
            nn.Conv2d(512, 6*4, 3, 1, 1),    # 10×10
            nn.Conv2d(256, 6*4, 3, 1, 1),    # 5×5
            nn.Conv2d(256, 4*4, 3, 1, 1),    # 3×3
            nn.Conv2d(256, 4*4, 3, 1, 1),    # 1×1
        ])

        # Classification heads mirror the loc heads, one per scale
        self.cls = nn.ModuleList([
            nn.Conv2d(512, 4*num_classes, 3, 1, 1),
            # ...
        ])
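
A quick sanity check on the layout above: boxes-per-location × feature-map area, summed over the six scales, gives SSD300's well-known 8732 default boxes.

python
# (feature map size, default boxes per location)
scales = [(38, 4), (19, 6), (10, 6), (5, 6), (3, 4), (1, 4)]
total = sum(s * s * n for s, n in scales)
print(total)  # 8732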

Anchor-Free Detectors

CenterNet

python
# No anchors: predict center-point heatmaps and box sizes directly
class CenterNet(nn.Module):
    def __init__(self, num_classes=80):
        super().__init__()
        self.backbone = HourglassNetwork()  # assumed to output 64-channel features

        # Output heads: heatmap, size, offset
        self.hm_head = nn.Sequential(
            nn.Conv2d(64, 256, 3, 1, 1),
            nn.ReLU(),
            nn.Conv2d(256, num_classes, 1)
        )  # heatmap: (batch, num_classes, H, W)

        self.wh_head = nn.Conv2d(64, 2, 1)      # width & height
        self.offset_head = nn.Conv2d(64, 2, 1)  # sub-pixel center offset

    def forward(self, x):
        feat = self.backbone(x)

        hm = torch.sigmoid(self.hm_head(feat))  # center-point heatmap
        wh = self.wh_head(feat)                 # width & height
        offset = self.offset_head(feat)         # center offset

        return hm, wh, offset
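
CenterNet replaces NMS with a 3×3 max-pool over the heatmap: a location survives only if it equals its own pooled value, i.e. it is a local maximum. A minimal sketch of that peak extraction:

python
import torch.nn.functional as F

def heatmap_peaks(hm, k=100):
    """hm: (batch, C, H, W) heatmap; returns top-k peak scores and flat indices."""
    pooled = F.max_pool2d(hm, 3, stride=1, padding=1)
    hm = hm * (pooled == hm).float()     # zero out non-peaks
    scores, idx = hm.flatten(1).topk(k)  # top-k over classes × H × W
    return scores, idx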

FCOS (Fully Convolutional One-Stage)

python
# Per-pixel prediction: each location regresses distances to the four box edges
class FCOS(nn.Module):
    def forward(self, features, targets=None):
        outputs = []
        for feat in features:  # one FPN level at a time
            cls_logits = self.cls_head(feat)      # (N, C, H, W)
            reg_pred = self.reg_head(feat)        # (N, 4, H, W): (l, t, r, b)
            center_pred = self.center_head(feat)  # (N, 1, H, W): center-ness
            outputs.append((cls_logits, reg_pred, center_pred))
        return outputs
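
Decoding is direct: at a pixel (x, y), the predicted distances (l, t, r, b) give the box corners. A sketch, assuming points holds the (x, y) image coordinates of each feature-map location:

python
import torch

def decode_fcos(points, reg_pred):
    """points: (M, 2) locations; reg_pred: (M, 4) distances (l, t, r, b)."""
    x, y = points[:, 0], points[:, 1]
    l, t, r, b = reg_pred.unbind(dim=1)
    # Box corners from per-pixel edge distances
    return torch.stack([x - l, y - t, x + r, y + b], dim=1)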

Evaluation Metrics

python
# mAP (mean Average Precision)
# 1. For each class, compute AP
# 2. AP = area under the Precision-Recall curve
# 3. mAP = mean of the per-class APs

# IoU thresholds
# mAP@0.5: a detection counts as a true positive if IoU > 0.5
# mAP@0.5:0.95: mAP averaged over IoU thresholds from 0.5 to 0.95 (COCO style)

# FPS (Frames Per Second)
# detection speed
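
A minimal AP computation for one class, assuming each detection has already been matched against ground truth (1 = true positive, 0 = false positive) and sorted by descending confidence:

python
import numpy as np

def average_precision(tp, num_gt):
    """tp: 0/1 array over score-sorted detections; num_gt: number of GT boxes."""
    tp = np.asarray(tp, dtype=float)
    cum_tp = np.cumsum(tp)
    precision = cum_tp / (np.arange(len(tp)) + 1)
    recall = cum_tp / num_gt
    # Precision envelope, then area under the PR curve
    precision = np.maximum.accumulate(precision[::-1])[::-1]
    return np.sum(np.diff(np.concatenate(([0.0], recall))) * precision)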

Semantic Segmentation

FCN (Fully Convolutional Network)

python
# Replace fully connected layers with convolutions
class FCN8s(nn.Module):
    def __init__(self, num_classes=21):
        super().__init__()
        # encoder
        self.conv1 = nn.Conv2d(3, 64, 7, padding=3)
        self.pool1 = nn.MaxPool2d(2)
        # ... remaining VGG-style encoder layers

        # decoder
        self.upscore2 = nn.ConvTranspose2d(4096, num_classes, 4, 2)  # 2x upsample
        self.upscore3 = nn.ConvTranspose2d(num_classes, num_classes, 4, 2)  # 2x upsample
        self.upscore8 = nn.ConvTranspose2d(num_classes, num_classes, 16, 8)  # final 8x

        # skip connections fuse shallow features (VGG16: pool4 = 512 ch, pool3 = 256 ch)
        self.score_pool4 = nn.Conv2d(512, num_classes, 1)
        self.score_pool3 = nn.Conv2d(256, num_classes, 1)

    def forward(self, x):
        h = self.pool1(F.relu(self.conv1(x)))
        # ... rest of the encoder; keep the pool3 / pool4 outputs for the skips

        # decoder with skip connections
        h = self.upscore2(h)
        h = h + self.score_pool4(pool4)
        h = self.upscore3(h)
        h = h + self.score_pool3(pool3)

        return self.upscore8(h)  # upsample 8x back to input resolution

U-Net

python
class UNet(nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        
        # Encoder
        self.enc1 = DoubleConv(in_channels, 64)
        self.enc2 = DoubleConv(64, 128)
        self.enc3 = DoubleConv(128, 256)
        self.enc4 = DoubleConv(256, 512)
        
        # Bottleneck
        self.bottleneck = DoubleConv(512, 1024)
        
        # Decoder (each level: upsample, concatenate the skip, double conv)
        self.up4 = nn.ConvTranspose2d(1024, 512, 2, stride=2)
        self.dec4 = DoubleConv(1024, 512)
        self.up3 = nn.ConvTranspose2d(512, 256, 2, stride=2)
        self.dec3 = DoubleConv(512, 256)
        self.up2 = nn.ConvTranspose2d(256, 128, 2, stride=2)
        self.dec2 = DoubleConv(256, 128)
        self.up1 = nn.ConvTranspose2d(128, 64, 2, stride=2)
        self.dec1 = DoubleConv(128, 64)

        self.out = nn.Conv2d(64, out_channels, 1)
    
    def forward(self, x):
        # Encoder
        e1 = self.enc1(x)
        e2 = self.enc2(F.max_pool2d(e1, 2))
        e3 = self.enc3(F.max_pool2d(e2, 2))
        e4 = self.enc4(F.max_pool2d(e3, 2))
        
        # Bottleneck
        b = self.bottleneck(F.max_pool2d(e4, 2))
        
        # Decoder with skip connections
        d4 = self.dec4(torch.cat([self.up4(b), e4], dim=1))
        d3 = self.dec3(torch.cat([self.up3(d4), e3], dim=1))
        d2 = self.dec2(torch.cat([self.up2(d3), e2], dim=1))
        d1 = self.dec1(torch.cat([self.up1(d2), e1], dim=1))
        
        return self.out(d1)
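
UNet above leans on a DoubleConv helper the snippet never defines; a common implementation (two 3×3 convolutions, each followed by BatchNorm and ReLU) would be:

python
class DoubleConv(nn.Module):
    """(Conv 3×3 → BN → ReLU) × 2, used at every U-Net level."""
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.block = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, 3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, 3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        return self.block(x)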

The DeepLab Family

python
# DeepLabv3+: ASPP (Atrous Spatial Pyramid Pooling)
class ASPP(nn.Module):
    def __init__(self, in_channels, out_channels, atrous_rates=(6, 12, 18)):
        super().__init__()
        # One 1×1 conv plus one 3×3 atrous conv per rate
        self.convs = nn.ModuleList(
            [nn.Conv2d(in_channels, out_channels, 1)] +
            [nn.Conv2d(in_channels, out_channels, 3, padding=r, dilation=r)
             for r in atrous_rates]
        )
        # Image-level branch: global pool, then project to out_channels
        self.global_pool = nn.AdaptiveAvgPool2d(1)
        self.pool_conv = nn.Conv2d(in_channels, out_channels, 1)
        # Fuse all branches: 1×1 + len(rates) atrous + pooled
        self.out = nn.Conv2d(out_channels * (len(atrous_rates) + 2), out_channels, 1)

    def forward(self, x):
        res = [conv(x) for conv in self.convs]
        pooled = self.pool_conv(self.global_pool(x))
        res.append(F.interpolate(pooled, size=x.shape[2:], mode='bilinear',
                                 align_corners=False))
        return self.out(torch.cat(res, dim=1))

Instance Segmentation

Mask R-CNN

python
class MaskRCNN(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.backbone = ResNet()
        self.rpn = RPN()
        self.roi_pool = ROIAlign(pool_size=7, spatial_scale=1/16)

        # Classification and regression head
        self.box_head = nn.Sequential(
            nn.Linear(256*7*7, 1024),
            nn.ReLU(),
            nn.Linear(1024, 1024)
        )
        self.box_cls = nn.Linear(1024, num_classes)
        self.box_reg = nn.Linear(1024, num_classes*4)

        # Mask branch (in the paper it runs on its own 14×14 RoIAlign features)
        self.mask_head = nn.Sequential(
            nn.ConvTranspose2d(256, 256, 2, 2),
            nn.ConvTranspose2d(256, 128, 2, 2),
            nn.ConvTranspose2d(128, 64, 2, 2),
            nn.ConvTranspose2d(64, 32, 2, 2),
            nn.Conv2d(32, num_classes, 1)  # one mask per class
        )

    def forward(self, x, targets=None):
        features = self.backbone(x)
        proposals, rpn_loss = self.rpn(features, targets)

        # ROI pooling
        roi_features = self.roi_pool(features, proposals)

        # Classification and regression (flatten for the linear head)
        box_feat = self.box_head(roi_features.flatten(1))
        cls_scores = self.box_cls(box_feat)
        box_deltas = self.box_reg(box_feat)

        # Mask prediction
        mask_logits = self.mask_head(roi_features)

        return {
            'cls_scores': cls_scores,
            'box_deltas': box_deltas,
            'mask_logits': mask_logits,
            'rpn_loss': rpn_loss
        }
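
Mask R-CNN supervises only the mask channel belonging to each ROI's ground-truth class, with a per-pixel sigmoid and binary cross-entropy. A sketch of that loss:

python
import torch
import torch.nn.functional as F

def mask_loss(mask_logits, gt_classes, gt_masks):
    """mask_logits: (N, C, H, W); gt_classes: (N,); gt_masks: (N, H, W) in {0, 1}."""
    n = mask_logits.shape[0]
    # Pick each ROI's logits for its ground-truth class only
    selected = mask_logits[torch.arange(n), gt_classes]  # (N, H, W)
    return F.binary_cross_entropy_with_logits(selected, gt_masks.float())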

Segmentation Metrics

python
# Pixel Accuracy: fraction of correctly classified pixels
pixel_acc = (TP + TN) / (TP + TN + FP + FN)

# Mean IoU: IoU computed per class, then averaged
mean_iou = np.mean(per_class_iou)

# Dice Coefficient (equivalent to F1)
dice = 2 * |A ∩ B| / (|A| + |B|)

# Boundary IoU (common in medical imaging):
# IoU restricted to a narrow band around the mask boundary

# PQ (Panoptic Quality), for panoptic segmentation:
# PQ = Σ(IoU of matched TP pairs) / (|TP| + 0.5|FP| + 0.5|FN|)
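
A sketch of mean IoU and Dice over label maps / binary masks, assuming plain numpy arrays:

python
import numpy as np

def mean_iou(pred, gt, num_classes):
    """Mean of per-class IoU; classes absent from both maps are skipped."""
    ious = []
    for c in range(num_classes):
        inter = np.logical_and(pred == c, gt == c).sum()
        union = np.logical_or(pred == c, gt == c).sum()
        if union > 0:
            ious.append(inter / union)
    return float(np.mean(ious))

def dice(pred_mask, gt_mask):
    """Dice coefficient of two binary masks."""
    inter = np.logical_and(pred_mask, gt_mask).sum()
    return 2 * inter / (pred_mask.sum() + gt_mask.sum() + 1e-9)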

Model Comparison

Model          Highlights
Faster R-CNN   Two-stage, high accuracy
YOLOv5         One-stage, good speed/accuracy balance
YOLOv8         Newer YOLO, incremental improvements
RetinaNet      Focal Loss to handle class imbalance
FCOS           Anchor-free, per-pixel prediction
CenterNet      Anchor-free, center-point based
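
RetinaNet's Focal Loss (see also point 7 below) down-weights easy examples by a factor (1 - p_t)^γ; a minimal binary sketch:

python
import torch
import torch.nn.functional as F

def focal_loss(logits, targets, alpha=0.25, gamma=2.0):
    """Binary focal loss: BCE scaled by alpha_t * (1 - p_t)^gamma."""
    p = torch.sigmoid(logits)
    ce = F.binary_cross_entropy_with_logits(logits, targets, reduction='none')
    p_t = p * targets + (1 - p) * (1 - targets)
    alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
    return (alpha_t * (1 - p_t) ** gamma * ce).mean()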

Key Interview Points

1. One-Stage vs Two-Stage
   - Two-stage: higher accuracy, slower (R-CNN family)
   - One-stage: faster, slightly lower accuracy (YOLO, SSD)
   - A separate axis: anchor-based vs anchor-free

2. How NMS (Non-Maximum Suppression) works
   - Sort boxes by confidence
   - Keep the highest-confidence box
   - Suppress remaining boxes that overlap it heavily (IoU > threshold), repeat
   - See the sketch after this list

3. IoU computation
   IoU = intersection / union

4. mAP computation
   - Area under the PR curve
   - Computed per class, then averaged

5. Semantic vs instance vs panoptic segmentation
   - Semantic: per-pixel classification; does not separate objects of the same class
   - Instance: separates individual objects of the same class
   - Panoptic: semantic + instance

6. Common backbones
   - ResNet, VGG
   - CSPDarknet (YOLO)
   - HRNet (maintains high-resolution representations)

7. Handling class imbalance
   - Focal Loss
   - OHEM (Online Hard Example Mining)
   - Class weighting
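
The NMS loop from point 2, as a short sketch reusing the iou helper defined near the top of this page:

python
def nms(boxes, scores, iou_threshold=0.5):
    """Greedy NMS. boxes: list of (x1, y1, x2, y2); scores: list of floats."""
    order = sorted(range(len(boxes)), key=lambda i: scores[i], reverse=True)
    keep = []
    while order:
        best = order.pop(0)
        keep.append(best)
        # Drop remaining boxes that overlap the kept box too much
        order = [i for i in order if iou(boxes[best], boxes[i]) <= iou_threshold]
    return keep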
