目标检测与分割
目标检测概述
目标检测任务:
输入图像 → 检测框 + 类别标签
IoU (Intersection over Union):
┌─────────────┐
│ ┌───┐ │
│ │ A │ │
│ └─┬─┘ │
│ └───┐ │
│ │ B │ │
│ └───┘ │
└─────────────┘
IoU = Area(A∩B) / Area(A∪B)
两阶段检测器 (Two-Stage)
R-CNN 系列
R-CNN: Selective Search → 2000 候选框 → CNN 特征 → SVM 分类
↓
Fast R-CNN: 整图 CNN → ROI Pooling → 全连接 → 分类+回归
↓
Faster R-CNN: RPN (区域提议网络) → ROI Pooling → 分类+回归
Faster R-CNN
python
class FasterRCNN(nn.Module):
def __init__(self, backbone, num_classes):
super().__init__()
self.backbone = backbone # ResNet
self.rpn = RPN() # 区域提议网络
self.roi_pool = ROIAlign(pool_size=7, spatial_scale=1/16)
self.head = nn.Sequential(
nn.Linear(512 * 7 * 7, 4096),
nn.ReLU(),
nn.Linear(4096, 4096),
nn.ReLU()
)
self.cls_head = nn.Linear(4096, num_classes + 1) # +1 background
self.reg_head = nn.Linear(4096, num_classes * 4) # 4 coords per class
def forward(self, x):
# Backbone 特征
features = self.backbone(x)
# RPN 生成提议
proposals, rpn_loss = self.rpn(features, targets)
# ROI Pooling
roi_features = self.roi_pool(features, proposals)
# 分类和回归
cls_scores = self.cls_head(self.head(roi_features))
bbox_pred = self.reg_head(self.head(roi_features))
return cls_scores, bbox_pred, rpn_lossRPN (Region Proposal Network)
python
class RPN(nn.Module):
    """Region Proposal Network.

    Slides a small conv head over backbone features and emits, per cell,
    objectness scores and box deltas for 9 anchors
    (3 scales [128, 256, 512] x 3 aspect ratios [0.5, 1.0, 2.0]).
    """

    def __init__(self, in_channels=1024):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, 512, 3, 1, 1)
        self.cls_logits = nn.Conv2d(512, 18, 1)   # 9 anchors x 2 (object / not object)
        self.bbox_pred = nn.Conv2d(512, 36, 1)    # 9 anchors x 4 (dx, dy, dw, dh)

    def forward(self, features, image_shape):
        hidden = F.relu(self.conv(features))
        objectness = self.cls_logits(hidden)   # (batch, 9*2, H, W)
        deltas = self.bbox_pred(hidden)        # (batch, 9*4, H, W)
        # Decode anchors + deltas into scored proposals.
        # NOTE(review): callers elsewhere in this file unpack
        # (proposals, rpn_loss), but this forward returns only proposals —
        # confirm the intended contract.
        return self.generate_proposals(deltas, objectness)

单阶段检测器 (One-Stage)
YOLO (You Only Look Once)
python
class YOLOv3(nn.Module):
    """YOLOv3 detector: Darknet-53 backbone + detection heads at 3 strides."""

    def __init__(self, num_classes=80):
        super().__init__()
        self.backbone = Darknet53()
        # 3 anchors per scale — the original passed 32, which does not match
        # YOLOv3's 3-anchors-per-cell design.
        # A finer grid (small stride) detects small objects; a coarser grid
        # (large stride) detects large objects — the original comments had
        # this inverted.
        self.yolo1 = YOLOLayer(3, num_classes, stride=8)    # small objects
        self.yolo2 = YOLOLayer(3, num_classes, stride=16)   # medium objects
        self.yolo3 = YOLOLayer(3, num_classes, stride=32)   # large objects

    def forward(self, x):
        # NOTE(review): all three heads consume the same feature map here;
        # a full YOLOv3 feeds each head a different backbone scale — confirm
        # against the real Darknet53 interface.
        feat = self.backbone(x)
        return [self.yolo1(feat), self.yolo2(feat), self.yolo3(feat)]
class YOLOLayer(nn.Module):
    """One YOLO detection head.

    For every anchor, each grid cell predicts
    (tx, ty, tw, th, objectness, class probabilities) = 5 + num_classes values.

    Args:
        num_anchors: anchors per grid cell.
        num_classes: number of object classes.
        stride: downsampling factor of this head's grid (metadata only here).
        in_channels: channels of the incoming feature map (was hard-coded to
            1024; now a backward-compatible parameter).
    """

    def __init__(self, num_anchors, num_classes, stride, in_channels=1024):
        super().__init__()
        self.num_anchors = num_anchors
        self.num_classes = num_classes
        self.stride = stride
        out_channels = num_anchors * (5 + num_classes)
        self.conv = nn.Conv2d(in_channels, out_channels, 1)

    def forward(self, x):
        # x: (batch, in_channels, H, W)
        out = self.conv(x)   # (batch, num_anchors*(5+num_classes), H, W)
        bs, _, h, w = out.shape
        out = out.view(bs, self.num_anchors, -1, h, w)
        # -> (batch, anchors, H, W, 5+classes)
        return out.permute(0, 1, 3, 4, 2)
        return out
YOLO 输出格式
每个格子预测:
┌─────────────────────────────────────┐
│ [tx, ty, tw, th, p0, p1, p2, ...] │
│ 坐标 物体 类别概率 │
│ 中心 概率 │
└─────────────────────────────────────┘
实际边界框:
bx = sigmoid(tx) + cx # 中心 x
by = sigmoid(ty) + cy # 中心 y
bw = pw * exp(tw) # 宽度
bh = ph * exp(th) # 高度
归一化到 [0, 1]
SSD (Single Shot MultiBox Detector)
python
# 多尺度特征图 + 默认框
class SSD300(nn.Module):
def __init__(self, num_classes=21):
super().__init__()
self.base_network = VGG16()
self.extras = ExtraLayers() # 额外特征层
# 多尺度特征图
self.loc = nn.ModuleList([
nn.Conv2d(512, 4*4, 3, 1, 1), # 38×38
nn.Conv2d(1024, 6*4, 3, 1, 1), # 19×19
nn.Conv2d(512, 6*4, 3, 1, 1), # 10×10
nn.Conv2d(256, 6*4, 3, 1, 1), # 5×5
nn.Conv2d(256, 4*4, 3, 1, 1), # 3×3
nn.Conv2d(256, 4*4, 3, 1, 1), # 1×1
])
self.cls = nn.ModuleList([
nn.Conv2d(512, 4*num_classes, 3, 1, 1),
# ...
])Anchor-Free 检测器
CenterNet
python
# 不需要 anchor,直接预测中心点和尺寸
class CenterNet(nn.Module):
def __init__(self, num_classes=80):
super().__init__()
self.backbone = HourglassNetwork()
# 输出 heatmap, size, offset
self.hm_head = nn.Sequential(
nn.Conv2d(64, 256, 3, 1, 1),
nn.ReLU(),
nn.Conv2d(256, num_classes, 1)
) # heatmap: (batch, num_classes, H, W)
self.wh_head = nn.Conv2d(256, 2, 1) # 宽高
self.offset_head = nn.Conv2d(256, 2, 1) # 中心偏移
def forward(self, x):
feat = self.backbone(x)
hm = torch.sigmoid(self.hm_head(feat)) # 中心点 heatmap
wh = self.wh_head(feat) # 宽高
offset = self.offset_head(feat) # 偏移
return hm, wh, offsetFCOS (Fully Convolutional One-Stage)
python
# Per-pixel prediction: each location regresses the distances to the 4 box edges.
class FCOS(nn.Module):
    """FCOS head loop: classification, box regression and center-ness per FPN level.

    Returns three lists (one entry per feature level):
    class logits (N, C, H, W), box distances (N, 4, H, W), center-ness (N, 1, H, W).
    """

    def forward(self, features, targets=None):
        # NOTE(review): the original computed the per-level outputs and then
        # discarded them (no collection, no return); they are now gathered
        # and returned. `targets` also gets a default for inference-only calls.
        cls_outs, reg_outs, center_outs = [], [], []
        for feat in features:
            cls_outs.append(self.cls_head(feat))        # (N, C, H, W)
            reg_outs.append(self.reg_head(feat))        # (N, 4, H, W) l,t,r,b distances
            center_outs.append(self.center_head(feat))  # (N, 1, H, W) center-ness
        return cls_outs, reg_outs, center_outs
        center_pred = self.center_head(feat)  # (N, 1, H, W)
评价指标
python
# mAP (mean Average Precision)
# 1. 对每个类别,计算 AP
# 2. AP = Precision-Recall 曲线下面积
# 3. mAP = 所有类别的平均 AP
# IoU 阈值
# mAP@0.5: IoU>0.5 即为正
# mAP@0.5:0.95: IoU 从 0.5 到 0.95 的平均
# FPS (Frames Per Second)
# 检测速度
语义分割
FCN (Fully Convolutional Network)
python
# FCN: fully-connected layers replaced by convolutions.
class FCN8s(nn.Module):
    """FCN-8s skeleton: VGG-style encoder + learned upsampling, fusing skip
    connections from pool3 / pool4 for 8x-stride predictions."""

    def __init__(self, num_classes=21):
        super().__init__()
        # --- encoder ---
        self.conv1 = nn.Conv2d(3, 64, 7, padding=3)
        self.pool1 = nn.MaxPool2d(2)
        # ... remaining VGG stages elided in this snippet
        # --- decoder ---
        self.upscore2 = nn.ConvTranspose2d(4096, num_classes, 4, 2)          # 2x upsample
        self.upscore3 = nn.ConvTranspose2d(num_classes, num_classes, 4, 2)   # 2x upsample
        # Skip connections projecting shallow features to class scores.
        # VGG16's pool4 output has 512 channels and pool3 has 256; the
        # original used 1024 / 512, which match no VGG stage.
        self.score_pool4 = nn.Conv2d(512, num_classes, 1)
        self.score_pool3 = nn.Conv2d(256, num_classes, 1)

    def forward(self, x):
        h = self.pool1(F.relu(self.conv1(x)))
        # ... rest of the encoder is elided; `pool4` / `pool3` below stand
        # for the intermediate encoder activations captured there.
        # Decoder with skip connections: upsample, add projected skip, repeat.
        h = self.upscore2(h)
        h = h + self.score_pool4(pool4)
        h = self.upscore3(h)
        h = h + self.score_pool3(pool3)
        return h

U-Net
python
class UNet(nn.Module):
def __init__(self, in_channels, out_channels):
super().__init__()
# Encoder
self.enc1 = DoubleConv(in_channels, 64)
self.enc2 = DoubleConv(64, 128)
self.enc3 = DoubleConv(128, 256)
self.enc4 = DoubleConv(256, 512)
# Bottleneck
self.bottleneck = DoubleConv(512, 1024)
# Decoder
self.up4 = nn.ConvTranspose2d(1024, 512, 2, stride=2)
self.dec4 = DoubleConv(1024, 512)
self.up3 = nn.ConvTranspose2d(512, 256, 2, stride=2)
self.dec3 = DoubleConv(512, 256)
# ...
self.out = nn.Conv2d(64, out_channels, 1)
def forward(self, x):
# Encoder
e1 = self.enc1(x)
e2 = self.enc2(F.max_pool2d(e1, 2))
e3 = self.enc3(F.max_pool2d(e2, 2))
e4 = self.enc4(F.max_pool2d(e3, 2))
# Bottleneck
b = self.bottleneck(F.max_pool2d(e4, 2))
# Decoder with skip connections
d4 = self.dec4(torch.cat([self.up4(b), e4], dim=1))
d3 = self.dec3(torch.cat([self.up3(d4), e3], dim=1))
d2 = self.dec2(torch.cat([self.up2(d3), e2], dim=1))
d1 = self.dec1(torch.cat([self.up1(d2), e1], dim=1))
return self.out(d1)DeepLab 系列
python
# DeepLabv3+: ASPP (Atrous Spatial Pyramid Pooling)
class ASPP(nn.Module):
    """Atrous Spatial Pyramid Pooling.

    Branches: one 1x1 conv, one 3x3 atrous conv per rate, and a global
    average-pooling branch; all branch outputs are concatenated and fused
    by a final 1x1 conv. Spatial size is preserved.
    """

    def __init__(self, in_channels, out_channels, atrous_rates=(6, 12, 18)):
        # Tuple default avoids the shared-mutable-default pitfall of [6, 12, 18].
        super().__init__()
        # The original mixed a literal element with a comprehension inside
        # one list display (a SyntaxError); build the branch list explicitly.
        branches = [nn.Conv2d(in_channels, out_channels, 1)]
        branches += [
            nn.Conv2d(in_channels, out_channels, 3, padding=r, dilation=r)
            for r in atrous_rates
        ]
        self.convs = nn.ModuleList(branches)
        # Image-level branch: pool to 1x1, then project to out_channels so
        # every concatenated branch has the same channel count (the original
        # concatenated the raw input and unprojected pooled input, which
        # mismatched the fusion conv's expected channels).
        self.global_pool = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(in_channels, out_channels, 1),
        )
        # Fusion input = 1x1 branch + len(rates) atrous branches + pooled branch.
        self.out = nn.Conv2d(out_channels * (len(atrous_rates) + 2), out_channels, 1)

    def forward(self, x):
        res = [conv(x) for conv in self.convs]
        # Broadcast the 1x1 pooled features back to the input's spatial size.
        res.append(F.interpolate(self.global_pool(x), x.shape[2:]))
        return self.out(torch.cat(res, dim=1))
        return self.out(torch.cat(res, dim=1))
实例分割
Mask R-CNN
python
class MaskRCNN(nn.Module):
    """Mask R-CNN: Faster R-CNN box branch + per-class mask branch.

    Returns a dict with 'cls_scores', 'box_deltas', 'mask_logits', 'rpn_loss'.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.backbone = ResNet()
        self.rpn = RPN()
        self.roi_pool = ROIAlign(pool_size=7, spatial_scale=1/16)
        # Box branch (classification + regression).
        self.box_head = nn.Sequential(
            nn.Linear(256*7*7, 1024),
            nn.ReLU(),
            nn.Linear(1024, 1024),
        )
        self.box_cls = nn.Linear(1024, num_classes)
        self.box_reg = nn.Linear(1024, num_classes*4)
        # Mask branch: upsamples ROI features, one mask channel per class.
        self.mask_head = nn.Sequential(
            nn.ConvTranspose2d(256, 256, 2, 2),
            nn.ConvTranspose2d(256, 128, 2, 2),
            nn.ConvTranspose2d(128, 64, 2, 2),
            nn.ConvTranspose2d(64, 32, 2, 2),
            nn.Conv2d(32, num_classes, 1),   # one mask per class
        )

    def forward(self, x, targets=None):
        features = self.backbone(x)
        proposals, rpn_loss = self.rpn(features, targets)
        roi_features = self.roi_pool(features, proposals)   # (N, 256, 7, 7)
        # The Linear box head expects flattened 256*7*7 vectors; the original
        # passed the 4-D ROI tensor unflattened.
        box_feat = self.box_head(roi_features.flatten(1))
        cls_scores = self.box_cls(box_feat)
        box_deltas = self.box_reg(box_feat)
        # The mask branch consumes the spatial (4-D) ROI features directly.
        mask_logits = self.mask_head(roi_features)
        return {
            'cls_scores': cls_scores,
            'box_deltas': box_deltas,
            'mask_logits': mask_logits,
            'rpn_loss': rpn_loss,
        }

分割评价指标
python
# Pixel Accuracy
pixel_acc = (TP + TN) / (TP + TN + FP + FN)
# Mean IoU
mean_iou = np.mean([iou per class])
# Dice Coefficient (F1)
dice = 2 * |A ∩ B| / (|A| + |B|)
# Boundary IoU (医学图像)
boundary_iou = intersection / union (在边界附近)
# PQ (Panoptic Quality)
# 全景分割: PQ = Σ(TP 匹配对的 IoU) / (|TP| + 0.5|FP| + 0.5|FN|)
模型对比
| 模型 | AP | FPS | 特点 |
|---|---|---|---|
| Faster R-CNN | 高 | 低 | 两阶段,精度高 |
| YOLOv5 | 中 | 高 | 单阶段,平衡 |
| YOLOv8 | 高 | 高 | 最新,改进 |
| RetinaNet | 中 | 中 | Focal Loss 解决类别不平衡 |
| FCOS | 中 | 高 | Anchor-free |
| CenterNet | 中 | 高 | Anchor-free,Center |
面试要点
1. One-Stage vs Two-Stage
- Two-Stage: 精度高,速度慢(R-CNN系列)
- One-Stage: 速度快,精度稍差(YOLO、SSD)
- Anchor-based vs Anchor-free
2. NMS (Non-Maximum Suppression) 原理
- 按置信度排序
- 抑制重叠大的框(IoU > threshold)
- 保留最高置信度的框
3. IoU 计算
IoU = intersection / union
4. mAP 计算
- PR 曲线下面积
- 对每个类别分别计算
5. 语义分割 vs 实例分割 vs 全景分割
- 语义: 像素级分类,不区分同类不同个体
- 实例: 区分同类不同个体
- 全景: 语义 + 实例
6. 常用 Backbone
- ResNet, VGG
- CSPDarknet (YOLO)
- HRNet (保持高分辨率)
7. 类别不平衡解决方案
- Focal Loss
- OHEM (在线困难样本挖掘)
- Class weighting