前言:为什么需要 Object Detection?
34.1 Object Detection 的重要性
车内物体检测在 IMS 中的应用:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
| ┌─────────────────────────────────────────────────────────────────────────┐ │ Object Detection 在 IMS 中的应用 │ ├─────────────────────────────────────────────────────────────────────────┤ │ │ │ 检测目标: │ │ ├── 手机、水杯、行李等车内物品 │ │ ├── 儿童座椅、宠物 │ │ ├── 驾驶员手持物品(电话、吸烟等) │ │ └── 车内异常物体 │ │ │ │ 应用场景: │ │ ├── 危险行为检测(打电话、吸烟) │ │ ├── CPD 儿童检测辅助 │ │ ├── 遗留物检测 │ │ └── 乘员行为分析 │ │ │ └─────────────────────────────────────────────────────────────────────────┘
|
| 特性 |
说明 |
| 模型 |
EfficientDet-Lite |
| 检测范围 |
80 类(COCO 数据集;模型输出为 91 个标签 id,含背景类,与后处理的 num_classes: 91 对应) |
| 速度 |
~20ms (GPU) |
| 模型大小 |
~4MB (Lite0) |
三十五、EfficientDet-Lite 架构
35.1 整体架构
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
| ┌─────────────────────────────────────────────────────────────────────────┐ │ EfficientDet-Lite 架构 │ ├─────────────────────────────────────────────────────────────────────────┤ │ │ │ 输入层 │ │ ┌─────────────────────────────────────────────────────────┐ │ │ │ Input Image │ │ │ │ (320×320 RGB) │ │ │ └─────────────────────────────────────────────────────────┘ │ │ │ │ │ ▼ │ │ Backbone(特征提取) │ │ ┌─────────────────────────────────────────────────────────┐ │ │ │ │ │ │ │ EfficientNet-Lite Backbone │ │ │ │ ├── MBConv blocks │ │ │ │ ├── 多尺度特征输出 │ │ │ │ └── 专为移动端优化 │ │ │ │ │ │ │ │ 输出特征层: │ │ │ │ ├── P3: 40×40×72 │ │ │ │ ├── P4: 20×20×120 │ │ │ │ └── P5: 10×10×240 │ │ │ │ │ │ │ └─────────────────────────────────────────────────────────┘ │ │ │ │ │ ▼ │ │ BiFPN(特征融合) │ │ ┌─────────────────────────────────────────────────────────┐ │ │ │ │ │ │ │ Bi-directional Feature Pyramid Network │ │ │ │ ├── 自顶向下路径 │ │ │ │ ├── 自底向上路径 │ │ │ │ └── 加权特征融合 │ │ │ │ │ │ │ └─────────────────────────────────────────────────────────┘ │ │ │ │ │ ▼ │ │ 检测头 │ │ ┌─────────────────────────────────────────────────────────┐ │ │ │ │ │ │ │ Class Head: 预测类别 │ │ │ │ Box Head: 预测边界框 │ │ │ │ │ │ │ │ 输出: │ │ │ │ ├── 类别: 91 类 (COCO) │ │ │ │ └── 边界框: (x, y, w, h) × anchors │ │ │ │ │ │ │ └─────────────────────────────────────────────────────────┘ │ │ │ └─────────────────────────────────────────────────────────────────────────┘
|
35.2 BiFPN 特征融合
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
|
class BiFPNFusion { public: std::vector<Tensor> Fuse(const std::vector<Tensor>& features) { Tensor P5_td = features[2]; Tensor P4_td = WeightedFuse({features[1], Resize(P5_td, features[1].size())}); Tensor P3_td = WeightedFuse({features[0], Resize(P4_td, features[0].size())}); Tensor P3_out = P3_td; Tensor P4_out = WeightedFuse({features[1], P4_td, Resize(P3_out, features[1].size())}); Tensor P5_out = WeightedFuse({features[2], P5_td, Resize(P4_out, features[2].size())}); return {P3_out, P4_out, P5_out}; } private: Tensor WeightedFuse(const std::vector<Tensor>& tensors) { std::vector<float> weights; for (size_t i = 0; i < tensors.size(); ++i) { weights.push_back(learnable_weights_[i]); } for (auto& w : weights) { w = std::max(0.0f, w); } float sum = 0.0f; for (const auto& w : weights) { sum += w; } Tensor output; for (size_t i = 0; i < tensors.size(); ++i) { output += tensors[i] * (weights[i] / (sum + 1e-4f)); } return output; } std::vector<float> learnable_weights_; };
|
三十六、Graph 配置
36.1 完整 Object Detection Graph
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
# object_detection_graph.pbtxt
# MediaPipe graph: EfficientDet-Lite object detection.
# Flow: resize -> image-to-tensor -> TFLite inference (GPU) -> postprocess.

input_stream: "IMAGE:image"
output_stream: "DETECTIONS:detections"

# ========== 1. Image preprocessing ==========
# Resize the camera frame to the model's 320x320 input; FIT preserves the
# aspect ratio (letterboxing).
node {
  calculator: "ImageTransformationCalculator"
  input_stream: "IMAGE:image"
  output_stream: "IMAGE:resized_image"
  options {
    [mediapipe.ImageTransformationCalculatorOptions.ext] {
      output_width: 320
      output_height: 320
      scale_mode: FIT
    }
  }
}

# ========== 2. Convert to tensor ==========
# Pack the resized RGB image into a 320x320x3 float tensor scaled to [0, 1].
node {
  calculator: "ImageToTensorCalculator"
  input_stream: "IMAGE:resized_image"
  output_stream: "TENSORS:input_tensors"
  options {
    [mediapipe.ImageToTensorCalculatorOptions.ext] {
      tensor_width: 320
      tensor_height: 320
      tensor_channels: 3
      tensor_float_range { min: 0.0 max: 1.0 }
    }
  }
}

# ========== 3. Model inference ==========
# Run EfficientDet-Lite0 through the TFLite GPU delegate.
node {
  calculator: "TfLiteInferenceCalculator"
  input_stream: "TENSORS:input_tensors"
  output_stream: "TENSORS:output_tensors"
  options {
    [mediapipe.TfLiteInferenceCalculatorOptions.ext] {
      model_path: "/models/efficientdet_lite0.tflite"
      delegate { gpu { use_advanced_gpu_api: true } }
    }
  }
}

# ========== 4. Postprocessing ==========
# Decode the raw tensors into Detection protos: score filter (>= 0.5),
# keep at most 10 results, 91 COCO label ids.
# NOTE(review): this node consumes ORIGINAL_IMAGE_SIZE:image_size, but no
# visible node produces an "image_size" stream — confirm it is generated
# upstream (e.g. by an ImagePropertiesCalculator).
node {
  calculator: "ObjectDetectionPostprocessorCalculator"
  input_stream: "TENSORS:output_tensors"
  input_stream: "ORIGINAL_IMAGE_SIZE:image_size"
  output_stream: "DETECTIONS:detections"
  options {
    [mediapipe.ObjectDetectionOptions.ext] {
      score_threshold: 0.5
      max_results: 10
      num_classes: 91
    }
  }
}
|
36.2 后处理 Calculator
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94
|
#include "mediapipe/framework/calculator_framework.h" #include "mediapipe/framework/formats/detection.pb.h"
namespace mediapipe {

// Converts raw TFLite SSD-style detection output tensors into MediaPipe
// Detection protos.
//
// Inputs:
//   TENSORS             - std::vector<TfLiteTensor> in the standard TFLite
//                         detection-postprocess layout:
//                           [0] boxes   [N, 4] as (ymin, xmin, ymax, xmax),
//                               normalized to [0, 1]
//                           [1] classes [N]
//                           [2] scores  [N]
//                           [3] count   [1] (number of valid boxes)
//   ORIGINAL_IMAGE_SIZE - std::pair<int, int> of the source frame (unused
//                         here; boxes are emitted in normalized coordinates).
// Outputs:
//   DETECTIONS - std::vector<Detection>: score-filtered, sorted by score
//                descending, truncated to max_results.
class ObjectDetectionPostprocessorCalculator : public CalculatorBase {
 public:
  static absl::Status GetContract(CalculatorContract* cc) {
    cc->Inputs().Tag("TENSORS").Set<std::vector<TfLiteTensor>>();
    cc->Inputs().Tag("ORIGINAL_IMAGE_SIZE").Set<std::pair<int, int>>();
    cc->Outputs().Tag("DETECTIONS").Set<std::vector<Detection>>();
    cc->Options<ObjectDetectionOptions>();
    return absl::OkStatus();
  }

  // Caches the configured thresholds.
  // NOTE(review): the graph also sets num_classes, which is never read here —
  // confirm whether it is consumed elsewhere or can be dropped.
  absl::Status Open(CalculatorContext* cc) override {
    const auto& options = cc->Options<ObjectDetectionOptions>();
    score_threshold_ = options.score_threshold();
    max_results_ = options.max_results();
    return absl::OkStatus();
  }

  absl::Status Process(CalculatorContext* cc) override {
    const auto& tensors =
        cc->Inputs().Tag("TENSORS").Get<std::vector<TfLiteTensor>>();
    // BUGFIX: validate the tensor layout before dereferencing — the original
    // read tensors[0..3] unconditionally, which is UB on a malformed model.
    if (tensors.size() < 4) {
      return absl::InvalidArgumentError(
          "Expected 4 output tensors (boxes, classes, scores, count).");
    }
    const auto& [img_width, img_height] =
        cc->Inputs().Tag("ORIGINAL_IMAGE_SIZE").Get<std::pair<int, int>>();
    (void)img_width;   // Boxes stay normalized; the frame size is kept for
    (void)img_height;  // future use (silences unused-variable warnings).

    const float* locations = tensors[0].data.f;
    const float* classes = tensors[1].data.f;
    const float* scores = tensors[2].data.f;
    int num_boxes = static_cast<int>(tensors[3].data.f[0]);

    auto detections = std::make_unique<std::vector<Detection>>();
    for (int i = 0; i < num_boxes; ++i) {
      float score = scores[i];
      if (score < score_threshold_) {
        continue;  // Drop low-confidence boxes early.
      }
      int class_id = static_cast<int>(classes[i]);

      Detection detection;
      detection.set_score(score);
      detection.set_label_id(class_id);

      // TFLite detection boxes are ordered (ymin, xmin, ymax, xmax).
      // NOTE(review): upstream MediaPipe's RelativeBoundingBox exposes only
      // (xmin, ymin, width, height); confirm this proto variant actually
      // provides xmax/ymax setters.
      auto* bbox =
          detection.mutable_location_data()->mutable_relative_bounding_box();
      bbox->set_ymin(locations[i * 4 + 0]);
      bbox->set_xmin(locations[i * 4 + 1]);
      bbox->set_ymax(locations[i * 4 + 2]);
      bbox->set_xmax(locations[i * 4 + 3]);
      bbox->set_width(bbox->xmax() - bbox->xmin());
      bbox->set_height(bbox->ymax() - bbox->ymin());

      detections->push_back(detection);
    }

    // Highest-confidence detections first.
    std::sort(detections->begin(), detections->end(),
              [](const Detection& a, const Detection& b) {
                return a.score() > b.score();
              });
    // BUGFIX: compare like-signed types — the original compared size_t with
    // int, which misbehaves if max_results were ever negative.
    if (max_results_ >= 0 &&
        detections->size() > static_cast<size_t>(max_results_)) {
      detections->resize(max_results_);
    }

    cc->Outputs().Tag("DETECTIONS").Add(detections.release(),
                                        cc->InputTimestamp());
    return absl::OkStatus();
  }

 private:
  float score_threshold_ = 0.5f;  // Minimum score required to keep a box.
  int max_results_ = 10;          // Cap on emitted detections per frame.
};

REGISTER_CALCULATOR(ObjectDetectionPostprocessorCalculator);

}  // namespace mediapipe
|
三十七、IMS 实战:车内物体检测
37.1 危险行为检测 Graph
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
# ims_dangerous_behavior_detection.pbtxt
# IMS graph: dangerous-driver-behavior detection (phone call / smoking).
# Fuses object detection with Face Mesh and Hand Tracking, then decides.

input_stream: "RGB_IMAGE:rgb_image"
output_stream: "BEHAVIOR_RESULT:behavior_result"
output_stream: "ALERT:alert"

# ========== 1. Object Detection ==========
node {
  calculator: "ObjectDetectionGpu"
  input_stream: "IMAGE:rgb_image"
  output_stream: "DETECTIONS:detections"
}

# ========== 2. Dangerous object filtering ==========
# Keep only the configured COCO classes above the confidence floor.
node {
  calculator: "DangerousObjectFilterCalculator"
  input_stream: "DETECTIONS:detections"
  output_stream: "DANGEROUS_OBJECTS:dangerous_objects"
  options {
    [mediapipe.DangerousObjectFilterOptions.ext] {
      dangerous_classes: [67, 73]  # cell phone, laptop
      min_confidence: 0.6
    }
  }
}

# ========== 3. Face Mesh (to judge whether the driver is holding it) ==========
node {
  calculator: "FaceMeshGpu"
  input_stream: "IMAGE:rgb_image"
  output_stream: "LANDMARKS:face_landmarks"
}

# ========== 4. Hand Tracking ==========
node {
  calculator: "HandTrackingGpu"
  input_stream: "IMAGE:rgb_image"
  output_stream: "LANDMARKS:hand_landmarks"
}

# ========== 5. Behavior decision ==========
# Combines the object and landmark streams into a BehaviorResult plus an
# alert flag; thresholds are normalized-image distances.
node {
  calculator: "DangerousBehaviorDecisionCalculator"
  input_stream: "DANGEROUS_OBJECTS:dangerous_objects"
  input_stream: "FACE_LANDMARKS:face_landmarks"
  input_stream: "HAND_LANDMARKS:hand_landmarks"
  output_stream: "BEHAVIOR_RESULT:behavior_result"
  output_stream: "ALERT:alert"
  options {
    [mediapipe.DangerousBehaviorDecisionOptions.ext] {
      phone_to_ear_threshold: 0.3
      hand_near_mouth_threshold: 0.2
    }
  }
}
|
37.2 危险行为判断 Calculator
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128
|
namespace mediapipe {
class DangerousBehaviorDecisionCalculator : public CalculatorBase { public: static absl::Status GetContract(CalculatorContract* cc) { cc->Inputs().Tag("DANGEROUS_OBJECTS").Set<std::vector<Detection>>(); cc->Inputs().Tag("FACE_LANDMARKS").Set<std::vector<NormalizedLandmarkList>>(); cc->Inputs().Tag("HAND_LANDMARKS").Set<std::vector<NormalizedLandmarkList>>(); cc->Outputs().Tag("BEHAVIOR_RESULT").Set<BehaviorResult>(); cc->Outputs().Tag("ALERT").Set<bool>(); return absl::OkStatus(); }
absl::Status Process(CalculatorContext* cc) override { BehaviorResult result; bool alert = false; if (!cc->Inputs().Tag("DANGEROUS_OBJECTS").IsEmpty()) { const auto& objects = cc->Inputs().Tag("DANGEROUS_OBJECTS").Get<std::vector<Detection>>(); for (const auto& obj : objects) { if (obj.label_id() == 67) { result.set_phone_detected(true); if (IsPhoneNearEar(obj, cc)) { result.set_calling_detected(true); alert = true; } } } } if (!cc->Inputs().Tag("HAND_LANDMARKS").IsEmpty()) { const auto& hands = cc->Inputs().Tag("HAND_LANDMARKS").Get<std::vector<NormalizedLandmarkList>>(); if (!hands.empty()) { if (IsSmokingGesture(hands[0])) { result.set_smoking_detected(true); alert = true; } } } cc->Outputs().Tag("BEHAVIOR_RESULT").AddPacket( MakePacket<BehaviorResult>(result).At(cc->InputTimestamp())); cc->Outputs().Tag("ALERT").AddPacket( MakePacket<bool>(alert).At(cc->InputTimestamp())); return absl::OkStatus(); }
private: bool IsPhoneNearEar(const Detection& phone, CalculatorContext* cc) { if (cc->Inputs().Tag("FACE_LANDMARKS").IsEmpty()) { return false; } const auto& faces = cc->Inputs().Tag("FACE_LANDMARKS").Get<std::vector<NormalizedLandmarkList>>(); if (faces.empty()) { return false; } const auto& face = faces[0]; float left_ear_x = face.landmark(234).x(); float left_ear_y = face.landmark(234).y(); float right_ear_x = face.landmark(454).x(); float right_ear_y = face.landmark(454).y(); float phone_x = phone.location_data().relative_bounding_box().xmin() + phone.location_data().relative_bounding_box().width() / 2; float phone_y = phone.location_data().relative_bounding_box().ymin() + phone.location_data().relative_bounding_box().height() / 2; float dist_left = std::sqrt(std::pow(phone_x - left_ear_x, 2) + std::pow(phone_y - left_ear_y, 2)); float dist_right = std::sqrt(std::pow(phone_x - right_ear_x, 2) + std::pow(phone_y - right_ear_y, 2)); float min_dist = std::min(dist_left, dist_right); return min_dist < phone_to_ear_threshold_; } bool IsSmokingGesture(const NormalizedLandmarkList& hand) { float index_tip_y = hand.landmark(8).y(); float index_mcp_y = hand.landmark(5).y(); bool index_bent = index_tip_y > index_mcp_y; float middle_tip_y = hand.landmark(12).y(); float middle_mcp_y = hand.landmark(9).y(); bool middle_bent = middle_tip_y > middle_mcp_y; float thumb_x = hand.landmark(4).x(); float thumb_y = hand.landmark(4).y(); float index_tip_x = hand.landmark(8).x(); float dist = std::sqrt(std::pow(thumb_x - index_tip_x, 2) + std::pow(thumb_y - index_tip_y, 2)); return index_bent && middle_bent && dist < 0.1f; } float phone_to_ear_threshold_ = 0.3f; };
REGISTER_CALCULATOR(DangerousBehaviorDecisionCalculator);
}
|
三十八、总结
| 要点 |
说明 |
| 模型 |
EfficientDet-Lite 轻量级检测器 |
| 特征融合 |
BiFPN 双向加权融合 |
| 检测类别 |
80 类 COCO |
| IMS 应用 |
危险行为检测、车内物品检测 |
下篇预告
MediaPipe 系列 35:Image Segmentation——语义分割
深入讲解图像语义分割、DeepLab 架构、IMS 乘员分割应用。
系列进度: 34/55
更新时间: 2026-03-12