一、高通 AI 平台概述
1.1 高通 AI 芯片系列
| 芯片系列 | 典型型号 | NPU 算力 | 应用场景 |
| --- | --- | --- | --- |
| 骁龙座舱 | SA8295P | 30+ TOPS | 高端车载 IVI/IMS |
| 骁龙 Ride | SA8620P | 100+ TOPS | L2+ 辅助驾驶 |
| 骁龙汽车 | SA6155P | 10+ TOPS | 中端车载 |
| 移动 SoC | SM8550 | 15+ TOPS | 手机/平板 |
1.2 QNN 架构
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
| ┌─────────────────────────────────────────────────────────────────────────┐ │ QNN (Qualcomm Neural Network) 架构 │ ├─────────────────────────────────────────────────────────────────────────┤ │ │ │ 应用层 │ │ ┌─────────────────────────────────────────────────────────┐ │ │ │ SNPE (Snapdragon Neural Processing Engine) │ │ │ │ ├── 高级 API │ │ │ │ ├── 模型加载、推理、后处理 │ │ │ │ └── 支持 TFLite、ONNX、Caffe 模型 │ │ │ └─────────────────────────────────────────────────────────┘ │ │ │ │ │ ▼ │ │ 框架层 │ │ ┌─────────────────────────────────────────────────────────┐ │ │ │ QNN Runtime │ │ │ │ ├── Graph 管理 │ │ │ │ ├── 内存管理 │ │ │ │ └── Backend 选择 (CPU/GPU/DSP/NPU) │ │ │ └─────────────────────────────────────────────────────────┘ │ │ │ │ │ ▼ │ │ 后端层 │ │ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ │ │ HTP │ │ GPU │ │ DSP │ │ CPU │ │ │ │ (NPU) │ │ │ │ │ │ │ │ │ │ 最快 │ │ 较快 │ │ 中等 │ │ 兼容 │ │ │ └─────────┘ └─────────┘ └─────────┘ └─────────┘ │ │ │ │ 硬件层 │ │ ┌─────────────────────────────────────────────────────────┐ │ │ │ Hexagon DSP | Adreno GPU | Hexagon NPU (HTP) │ │ │ └─────────────────────────────────────────────────────────┘ │ │ │ └─────────────────────────────────────────────────────────────────────────┘
|
二、模型转换流程
2.1 TFLite → DLC 转换
SNPE 使用 DLC (Deep Learning Container) 格式:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
|
# Unpack the SNPE SDK and point SNPE_ROOT at it.
unzip snpe-2.x.x.zip
export SNPE_ROOT=/path/to/snpe-2.x.x

# Convert the TFLite model to DLC.
# Fix: the input is a .tflite file, so the TFLite converter
# (snpe-tflite-to-dlc) must be used — the original invoked the PyTorch
# converter (snpe-pytorch-to-dlc), which cannot consume TFLite models.
# This also matches the deploy script later in this article.
$SNPE_ROOT/bin/x86_64-linux-clang/snpe-tflite-to-dlc \
    --input_network blazeface.tflite \
    --input_dim input 1,128,128,3 \
    --output_path blazeface.dlc

# Post-training INT8 quantization driven by a calibration input list.
$SNPE_ROOT/bin/x86_64-linux-clang/snpe-dlc-quantize \
    --input_dlc blazeface.dlc \
    --input_list input_list.txt \
    --output_dlc blazeface_quant.dlc
|
2.2 QNN 模型编译
1 2 3 4 5
# Pre-compile the quantized model into a QNN context binary for the HTP
# backend; loading a context binary at runtime avoids on-device graph prepare.
# NOTE(review): qnn-context-binary-generator normally consumes a QNN model
# library (libQnn*.so produced by qnn-model-lib-generator); confirm the SDK
# version in use accepts a DLC directly via --model.
$QNN_SDK_ROOT/bin/x86_64-linux-clang/qnn-context-binary-generator \
    --model blazeface_quant.dlc \
    --backend libQnnHtp.so \
    --output blazeface.bin
|
3.1 Calculator 定义
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
|
#ifndef MEDIAPIPE_CALCULATORS_QNN_QNN_INFERENCE_CALCULATOR_H_
#define MEDIAPIPE_CALCULATORS_QNN_QNN_INFERENCE_CALCULATOR_H_

#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/formats/tensor.h"
#include "DlSystem/ITensor.hpp"
#include "DlSystem/TensorMap.hpp"
#include "SNPE/SNPE.hpp"
#include "SNPE/SNPEFactory.hpp"

namespace mediapipe {

// MediaPipe calculator that runs a SNPE (Qualcomm Neural Processing Engine)
// model loaded from a DLC file. Consumes a "TENSORS" stream of
// std::vector<Tensor> and produces a "TENSORS" stream with the inference
// result. The runtime backend (CPU/GPU/DSP/HTP) is chosen via options.
class QnnInferenceCalculator : public CalculatorBase {
 public:
  static absl::Status GetContract(CalculatorContract* cc);
  absl::Status Open(CalculatorContext* cc) override;
  absl::Status Process(CalculatorContext* cc) override;
  absl::Status Close(CalculatorContext* cc) override;

 private:
  // Loads the DLC at `model_path` and builds the SNPE engine into `snpe_`.
  absl::Status InitializeSnpe(const std::string& model_path);
  // Maps a backend name onto a SNPE runtime selection.
  absl::Status SetRuntimeBackend(const std::string& backend);
  // Copies `input` into SNPE, executes, and writes the result into `output`.
  absl::Status RunInference(const Tensor& input, Tensor* output);

  std::unique_ptr<zdl::SNPE::SNPE> snpe_;  // owned SNPE inference engine
  // NOTE(review): input_tensor_/output_tensor_map_ are not referenced by the
  // implementation shown in this article — possibly reserved for buffer reuse.
  std::unique_ptr<zdl::DlSystem::ITensor> input_tensor_;
  zdl::DlSystem::TensorMap output_tensor_map_;
  std::string runtime_backend_ = "HTP";  // default: Hexagon Tensor Processor
  bool use_quantized_ = true;            // expect an INT8-quantized DLC
};

}  // namespace mediapipe

#endif  // MEDIAPIPE_CALCULATORS_QNN_QNN_INFERENCE_CALCULATOR_H_
|
3.2 Calculator 实现
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166
|
#include "mediapipe/calculators/qnn/qnn_inference_calculator.h" #include "DlSystem/StringList.hpp" #include "DlSystem/UserBufferMap.hpp"
namespace mediapipe {
using zdl::DlSystem::ITensor; using zdl::DlSystem::TensorMap; using zdl::SNPE::SNPE; using zdl::SNPE::SNPEFactory;
// Declares the calculator's I/O contract: one "TENSORS" input stream and one
// "TENSORS" output stream, both carrying std::vector<Tensor> packets.
absl::Status QnnInferenceCalculator::GetContract(CalculatorContract* cc) {
  static constexpr char kTensorsTag[] = "TENSORS";
  cc->Inputs().Tag(kTensorsTag).Set<std::vector<Tensor>>();
  cc->Outputs().Tag(kTensorsTag).Set<std::vector<Tensor>>();
  // Touch the options proto so the framework validates its type up front.
  cc->Options<QnnInferenceOptions>();
  return absl::OkStatus();
}
// Reads the calculator options (backend, quantization flag, model path) and
// builds the SNPE network before the first Process() call.
absl::Status QnnInferenceCalculator::Open(CalculatorContext* cc) {
  const auto& opts = cc->Options<QnnInferenceOptions>();
  runtime_backend_ = opts.runtime_backend();
  use_quantized_ = opts.use_quantized();
  MP_RETURN_IF_ERROR(InitializeSnpe(opts.model_path()));
  LOG(INFO) << "QnnInferenceCalculator initialized with backend: "
            << runtime_backend_;
  return absl::OkStatus();
}
// Loads the DLC container at `model_path`, selects the runtime backend, and
// builds the SNPE inference engine into `snpe_`.
//
// Fix: `runtime` was previously left uninitialized when `runtime_backend_`
// matched none of the known names, so an indeterminate value was passed to
// setRuntimeProcessor() (undefined behavior). Unknown backends now fail fast
// with InvalidArgumentError.
absl::Status QnnInferenceCalculator::InitializeSnpe(const std::string& model_path) {
  std::unique_ptr<zdl::DlContainer::IDlContainer> container =
      zdl::DlContainer::IDlContainer::open(model_path);
  if (!container) {
    return absl::InternalError("Failed to load DLC: " + model_path);
  }

  zdl::SNPE::SNPEBuilder builder(container.get());

  // Map the configured backend name onto the SNPE runtime enum.
  zdl::DlSystem::Runtime_t runtime;
  if (runtime_backend_ == "CPU") {
    runtime = zdl::DlSystem::Runtime_t::CPU;
  } else if (runtime_backend_ == "GPU") {
    runtime = zdl::DlSystem::Runtime_t::GPU;
  } else if (runtime_backend_ == "DSP") {
    runtime = zdl::DlSystem::Runtime_t::DSP;
  } else if (runtime_backend_ == "HTP") {
    // NOTE(review): some SNPE releases expose the HTP only through
    // Runtime_t::DSP rather than a dedicated enumerator — confirm against
    // the SDK version in use.
    runtime = zdl::DlSystem::Runtime_t::HTP;
  } else {
    return absl::InvalidArgumentError(
        "Unknown runtime backend: " + runtime_backend_);
  }
  builder.setRuntimeProcessor(runtime);

  // BURST trades power for lowest latency. It is gated on use_quantized_
  // here, although performance profile and quantization are independent.
  if (use_quantized_) {
    builder.setPerformanceProfile(zdl::DlSystem::PerformanceProfile_t::BURST);
  }

  // Request the layer named "output" as the network output.
  zdl::DlSystem::StringList output_layers;
  output_layers.append("output");
  builder.setOutputLayers(output_layers);

  snpe_ = builder.build();
  if (!snpe_) {
    return absl::InternalError("Failed to build SNPE");
  }
  return absl::OkStatus();
}
// Per-packet entry point: takes the first tensor from the "TENSORS" input
// vector, runs SNPE inference on it, and emits a one-element output tensor
// vector at the input timestamp.
absl::Status QnnInferenceCalculator::Process(CalculatorContext* cc) {
  // No packet at this timestamp: nothing to do.
  if (cc->Inputs().Tag("TENSORS").IsEmpty()) {
    return absl::OkStatus();
  }
  const auto& input_tensors =
      cc->Inputs().Tag("TENSORS").Get<std::vector<Tensor>>();
  if (input_tensors.empty()) {
    return absl::OkStatus();
  }
  const Tensor& input_tensor = input_tensors[0];

  auto output_tensors = absl::make_unique<std::vector<Tensor>>();
  // NOTE(review): the output shape {1, 896, 16} is hard-coded for the
  // BlazeFace detector (896 anchors x 16 values); other models need a
  // different — ideally model-derived — shape.
  Tensor* output_tensor = &output_tensors->emplace_back(
      Tensor::ElementType::kFloat32, Tensor::Shape{1, 896, 16});
  MP_RETURN_IF_ERROR(RunInference(input_tensor, output_tensor));
  // Add() takes ownership of the released vector.
  cc->Outputs().Tag("TENSORS").Add(output_tensors.release(),
                                   cc->InputTimestamp());
  return absl::OkStatus();
}
// Copies `input` into a freshly created SNPE ITensor, executes the network,
// and copies the first output tensor back into `output`.
//
// NOTE(review): neither the element count of `input` vs. the model's input
// dimensions nor the SNPE output size vs. `output` is validated before the
// memcpy calls — a mismatch would over- or under-copy. Size checks are
// worth adding.
absl::Status QnnInferenceCalculator::RunInference(const Tensor& input, Tensor* output) {
  auto input_tensor_map = std::unique_ptr<TensorMap>(new TensorMap());
  // Use the model's first declared input name and its dimensions.
  auto input_names = snpe_->getInputTensorNames();
  auto& input_name = *input_names.begin();
  auto input_shape = snpe_->getInputDimensions(input_name);
  auto input_itensor = std::unique_ptr<ITensor>(
      SNPEFactory::createITensor(input_shape));

  // Stage the MediaPipe tensor's float data into the SNPE input tensor.
  float* input_data =
      reinterpret_cast<float*>(input_itensor->getDataPointer());
  const float* src_data = input.GetCpuReadView().buffer<float>();
  size_t input_size = input.shape().num_elements();
  std::memcpy(input_data, src_data, input_size * sizeof(float));
  input_tensor_map->add(input_name.c_str(), input_itensor.get());

  auto output_tensor_map = std::unique_ptr<TensorMap>(new TensorMap());
  // NOTE(review): confirm this SNPE version's execute() overload takes
  // TensorMap pointers; several releases take (const TensorMap&, TensorMap&).
  bool success =
      snpe_->execute(input_tensor_map.get(), output_tensor_map.get());
  if (!success) {
    return absl::InternalError("SNPE inference failed");
  }

  // Read back the model's first output tensor into `output`.
  auto output_names = snpe_->getOutputTensorNames();
  auto& output_name = *output_names.begin();
  auto output_itensor = output_tensor_map->getTensor(output_name.c_str());
  const float* output_data =
      reinterpret_cast<const float*>(output_itensor->getDataPointer());
  float* dst_data = output->GetCpuWriteView().buffer<float>();
  size_t output_size = output->shape().num_elements();
  std::memcpy(dst_data, output_data, output_size * sizeof(float));
  return absl::OkStatus();
}
// Releases the SNPE engine when the graph shuts down.
absl::Status QnnInferenceCalculator::Close(CalculatorContext* cc) {
  snpe_ = nullptr;  // equivalent to reset(): destroys the owned engine
  return absl::OkStatus();
}
REGISTER_CALCULATOR(QnnInferenceCalculator);
}
|
四、Graph 配置
4.1 QNN Face Detection Graph
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
# mediapipe/graphs/ims/face_detection_qnn.pbtxt
# Face detection pipeline: image -> tensor -> QNN inference -> detections.

input_stream: "IMAGE:image"
output_stream: "DETECTIONS:detections"

# 1. Image preprocessing: resize to 128x128, 3 channels, floats in [-1, 1].
node {
  calculator: "ImageToTensorCalculator"
  input_stream: "IMAGE:image"
  output_stream: "TENSORS:input_tensors"
  options {
    [mediapipe.ImageToTensorCalculatorOptions.ext] {
      tensor_width: 128
      tensor_height: 128
      tensor_channels: 3
      tensor_float_range { min: -1.0 max: 1.0 }
    }
  }
}

# 2. QNN inference: run the quantized BlazeFace DLC on the HTP (NPU) backend.
node {
  calculator: "QnnInferenceCalculator"
  input_stream: "TENSORS:input_tensors"
  output_stream: "TENSORS:output_tensors"
  options {
    [mediapipe.QnnInferenceOptions.ext] {
      model_path: "/data/models/blazeface_quant.dlc"
      runtime_backend: HTP
      use_quantized: true
    }
  }
}

# 3. Postprocessing: decode raw tensors into detections (score filter + NMS).
node {
  calculator: "BlazeFacePostprocessorCalculator"
  input_stream: "TENSORS:output_tensors"
  output_stream: "DETECTIONS:detections"
  options {
    [mediapipe.BlazeFaceOptions.ext] {
      score_threshold: 0.5
      min_suppression_threshold: 0.3
    }
  }
}
|
五、性能优化
5.1 量化优化
5.2 内存优化
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
|
int ion_fd = open("/dev/ion", O_RDWR); struct ion_allocation_data alloc = { .len = buffer_size, .heap_id_mask = ION_HEAP_SYSTEM_MASK, .flags = ION_FLAG_CACHED, }; ioctl(ion_fd, ION_IOC_ALLOC, &alloc);
void* buffer = mmap(NULL, buffer_size, PROT_READ | PROT_WRITE, MAP_SHARED, ion_fd, alloc.handle);
|
5.3 多模型并行
1 2 3 4 5 6 7 8 9 10 11 12 13 14
|
// Run two models concurrently on different accelerators so they do not
// contend for the same compute unit.

// DMS (driver monitoring) face model on the NPU (HTP).
snpe_face_->setRuntimeProcessor(zdl::DlSystem::Runtime_t::HTP);
// OMS (occupant monitoring) model on the GPU.
snpe_occupant_->setRuntimeProcessor(zdl::DlSystem::Runtime_t::GPU);

// One thread per model; each executes its own SNPE instance.
// NOTE(review): the lambdas capture by reference — both SNPE objects must
// outlive the threads.
std::thread dms_thread([&]() { snpe_face_->execute(...); });
std::thread oms_thread([&]() { snpe_occupant_->execute(...); });

// Wait for both inferences to finish.
dms_thread.join();
oms_thread.join();
|
六、调试与测试
6.1 性能分析工具
1 2 3 4 5 6 7 8 9
# Benchmark the quantized DLC and collect detailed per-layer profiling data.
snpe-throughput-net-run \
    --container blazeface_quant.dlc \
    --input_list input_list.txt \
    --profiling_level detailed \
    --output_dir profile_results

# Decode the binary profiling log into a human-readable report.
cat profile_results/profile.bin | snpe-profiler-analyzer
|
6.2 常见问题排查
| 问题 | 可能原因 | 解决方案 |
| --- | --- | --- |
| 初始化失败 | DLC 版本不匹配 | 重新编译模型 |
| 推理结果错误 | 量化精度损失 | 调整量化策略 |
| 内存不足 | 模型过大 | 模型剪枝/量化 |
| HTP 不可用 | 驱动未加载 | 检查 FastRPC |
七、总结
| 要点 | 说明 |
| --- | --- |
| SNPE | 高级 API,支持多种模型格式 |
| QNN | 底层 Runtime,更高效 |
| HTP | 高通 NPU,最佳性能 |
| 量化 | INT8 量化可大幅提升性能 |
八、部署脚本
8.1 完整部署脚本
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
#!/bin/bash
# Deployment script: convert TFLite models to quantized DLC, generate QNN
# context binaries for the HTP backend, and push everything to the device.
set -e

SNPE_ROOT="/opt/qcom/snpe-2.7.0"
QNN_ROOT="/opt/qcom/qnn-2.7.0"
TARGET_DEVICE="sa8295"

MODEL_DIR="models"
OUTPUT_DIR="deploy"

export SNPE_ROOT
export QNN_ROOT
export LD_LIBRARY_PATH="$SNPE_ROOT/lib/x86_64-linux-clang:$LD_LIBRARY_PATH"

# Fix: the original never created $OUTPUT_DIR, so the first conversion would
# fail writing into a missing directory.
mkdir -p "$OUTPUT_DIR"

echo "=== SNPE/QNN 环境设置完成 ==="

# Convert one TFLite model to DLC and produce an INT8-quantized copy.
#   $1 - model base name (without .tflite)
#   $2 - input shape, e.g. "1,128,128,3"
convert_model() {
    local model_name=$1
    local input_shape=$2
    echo "转换模型: $model_name"
    "$SNPE_ROOT/bin/x86_64-linux-clang/snpe-tflite-to-dlc" \
        --input_network "$MODEL_DIR/${model_name}.tflite" \
        --input_dim input "$input_shape" \
        --output_path "$OUTPUT_DIR/${model_name}.dlc"
    "$SNPE_ROOT/bin/x86_64-linux-clang/snpe-dlc-quantize" \
        --input_dlc "$OUTPUT_DIR/${model_name}.dlc" \
        --input_list "$MODEL_DIR/input_list.txt" \
        --output_dlc "$OUTPUT_DIR/${model_name}_quant.dlc"
    echo "模型转换完成: $model_name"
}

convert_model "blazeface" "1,128,128,3"
convert_model "face_landmark" "1,192,192,3"
convert_model "iris_landmark" "1,64,64,3"

# Pre-compile a quantized DLC into a QNN context binary for the HTP backend.
#   $1 - model base name
generate_qnn_binary() {
    local model_name=$1
    echo "生成 QNN Binary: $model_name"
    "$QNN_ROOT/bin/x86_64-linux-clang/qnn-context-binary-generator" \
        --model "$OUTPUT_DIR/${model_name}_quant.dlc" \
        --backend libQnnHtp.so \
        --output "$OUTPUT_DIR/${model_name}.bin"
}

generate_qnn_binary "blazeface"
generate_qnn_binary "face_landmark"

echo "推送到设备..."
# Fix: ensure the target directory exists before pushing into it.
adb shell mkdir -p /data/local/tmp/ims
adb push "$OUTPUT_DIR"/*.dlc /data/local/tmp/ims/
adb push "$OUTPUT_DIR"/*.bin /data/local/tmp/ims/

echo "=== 部署完成 ==="
|
8.2 性能测试脚本
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
#!/bin/bash
# Benchmark a quantized DLC on-device via snpe-benchmark and pull the report.
# Fix: added `set -e` so a failed benchmark aborts instead of pulling and
# printing a stale result file; variable expansions are now quoted.
set -e

MODEL="blazeface_quant.dlc"
BACKEND="htp"
ITERATIONS=100

adb shell /data/local/tmp/snpe-benchmark \
    --model "/data/local/tmp/ims/$MODEL" \
    --backend "$BACKEND" \
    --iterations "$ITERATIONS" \
    --perf_profile high_performance \
    --output /data/local/tmp/benchmark_result.txt

adb pull /data/local/tmp/benchmark_result.txt .

echo "性能测试完成"
cat benchmark_result.txt
|
九、常见问题与解决
9.1 模型转换失败
问题: TFLite 转 DLC 报错
1
| Error: Unsupported op: CUSTOM
|
解决:
1 2 3 4 5 6
# List the operations supported by the HTP op package to identify which op
# the model uses that is unsupported; unsupported ops require a custom op
# package (UDO) or a model-side change.
$SNPE_ROOT/bin/x86_64-linux-clang/snpe-supported-ops \
    --op_package_config $SNPE_ROOT/lib/x86_64-linux-clang/libSnpeOpPackageHtp.so
|
9.2 NPU 推理失败
问题: HTP 后端初始化失败
1
| Error: Failed to initialize HTP backend
|
解决:
1 2 3 4 5 6 7 8
# 1. Check which hardware platform the device reports.
adb shell cat /sys/devices/soc0/hw_platform

# 2. Verify the DSP device nodes are present.
adb shell ls /dev/dsp/*

# 3. Restart the FastRPC daemon that serves the compute DSP (cDSP).
adb shell "stop vendor.qcom.cdsprpcd; start vendor.qcom.cdsprpcd"
|
9.3 性能不达标
问题: NPU 推理速度慢于预期
优化措施:
- 确保模型已量化(INT8)
- 检查输入数据格式(NHWC vs NCHW)
- 关闭调试输出
- 使用 QNN Context Binary
1 2 3 4
/* Configure the SNPE C API runtime for maximum throughput: select the
 * high-performance profile and hint a high execution priority. */
Snpe_Runtime_Config_t config = Snpe_Runtime_Config_Create();
Snpe_Runtime_Config_SetPerformanceProfile(config, SNPE_PROFILE_HIGH_PERFORMANCE);
Snpe_Runtime_Config_SetExecutionPriorityHint(config, SNPE_PRIORITY_HIGH);
|
十、总结
| 要点 | 说明 |
| --- | --- |
| QNN 架构 | SNPE → QNN Runtime → HTP/GPU/DSP/CPU |
| 模型转换 | TFLite → DLC → Quantized DLC → QNN Binary |
| Calculator | QNN Inference Calculator 封装 |
| 性能优化 | INT8 量化、QNN Binary、高性能配置 |
参考资料
- Qualcomm. SNPE Documentation
- Qualcomm. QNN SDK
系列进度: 53/55
更新时间: 2026-03-12