add python demos

This commit is contained in:
dian.yuan 2026-01-08 19:43:28 +08:00
parent 3bdf2003ec
commit c91356fc38
97 changed files with 3250 additions and 290 deletions

View file

@ -0,0 +1,77 @@
#!/bin/bash
set -e
#
# Copyright (C) 20242025 Amlogic, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
usage() {
echo "Usage: $0 [-a <target_abi>]"
echo " -a <target_abi> : Target ABI (default: arm64-v8a)"
echo " -h : Show this help message"
exit 1
}
# Default values
TARGET_ABI=arm64-v8a
# Parse arguments
while getopts 'a:h' opt; do
case "$opt" in
a)
TARGET_ABI=$OPTARG
;;
h)
usage
;;
*)
usage
;;
esac
done
if [ -z "${ANDROID_NDK_PATH}" ]; then
if [ -n "${ANDROID_NDK}" ]; then
ANDROID_NDK_PATH=${ANDROID_NDK}
elif [ -n "${ANDROID_NDK_HOME}" ]; then
ANDROID_NDK_PATH=${ANDROID_NDK_HOME}
else
echo "Error: ANDROID_NDK_PATH is not set."
echo "Please set ANDROID_NDK_PATH to your Android NDK directory."
exit 1
fi
fi
ROOT_PWD=$(cd "$(dirname $0)" && pwd)
BUILD_DIR=${ROOT_PWD}/build/android
echo "Building for Android..."
echo "NDK_PATH: ${ANDROID_NDK_PATH}"
echo "TARGET_ABI: ${TARGET_ABI}"
echo "BUILD_DIR: ${BUILD_DIR}"
mkdir -p ${BUILD_DIR}
cd ${BUILD_DIR}
cmake ../../src \
-DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_PATH}/build/cmake/android.toolchain.cmake \
-DANDROID_ABI=${TARGET_ABI} \
-DANDROID_PLATFORM=android-24 \
-DCMAKE_BUILD_TYPE=Release \
-DOpenCV_DIR=${ROOT_PWD}/../../../dependency/opencv/opencv-android-sdk-build/sdk/native/jni/abi-${TARGET_ABI}
make -j4
echo "Build complete. Executable in ${BUILD_DIR}/yolox_demo"

View file

@ -0,0 +1,65 @@
#TODO
#!/bin/bash
set -e
usage() {
echo "Usage: $0 [-a <target_arch>]"
echo " -a <target_arch> : Target architecture (default: aarch64)"
echo " -h : Show this help message"
exit 1
}
# Default values
TARGET_ARCH=aarch64
# Parse arguments
while getopts 'a:h' opt; do
case "$opt" in
a)
TARGET_ARCH=$OPTARG
;;
h)
usage
;;
*)
usage
;;
esac
done
# Default to aarch64-linux-gnu if GCC_COMPILER is not set
GCC_COMPILER=${GCC_COMPILER:-aarch64-linux-gnu}
# Set compilers
export CC=${GCC_COMPILER}-gcc
export CXX=${GCC_COMPILER}-g++
# Validate compiler
if ! command -v ${CC} &> /dev/null; then
echo "Error: Compiler ${CC} not found."
echo "Please set GCC_COMPILER environment variable to your cross-compiler path prefix."
echo "Example: export GCC_COMPILER=/path/to/toolchain/bin/aarch64-linux-gnu"
# Proceeding anyway as user might have custom env setup
else
echo "Using compiler: ${CC}"
fi
ROOT_PWD=$(cd "$(dirname $0)" && pwd)
BUILD_DIR=${ROOT_PWD}/build/linux
echo "Building for Linux..."
echo "COMPILER: ${CC}"
echo "TARGET_ARCH: ${TARGET_ARCH}"
echo "BUILD_DIR: ${BUILD_DIR}"
mkdir -p ${BUILD_DIR}
cd ${BUILD_DIR}
cmake ../../src \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=${TARGET_ARCH} \
-DCMAKE_BUILD_TYPE=Release \
-DOpenCV_DIR=${ROOT_PWD}/../../../dependency/opencv/opencv-linux-aarch64/share/OpenCV
make -j4
echo "Build complete. Executable in ${BUILD_DIR}/yolo11_demo"

View file

@ -0,0 +1,41 @@
cmake_minimum_required(VERSION 3.5)
project(yolo11_demo)
set(CMAKE_CXX_STANDARD 17)
# Set NNSDK path
set(NNSDK_ROOT "${CMAKE_SOURCE_DIR}/../../../../dependency/nnsdk")
include_directories(${NNSDK_ROOT}/include)
include_directories(${CMAKE_SOURCE_DIR}/../../../../common)
# Set dependency path
set(3RDPARTY_DIR "${CMAKE_SOURCE_DIR}/../../../../dependency")
if(CMAKE_SYSTEM_NAME STREQUAL "Android")
if (ANDROID_ABI STREQUAL "arm64-v8a")
link_directories(${NNSDK_ROOT}/lib/android/arm64-v8a)
else()
link_directories(${NNSDK_ROOT}/lib/android/armeabi-v7a)
endif()
# Android needs log
link_libraries(log)
elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux")
link_directories(${NNSDK_ROOT}/lib/linux/lib64_yocto)
endif()
# Find OpenCV
message(STATUS "OpenCV_DIR: ${OpenCV_DIR}")
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})
add_executable(yolox_demo
main.cpp
postprocess.cpp
${CMAKE_SOURCE_DIR}/../../../../common/model_loader.cpp
)
target_link_libraries(yolox_demo
${OpenCV_LIBS}
nnsdk
)

153
examples/yolox/cpp/src/main.cpp Executable file
View file

@ -0,0 +1,153 @@
/*
* Copyright (C) 20242025 Amlogic, Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <iostream>
#include <string>
#include <vector>
#include <chrono>
#include <cmath>
#include <opencv2/opencv.hpp>
#include "postprocess.h"
#include "model_loader.h"
static float sigmoid(float x) {
return 1.0f / (1.0f + std::exp(-x));
}
const std::string DEFAULT_OUTPUT_PATH = "./result.jpg";
const int MODEL_INPUT_WIDTH = 640;
const int MODEL_INPUT_HEIGHT = 640;
const float SCORE_THRESHOLD = 0.25f;
const float NMS_THRESHOLD = 0.45f;
const float CONF_THRESHOLD = 0.45f;
const std::vector<std::string> CLASS_NAMES = {
"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
"traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog",
"horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
"handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite",
"baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
"wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich",
"orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
"potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote",
"keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book",
"clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"
};
int main(int argc, char** argv) {
if (argc < 3) {
printf("Usage: %s <model_path> <image_path> [output_path]\n", argv[0]);
return -1;
}
std::string model_path = argv[1];
std::string image_path = argv[2];
std::string output_path = (argc > 3) ? argv[3] : DEFAULT_OUTPUT_PATH;
std::cout << "YOLOX C++ Demo" << std::endl;
std::cout << "Model: " << model_path << std::endl;
std::cout << "Image: " << image_path << std::endl;
std::cout << "Output: " << output_path << std::endl;
// 1. Load Image
cv::Mat origin_img = cv::imread(image_path);
if (origin_img.empty()) {
std::cerr << "Failed to load image from " << image_path << std::endl;
return -1;
}
// 2. Initialize Network
void* context = init_network(model_path.c_str());
if (!context) {
std::cerr << "Failed to initialize network." << std::endl;
return -1;
}
// 3. Preprocess
cv::Mat img;
float scale;
std::tuple<int, int> pad;
std::tie(img, scale, pad) = preproc(origin_img, std::make_tuple(MODEL_INPUT_HEIGHT, MODEL_INPUT_WIDTH));
int pad_left = std::get<0>(pad);
int pad_top = std::get<1>(pad);
// 4. Run Network
std::tuple<cv::Mat, float, std::tuple<int, int>> input_tuple =
std::make_tuple(img, scale, pad);
auto start_time = std::chrono::high_resolution_clock::now();
void* output_ptr = run_network(context, {input_tuple});
if (!output_ptr) {
std::cerr << "Failed to run network." << std::endl;
uninit_network(context);
return -1;
}
nn_output* outdata = (nn_output*)output_ptr;
// 5. Postprocess
int num_classes = CLASS_NAMES.size();
std::vector<Detection> detections;
if (outdata->num == 1) {
// Single output YOLOX model [1, 8400, 85]
float* output = (float*)outdata->out[0].buf;
int num_boxes = 8400; // Default for YOLOX
if (outdata->out[0].param && outdata->out[0].param->num_of_dims >= 2) {
if (outdata->out[0].param->num_of_dims == 3) {
num_boxes = outdata->out[0].param->sizes[1];
} else if (outdata->out[0].param->num_of_dims == 2) {
num_boxes = outdata->out[0].param->sizes[0];
}
}
demo_postprocess(output, num_boxes, std::make_tuple(MODEL_INPUT_HEIGHT, MODEL_INPUT_WIDTH), false);
// boxes/scores
std::vector<cv::Rect2f> boxes;
std::vector<std::vector<float>> scores;
extract_boxes_and_scores(
output, num_boxes, num_classes,
scale, pad_left, pad_top,
origin_img.cols, origin_img.rows,
boxes, scores
);
// multiclass_nms
detections = multiclass_nms(boxes, scores, num_classes, NMS_THRESHOLD, 0.1f);
} else {
std::cerr << "Error: Unsupported output count: " << outdata->num << std::endl;
uninit_network(context);
return -1;
}
auto end_time = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> inference_time = end_time - start_time;
std::cout << "Inference + Postprocess time: " << inference_time.count() << " ms" << std::endl;
std::cout << "Detections found: " << detections.size() << std::endl;
// 6. Visualize and Save
cv::Mat result_img = vis(origin_img, detections, CONF_THRESHOLD, CLASS_NAMES);
cv::imwrite(output_path, result_img);
std::cout << "Result saved to " << output_path << std::endl;
// 7. Cleanup
uninit_network(context);
return 0;
}

View file

@ -0,0 +1,409 @@
/*
* Copyright (C) 20242025 Amlogic, Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "postprocess.h"
#include <algorithm>
#include <cmath>
#include <numeric>
std::tuple<cv::Mat, float, std::tuple<int, int>> preproc(const cv::Mat& img, std::tuple<int, int> input_size) {
// 1. letterbox (resize + padding)
// 2. BGR to RGB conversion
// 3. Normalize to 0-1 (divide by 255.0)
// Note: NNSDK's model_loader expects HWC format, so return HWC instead of CHW
int input_height = std::get<0>(input_size);
int input_width = std::get<1>(input_size);
// letterbox: calculate scale and padding
float scale = std::min(static_cast<float>(input_height) / img.rows,
static_cast<float>(input_width) / img.cols);
int new_w = static_cast<int>(std::round(img.cols * scale));
int new_h = static_cast<int>(std::round(img.rows * scale));
// resize
cv::Mat resized_img;
if (img.size() != cv::Size(new_w, new_h)) {
cv::resize(img, resized_img, cv::Size(new_w, new_h), 0, 0, cv::INTER_LINEAR);
} else {
resized_img = img.clone();
}
// padding
float pad_w = (input_width - new_w) / 2.0f;
float pad_h = (input_height - new_h) / 2.0f;
int top = static_cast<int>(std::round(pad_h - 0.1f));
int bottom = static_cast<int>(std::round(pad_h + 0.1f));
int left = static_cast<int>(std::round(pad_w - 0.1f));
int right = static_cast<int>(std::round(pad_w + 0.1f));
cv::Mat padded_img;
cv::copyMakeBorder(resized_img, padded_img, top, bottom, left, right,
cv::BORDER_CONSTANT, cv::Scalar(114, 114, 114));
// BGR to RGB conversion
cv::Mat rgb_img;
cv::cvtColor(padded_img, rgb_img, cv::COLOR_BGR2RGB);
// Normalize to 0-1 range (divide by 255.0)
cv::Mat normalized_img;
rgb_img.convertTo(normalized_img, CV_32F, 1.0 / 255.0);
// mean = [0.485, 0.456, 0.406]
// std = [0.229, 0.224, 0.225]
// Note: Use cv::divide for per-channel division in OpenCV
cv::Scalar mean(0.485f, 0.456f, 0.406f);
cv::Scalar std(0.229f, 0.224f, 0.225f);
normalized_img -= mean;
cv::divide(normalized_img, std, normalized_img);
// Return HWC format, ImageNet normalized float32 image (RGB format)
// Also return scale and padding (left, top) for coordinate mapping
return std::make_tuple(normalized_img, scale, std::make_tuple(left, top));
}
static float sigmoid(float x) {
return 1.0f / (1.0f + std::exp(-x));
}
void demo_postprocess(float* outputs, int num_boxes, std::tuple<int, int> img_size, bool p6) {
int img_height = std::get<0>(img_size);
int img_width = std::get<1>(img_size);
std::vector<int> strides;
if (!p6) {
strides = {8, 16, 32};
} else {
strides = {8, 16, 32, 64};
}
// Calculate grid count for each stride
std::vector<int> hsizes, wsizes;
for (int stride : strides) {
hsizes.push_back(img_height / stride);
wsizes.push_back(img_width / stride);
}
// Build grids and expanded_strides
std::vector<std::vector<float>> grids_list;
std::vector<std::vector<float>> strides_list;
int total_boxes = 0;
for (size_t i = 0; i < strides.size(); ++i) {
int hsize = hsizes[i];
int wsize = wsizes[i];
int stride = strides[i];
int grid_size = hsize * wsize;
std::vector<float> grid(grid_size * 2);
std::vector<float> expanded_stride(grid_size);
for (int h = 0; h < hsize; ++h) {
for (int w = 0; w < wsize; ++w) {
int idx = h * wsize + w;
grid[idx * 2] = static_cast<float>(w);
grid[idx * 2 + 1] = static_cast<float>(h);
expanded_stride[idx] = static_cast<float>(stride);
}
}
grids_list.push_back(grid);
strides_list.push_back(expanded_stride);
total_boxes += grid_size;
}
// Merge all grids and strides
std::vector<float> all_grids(total_boxes * 2);
std::vector<float> all_strides(total_boxes);
int offset = 0;
for (size_t i = 0; i < grids_list.size(); ++i) {
int grid_size = grids_list[i].size() / 2;
for (int j = 0; j < grid_size; ++j) {
all_grids[(offset + j) * 2] = grids_list[i][j * 2];
all_grids[(offset + j) * 2 + 1] = grids_list[i][j * 2 + 1];
all_strides[offset + j] = strides_list[i][j];
}
offset += grid_size;
}
// Apply grid and stride decoding
for (int i = 0; i < num_boxes && i < total_boxes; ++i) {
float* box = outputs + i * 85;
// outputs[..., :2] = (outputs[..., :2] + grids) * expanded_strides
box[0] = (box[0] + all_grids[i * 2]) * all_strides[i];
box[1] = (box[1] + all_grids[i * 2 + 1]) * all_strides[i];
// outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * expanded_strides
box[2] = std::exp(box[2]) * all_strides[i];
box[3] = std::exp(box[3]) * all_strides[i];
}
}
std::vector<int> nms(const std::vector<cv::Rect2f>& boxes, const std::vector<float>& scores, float nms_thr) {
if (boxes.empty()) return {};
// Create indices and sort
std::vector<int> indices(boxes.size());
std::iota(indices.begin(), indices.end(), 0);
std::sort(indices.begin(), indices.end(), [&scores](int a, int b) {
return scores[a] > scores[b];
});
std::vector<int> keep;
std::vector<bool> suppressed(boxes.size(), false);
for (size_t i = 0; i < indices.size(); ++i) {
int idx = indices[i];
if (suppressed[idx]) continue;
keep.push_back(idx);
float x1_i = boxes[idx].x;
float y1_i = boxes[idx].y;
float x2_i = boxes[idx].x + boxes[idx].width;
float y2_i = boxes[idx].y + boxes[idx].height;
float area_i = boxes[idx].width * boxes[idx].height;
for (size_t j = i + 1; j < indices.size(); ++j) {
int idx_j = indices[j];
if (suppressed[idx_j]) continue;
float x1_j = boxes[idx_j].x;
float y1_j = boxes[idx_j].y;
float x2_j = boxes[idx_j].x + boxes[idx_j].width;
float y2_j = boxes[idx_j].y + boxes[idx_j].height;
float xx1 = std::max(x1_i, x1_j);
float yy1 = std::max(y1_i, y1_j);
float xx2 = std::min(x2_i, x2_j);
float yy2 = std::min(y2_i, y2_j);
float w = std::max(0.0f, xx2 - xx1);
float h = std::max(0.0f, yy2 - yy1);
float inter = w * h;
float area_j = boxes[idx_j].width * boxes[idx_j].height;
float ovr = inter / (area_i + area_j - inter);
if (ovr > nms_thr) {
suppressed[idx_j] = true;
}
}
}
return keep;
}
std::vector<Detection> multiclass_nms(const std::vector<cv::Rect2f>& boxes,
const std::vector<std::vector<float>>& scores,
int num_classes,
float nms_thr,
float score_thr) {
if (boxes.empty() || scores.empty()) return {};
// Find max class score and class ID for each box
std::vector<float> cls_scores(boxes.size());
std::vector<int> cls_inds(boxes.size());
for (size_t i = 0; i < boxes.size(); ++i) {
float max_score = -1.0f;
int max_idx = -1;
for (int c = 0; c < num_classes; ++c) {
if (scores[i][c] > max_score) {
max_score = scores[i][c];
max_idx = c;
}
}
cls_scores[i] = max_score;
cls_inds[i] = max_idx;
}
// Filter low-score boxes
std::vector<cv::Rect2f> valid_boxes;
std::vector<float> valid_scores;
std::vector<int> valid_cls_inds;
std::vector<int> valid_indices;
for (size_t i = 0; i < boxes.size(); ++i) {
if (cls_scores[i] > score_thr) {
valid_boxes.push_back(boxes[i]);
valid_scores.push_back(cls_scores[i]);
valid_cls_inds.push_back(cls_inds[i]);
valid_indices.push_back(i);
}
}
if (valid_boxes.empty()) return {};
// Execute NMS
std::vector<int> keep = nms(valid_boxes, valid_scores, nms_thr);
// Build results
std::vector<Detection> dets;
for (int idx : keep) {
Detection det;
det.x1 = valid_boxes[idx].x;
det.y1 = valid_boxes[idx].y;
det.x2 = valid_boxes[idx].x + valid_boxes[idx].width;
det.y2 = valid_boxes[idx].y + valid_boxes[idx].height;
det.score = valid_scores[idx];
det.class_id = valid_cls_inds[idx];
dets.push_back(det);
}
return dets;
}
cv::Mat vis(const cv::Mat& img,
const std::vector<Detection>& detections,
float conf_thresh,
const std::vector<std::string>& class_names) {
cv::Mat result = img.clone();
// Adjust font size based on image size
int img_height = img.rows;
int img_width = img.cols;
float font_scale = std::max(0.6f, std::min(1.2f,
static_cast<float>(std::sqrt(img_height * img_height + img_width * img_width)) * 0.0015f));
int thickness = std::max(2, static_cast<int>(font_scale * 2.5f));
// YOLOX color palette
static const std::vector<cv::Scalar> colors = {
cv::Scalar(0, 114, 189), cv::Scalar(217, 83, 25), cv::Scalar(237, 177, 32),
cv::Scalar(126, 47, 142), cv::Scalar(119, 172, 48), cv::Scalar(77, 190, 238),
cv::Scalar(162, 20, 47), cv::Scalar(77, 77, 77), cv::Scalar(153, 153, 153),
cv::Scalar(255, 0, 0), cv::Scalar(255, 128, 0), cv::Scalar(191, 191, 0),
cv::Scalar(0, 255, 0), cv::Scalar(0, 0, 255), cv::Scalar(170, 0, 255),
cv::Scalar(85, 85, 0), cv::Scalar(85, 170, 0), cv::Scalar(85, 255, 0),
cv::Scalar(170, 85, 0), cv::Scalar(170, 170, 0), cv::Scalar(170, 255, 0),
cv::Scalar(255, 85, 0), cv::Scalar(255, 170, 0), cv::Scalar(255, 255, 0),
cv::Scalar(0, 85, 128), cv::Scalar(0, 170, 128), cv::Scalar(0, 255, 128),
cv::Scalar(85, 0, 128), cv::Scalar(85, 85, 128), cv::Scalar(85, 170, 128),
cv::Scalar(85, 255, 128), cv::Scalar(170, 0, 128), cv::Scalar(170, 85, 128),
cv::Scalar(170, 170, 128), cv::Scalar(170, 255, 128), cv::Scalar(255, 0, 128),
cv::Scalar(255, 85, 128), cv::Scalar(255, 170, 128), cv::Scalar(255, 255, 128),
cv::Scalar(0, 85, 255), cv::Scalar(0, 170, 255), cv::Scalar(0, 255, 255),
cv::Scalar(85, 0, 255), cv::Scalar(85, 85, 255), cv::Scalar(85, 170, 255),
cv::Scalar(85, 255, 255), cv::Scalar(170, 0, 255), cv::Scalar(170, 85, 255),
cv::Scalar(170, 170, 255), cv::Scalar(170, 255, 255), cv::Scalar(255, 0, 255),
cv::Scalar(255, 85, 255), cv::Scalar(255, 170, 255), cv::Scalar(85, 0, 0),
cv::Scalar(128, 0, 0), cv::Scalar(170, 0, 0), cv::Scalar(213, 0, 0),
cv::Scalar(255, 0, 0), cv::Scalar(0, 43, 0), cv::Scalar(0, 85, 0),
cv::Scalar(0, 128, 0), cv::Scalar(0, 170, 0), cv::Scalar(0, 213, 0),
cv::Scalar(0, 255, 0), cv::Scalar(0, 0, 43), cv::Scalar(0, 0, 85),
cv::Scalar(0, 0, 128), cv::Scalar(0, 0, 170), cv::Scalar(0, 0, 213),
cv::Scalar(0, 0, 255), cv::Scalar(0, 0, 0), cv::Scalar(36, 36, 36),
cv::Scalar(219, 219, 219), cv::Scalar(255, 255, 255)
};
for (const auto& det : detections) {
if (det.score < conf_thresh) continue;
if (det.class_id < 0 || det.class_id >= static_cast<int>(class_names.size())) continue;
int x0 = static_cast<int>(det.x1);
int y0 = static_cast<int>(det.y1);
int x1 = static_cast<int>(det.x2);
int y1 = static_cast<int>(det.y2);
cv::Scalar color = colors[det.class_id % colors.size()];
// Draw bounding box
cv::rectangle(result, cv::Point(x0, y0), cv::Point(x1, y1), color, thickness);
// Prepare text
std::string text = class_names[det.class_id] + ":" + cv::format("%.1f%%", det.score * 100);
// Calculate text size
int baseline = 0;
cv::Size txt_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, font_scale, thickness, &baseline);
// Draw text background
cv::Scalar txt_bk_color = color * 0.7;
cv::rectangle(result,
cv::Point(x0, y0 + 1),
cv::Point(x0 + txt_size.width + 1, y0 + static_cast<int>(1.5 * txt_size.height)),
txt_bk_color, -1);
// Draw text
cv::Scalar txt_color = (cv::mean(color)[0] > 0.5) ? cv::Scalar(0, 0, 0) : cv::Scalar(255, 255, 255);
cv::putText(result, text,
cv::Point(x0, y0 + txt_size.height),
cv::FONT_HERSHEY_SIMPLEX, font_scale, txt_color, thickness);
}
return result;
}
void extract_boxes_and_scores(
float* output,
int num_boxes,
int num_classes,
float scale,
int pad_left,
int pad_top,
int img_width,
int img_height,
std::vector<cv::Rect2f>& boxes,
std::vector<std::vector<float>>& scores)
{
boxes.clear();
scores.clear();
boxes.reserve(num_boxes);
scores.reserve(num_boxes);
// Extract all boxes and scores
for (int i = 0; i < num_boxes; ++i) {
float* box_data = output + i * 85;
// Format after demo_postprocess: [cx, cy, w, h, obj_conf, class0, ..., class79]
float cx = box_data[0];
float cy = box_data[1];
float w = box_data[2];
float h = box_data[3];
// Python: boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2]/2.
float x1 = cx - w / 2.0f;
float y1 = cy - h / 2.0f;
float x2 = cx + w / 2.0f;
float y2 = cy + h / 2.0f;
// Python: valid_boxes[:, [0, 2]] = (valid_boxes[:, [0, 2]] - pad_x) / scale
// Python: valid_boxes[:, [1, 3]] = (valid_boxes[:, [1, 3]] - pad_y) / scale
x1 = (x1 - pad_left) / scale;
y1 = (y1 - pad_top) / scale;
x2 = (x2 - pad_left) / scale;
y2 = (y2 - pad_top) / scale;
// Ensure coordinates are within image bounds
x1 = std::max(0.0f, std::min(static_cast<float>(img_width), x1));
y1 = std::max(0.0f, std::min(static_cast<float>(img_height), y1));
x2 = std::max(0.0f, std::min(static_cast<float>(img_width), x2));
y2 = std::max(0.0f, std::min(static_cast<float>(img_height), y2));
boxes.push_back(cv::Rect2f(x1, y1, x2 - x1, y2 - y1));
// Calculate class scores (obj_conf * cls_scores)
float obj_conf = box_data[4];
std::vector<float> cls_scores(num_classes);
for (int c = 0; c < num_classes; ++c) {
float cls_score = box_data[5 + c];
cls_scores[c] = obj_conf * cls_score;
}
scores.push_back(cls_scores);
}
}

View file

@ -0,0 +1,112 @@
/*
* Copyright (C) 20242025 Amlogic, Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef YOLOX_POSTPROCESS_H
#define YOLOX_POSTPROCESS_H
#include <opencv2/opencv.hpp>
#include <vector>
#include <tuple>
#include <string>
/**
* YOLOX preprocessing function
* @param img Input image (BGR format)
* @param input_size Target size (height, width)
* @return Processed image (HWC format, float32, ImageNet normalized, RGB format), scale factor, padding (left, top)
* Note: NNSDK's model_loader expects HWC format, so return HWC instead of CHW
* Processing steps:
* 1. letterbox (resize + padding with 114)
* 2. BGR to RGB conversion
* 3. Normalize to 0-1 (divide by 255.0)
* 4. ImageNet normalization (mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
*/
std::tuple<cv::Mat, float, std::tuple<int, int>> preproc(const cv::Mat& img, std::tuple<int, int> input_size);
// Detection result structure
struct Detection {
float x1, y1, x2, y2; // Bounding box coordinates
float score; // Confidence score
int class_id; // Predicted class ID
};
/**
* YOLOX official demo_postprocess function (C++ implementation)
* Decode model output to absolute coordinates
* @param outputs Model output [batch, num_boxes, 85]
* @param num_boxes Number of detection boxes
* @param img_size Input image size (height, width)
* @param p6 Whether to use P6 (default false, use P5)
* @return Decoded output [num_boxes, 85], format: [cx, cy, w, h, obj_conf, class0, ..., class79]
*/
void demo_postprocess(float* outputs, int num_boxes, std::tuple<int, int> img_size, bool p6 = false);
/**
* Extract boxes and scores from output after demo_postprocess
* @param output Model output (processed by demo_postprocess) [num_boxes * 85]
* @param num_boxes Number of detection boxes
* @param num_classes Number of classes
* @param scale Scale factor from preprocessing
* @param pad_left Left padding boundary
* @param pad_top Top padding boundary
* @param img_width Original image width
* @param img_height Original image height
* @param boxes Output boxes (xyxy format, mapped to original image size)
* @param scores Output scores (class scores for each box, obj_conf * cls_scores)
*/
void extract_boxes_and_scores(
float* output,
int num_boxes,
int num_classes,
float scale,
int pad_left,
int pad_top,
int img_width,
int img_height,
std::vector<cv::Rect2f>& boxes,
std::vector<std::vector<float>>& scores
);
/**
* Single-class NMS
*/
std::vector<int> nms(const std::vector<cv::Rect2f>& boxes, const std::vector<float>& scores, float nms_thr);
/**
* YOLOX official multiclass_nms function (class-agnostic version)
* @param boxes Detection boxes [N, 4] (x1, y1, x2, y2)
* @param scores Class scores [N, num_classes]
* @param num_classes Number of classes
* @param nms_thr NMS threshold
* @param score_thr Score threshold
* @return Detection results, each row is [x1, y1, x2, y2, score, class_id]
*/
std::vector<Detection> multiclass_nms(const std::vector<cv::Rect2f>& boxes,
const std::vector<std::vector<float>>& scores,
int num_classes,
float nms_thr,
float score_thr);
/**
* Visualize detection results (consistent with Python version, supports adaptive font size)
*/
cv::Mat vis(const cv::Mat& img,
const std::vector<Detection>& detections,
float conf_thresh,
const std::vector<std::string>& class_names);
#endif // YOLOX_POSTPROCESS_H

View file

448
examples/yolox/py/yolox.py Executable file
View file

@ -0,0 +1,448 @@
# -*- coding: utf-8 -*-
import numpy as np
import os
import glob
import argparse
import cv2
from pathlib import Path
from amlnnlite.api import AMLNNLite
# COCO 80 class names
CLASS_NAMES = [
"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
"traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog",
"horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
"handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite",
"baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
"wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich",
"orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
"potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote",
"keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book",
"clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"
]
def letterbox(img, new_shape=(640, 640), color=(114, 114, 114)):
shape = img.shape[:2] # [height, width]
scale = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
new_unpad = (int(round(shape[1] * scale)), int(round(shape[0] * scale)))
pad_w = (new_shape[1] - new_unpad[0]) / 2
pad_h = (new_shape[0] - new_unpad[1]) / 2
if shape[::-1] != new_unpad:
img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
top, bottom = int(round(pad_h - 0.1)), int(round(pad_h + 0.1))
left, right = int(round(pad_w - 0.1)), int(round(pad_w + 0.1))
img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
return img, scale, (left, top)
def demo_postprocess(outputs, img_size, p6=False):
"""
YOLOX official demo_postprocess function
Decode model output to absolute coordinates
"""
grids = []
expanded_strides = []
if not p6:
strides = [8, 16, 32]
else:
strides = [8, 16, 32, 64]
hsizes = [img_size[0] // stride for stride in strides]
wsizes = [img_size[1] // stride for stride in strides]
for hsize, wsize, stride in zip(hsizes, wsizes, strides):
xv, yv = np.meshgrid(np.arange(wsize), np.arange(hsize))
grid = np.stack((xv, yv), 2).reshape(1, -1, 2)
grids.append(grid)
shape = grid.shape[:2]
expanded_strides.append(np.full((*shape, 1), stride))
grids = np.concatenate(grids, 1)
expanded_strides = np.concatenate(expanded_strides, 1)
outputs[..., :2] = (outputs[..., :2] + grids) * expanded_strides
outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * expanded_strides
return outputs
def preprocess(img_path, new_shape=(640, 640), data_format='NHWC'):
"""
YOLOX preprocessing function (with ImageNet normalization)
Returns: processed image (HWC format for NHWC, float32, normalized), scale, pad
"""
original_img = cv2.imread(str(img_path))
if original_img is None:
raise ValueError(f"can't read image: {img_path}")
processed_img, scale, pad = letterbox(original_img, new_shape)
rgb_img = cv2.cvtColor(processed_img, cv2.COLOR_BGR2RGB)
# Normalize to 0-1
normalized_img = rgb_img.astype(np.float32) / 255.0
# ImageNet normalization
mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
normalized_img = (normalized_img - mean) / std
if data_format == 'NCHW':
# HWC -> CHW -> BCHW
input_tensor = np.transpose(normalized_img, (2, 0, 1))
input_tensor = np.expand_dims(input_tensor, axis=0)
elif data_format == 'NHWC':
# HWC -> BHWC
input_tensor = np.expand_dims(normalized_img, axis=0)
else:
raise ValueError(f"Unsupported data format: {data_format}. Only 'NCHW' and 'NHWC' are supported.")
return input_tensor, original_img, scale, pad
def nms(boxes, scores, nms_thr):
"""Single class NMS implemented in Numpy."""
x1 = boxes[:, 0]
y1 = boxes[:, 1]
x2 = boxes[:, 2]
y2 = boxes[:, 3]
areas = (x2 - x1 + 1) * (y2 - y1 + 1)
order = scores.argsort()[::-1]
keep = []
while order.size > 0:
i = order[0]
keep.append(i)
if order.size == 1:
break
xx1 = np.maximum(x1[i], x1[order[1:]])
yy1 = np.maximum(y1[i], y1[order[1:]])
xx2 = np.minimum(x2[i], x2[order[1:]])
yy2 = np.minimum(y2[i], y2[order[1:]])
w = np.maximum(0.0, xx2 - xx1 + 1)
h = np.maximum(0.0, yy2 - yy1 + 1)
inter = w * h
ovr = inter / (areas[i] + areas[order[1:]] - inter)
inds = np.where(ovr <= nms_thr)[0]
order = order[inds + 1]
return keep
def multiclass_nms(boxes, scores, nms_thr, score_thr):
"""
YOLOX official multiclass_nms function (class-agnostic version)
"""
cls_inds = scores.argmax(1)
cls_scores = scores[np.arange(len(cls_inds)), cls_inds]
valid_score_mask = cls_scores > score_thr
if valid_score_mask.sum() == 0:
return None
valid_scores = cls_scores[valid_score_mask]
valid_boxes = boxes[valid_score_mask]
valid_cls_inds = cls_inds[valid_score_mask]
keep = nms(valid_boxes, valid_scores, nms_thr)
if keep:
dets = np.concatenate(
[valid_boxes[keep], valid_scores[keep, None], valid_cls_inds[keep, None]], 1
)
return dets
return None
def postprocess(outputs, scale, pad, img_size=(640, 640), conf_threshold=0.25, iou_threshold=0.45, p6=False):
"""
YOLOX postprocessing (based on python_x.py)
Assumes single output [1, 8400, 85] or multiple outputs that need to be concatenated
"""
# Handle multiple outputs (if AMLNNLite returns multiple scales)
if isinstance(outputs, list):
if len(outputs) == 1:
output = outputs[0]
else:
# Concatenate multiple outputs if needed
# This assumes outputs are already in the correct format
output = outputs[0] # Use first output for now
else:
output = outputs
# Ensure output is in correct format [1, N, 85]
if len(output.shape) == 2:
# [N, 85] -> [1, N, 85]
output = output[None, :, :]
elif len(output.shape) == 3:
# [1, N, 85] or [N, 1, 85]
if output.shape[0] != 1:
output = output.transpose(1, 0, 2)[None, :, :]
elif len(output.shape) == 4:
# [1, 1, N, 85] -> [1, N, 85]
output = output[0, 0]
output = output[None, :, :]
# Convert to float32 if needed (AMLNNLite might return int8)
if output.dtype != np.float32:
output = output.astype(np.float32)
# Use demo_postprocess to decode coordinates
predictions = demo_postprocess(output, img_size, p6=p6)[0] # [8400, 85]
# Extract boxes and scores
# Format after demo_postprocess: [cx, cy, w, h, obj_conf, class0, ..., class79]
boxes = predictions[:, :4] # [cx, cy, w, h] (absolute coordinates)
scores = predictions[:, 4:5] * predictions[:, 5:] # obj_conf * cls_scores
# Convert to xyxy format
boxes_xyxy = np.ones_like(boxes)
boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2] / 2.0
boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3] / 2.0
boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2] / 2.0
boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3] / 2.0
# Map coordinates back to original image
pad_x, pad_y = pad
boxes_xyxy[:, [0, 2]] = (boxes_xyxy[:, [0, 2]] - pad_x) / scale
boxes_xyxy[:, [1, 3]] = (boxes_xyxy[:, [1, 3]] - pad_y) / scale
boxes_xyxy = np.maximum(boxes_xyxy, 0)
# Multiclass NMS (class-agnostic, score_thr=0.1 as in official YOLOX)
dets = multiclass_nms(boxes_xyxy, scores, nms_thr=iou_threshold, score_thr=0.1)
if dets is None:
return []
# Convert to detection format
final_boxes = dets[:, :4]
final_scores = dets[:, 4]
final_cls_inds = dets[:, 5].astype(int)
detections = []
for i in range(len(dets)):
x1, y1, x2, y2 = final_boxes[i]
confidence = final_scores[i]
class_id = final_cls_inds[i]
if confidence >= conf_threshold:
detections.append({
'bbox': [float(x1), float(y1), float(x2), float(y2)],
'confidence': float(confidence),
'class_id': int(class_id),
'class_name': CLASS_NAMES[class_id] if class_id < len(CLASS_NAMES) else f'class_{class_id}'
})
return detections
# YOLOX color palette (consistent with python_x.py)
_COLORS = (
np.array(
[
0.000, 0.447, 0.741,
0.850, 0.325, 0.098,
0.929, 0.694, 0.125,
0.494, 0.184, 0.556,
0.466, 0.674, 0.188,
0.301, 0.745, 0.933,
0.635, 0.078, 0.184,
0.300, 0.300, 0.300,
0.600, 0.600, 0.600,
1.000, 0.000, 0.000,
1.000, 0.500, 0.000,
0.749, 0.749, 0.000,
0.000, 1.000, 0.000,
0.000, 0.000, 1.000,
0.667, 0.000, 1.000,
0.333, 0.333, 0.000,
0.333, 0.667, 0.000,
0.333, 1.000, 0.000,
0.667, 0.333, 0.000,
0.667, 0.667, 0.000,
0.667, 1.000, 0.000,
1.000, 0.333, 0.000,
1.000, 0.667, 0.000,
1.000, 1.000, 0.000,
0.000, 0.333, 0.500,
0.000, 0.667, 0.500,
0.000, 1.000, 0.500,
0.333, 0.000, 0.500,
0.333, 0.333, 0.500,
0.333, 0.667, 0.500,
0.333, 1.000, 0.500,
0.667, 0.000, 0.500,
0.667, 0.333, 0.500,
0.667, 0.667, 0.500,
0.667, 1.000, 0.500,
1.000, 0.000, 0.500,
1.000, 0.333, 0.500,
1.000, 0.667, 0.500,
1.000, 1.000, 0.500,
0.000, 0.333, 1.000,
0.000, 0.667, 1.000,
0.000, 1.000, 1.000,
0.333, 0.000, 1.000,
0.333, 0.333, 1.000,
0.333, 0.667, 1.000,
0.333, 1.000, 1.000,
0.667, 0.000, 1.000,
0.667, 0.333, 1.000,
0.667, 0.667, 1.000,
0.667, 1.000, 1.000,
1.000, 0.000, 1.000,
1.000, 0.333, 1.000,
1.000, 0.667, 1.000,
0.333, 0.000, 0.000,
0.500, 0.000, 0.000,
0.667, 0.000, 0.000,
0.833, 0.000, 0.000,
1.000, 0.000, 0.000,
0.000, 0.167, 0.000,
0.000, 0.333, 0.000,
0.000, 0.500, 0.000,
0.000, 0.667, 0.000,
0.000, 0.833, 0.000,
0.000, 1.000, 0.000,
0.000, 0.000, 0.167,
0.000, 0.000, 0.333,
0.000, 0.000, 0.500,
0.000, 0.000, 0.667,
0.000, 0.000, 0.833,
0.000, 0.000, 1.000,
0.000, 0.000, 0.000,
0.143, 0.143, 0.143,
0.857, 0.857, 0.857,
1.000, 1.000, 1.000
]
).astype(np.float32).reshape(-1, 3)
)
def vis(img, detections, conf=0.5, class_names=None):
"""
YOLOX official visualization function (based on python_x.py)
"""
if class_names is None:
class_names = CLASS_NAMES
result_img = img.copy()
# Adjust font size based on image size
img_height, img_width = img.shape[:2]
font_scale = max(0.6, min(1.2, np.sqrt(img_height * img_height + img_width * img_width) * 0.0015))
thickness = max(2, int(font_scale * 2.5))
for det in detections:
if det['confidence'] < conf:
continue
x1, y1, x2, y2 = [int(coord) for coord in det['bbox']]
confidence = det['confidence']
class_id = det['class_id']
if class_id >= len(_COLORS):
class_id = class_id % len(_COLORS)
color = (_COLORS[class_id] * 255).astype(np.uint8).tolist()
text = '{}:{:.1f}%'.format(det['class_name'], confidence * 100)
txt_color = (0, 0, 0) if np.mean(_COLORS[class_id]) > 0.5 else (255, 255, 255)
font = cv2.FONT_HERSHEY_SIMPLEX
txt_size = cv2.getTextSize(text, font, font_scale, thickness)[0]
cv2.rectangle(result_img, (x1, y1), (x2, y2), color, thickness)
txt_bk_color = (_COLORS[class_id] * 255 * 0.7).astype(np.uint8).tolist()
cv2.rectangle(
result_img,
(x1, y1 + 1),
(x1 + txt_size[0] + 1, y1 + int(1.5 * txt_size[1])),
txt_bk_color,
-1
)
cv2.putText(result_img, text, (x1, y1 + txt_size[1]), font, font_scale, txt_color, thickness=thickness)
return result_img
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--model-path', default='./yolox_s_int8_A311D2.adla')
parser.add_argument('--run-cycles', default= 1, type=int)
parser.add_argument('--input-path', default='./', help='Input image path (file or directory)')
args = parser.parse_args()
# Initialize AMLNNLite
amlnn = AMLNNLite()
amlnn.config(
model_path=args.model_path, # Model file path, Support ADLD and quantized TFlite models
run_cycles=args.run_cycles
)
amlnn.init()
# Find image files
image_files = []
if os.path.isfile(args.input_path):
# Single image file
image_files = [args.input_path]
elif os.path.isdir(args.input_path):
# Directory - find all image files
image_extensions = ["*.jpg", "*.jpeg", "*.png", "*.bmp"]
for ext in image_extensions:
image_files.extend(glob.glob(os.path.join(args.input_path, ext)))
image_files.extend(glob.glob(os.path.join(args.input_path, ext.upper())))
else:
print(f"Error: Input path '{args.input_path}' does not exist")
amlnn.uninit()
return
if not image_files:
print(f"No image files found in {args.input_path}")
amlnn.uninit()
return
print(f"Found {len(image_files)} image files to process:")
for img_file in image_files:
print(f" - {os.path.basename(img_file)}")
print()
# Process each image
for i, image_path in enumerate(image_files, 1):
print(f"=" * 60)
print(f"Processing image {i}/{len(image_files)}: {os.path.basename(image_path)}")
print(f"=" * 60)
try:
# Preprocess input
input_tensor, original_img, scale, pad = preprocess(image_path, new_shape=(640, 640), data_format='NHWC')
# Run inference
outputs = amlnn.inference(
inputs=[input_tensor]
)
# Postprocess results
detections = postprocess(outputs, scale, pad, img_size=(640, 640), conf_threshold=0.25, iou_threshold=0.45, p6=False)
# Print detection results
if detections:
print(f" Detected {len(detections)} objects:")
for i, det in enumerate(detections, 1):
print(f" {i}. {det['class_name']} ({det['confidence']:.2f})")
else:
print(" No objects detected")
# Save result image (save to current directory)
img_name = Path(image_path).stem
save_path = f"{img_name}_result.jpg"
result_img = vis(original_img, detections, conf=0.25, class_names=CLASS_NAMES)
cv2.imwrite(save_path, result_img)
print(f" Result saved to: {save_path}")
except Exception as e:
print(f"Error processing {os.path.basename(image_path)}: {e}")
print()
# Optional visualization
amlnn.visualize()
# Release resources
amlnn.uninit()
if __name__ == "__main__":
main()