amlnn-model-playground/examples/yolox/py/yolox.py

464 lines
16 KiB
Python
Executable file

# -*- coding: utf-8 -*-
#
# Copyright (C) 2026 Amlogic, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
import os
import glob
import argparse
import cv2
from pathlib import Path
from amlnnlite.api import AMLNNLite
# COCO 80 class names
CLASS_NAMES = [
"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
"traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog",
"horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
"handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite",
"baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
"wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich",
"orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
"potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote",
"keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book",
"clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"
]
def letterbox(img, new_shape=(640, 640), color=(114, 114, 114)):
shape = img.shape[:2] # [height, width]
scale = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
new_unpad = (int(round(shape[1] * scale)), int(round(shape[0] * scale)))
pad_w = (new_shape[1] - new_unpad[0]) / 2
pad_h = (new_shape[0] - new_unpad[1]) / 2
if shape[::-1] != new_unpad:
img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
top, bottom = int(round(pad_h - 0.1)), int(round(pad_h + 0.1))
left, right = int(round(pad_w - 0.1)), int(round(pad_w + 0.1))
img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
return img, scale, (left, top)
def demo_postprocess(outputs, img_size, p6=False):
"""
YOLOX official demo_postprocess function
Decode model output to absolute coordinates
"""
grids = []
expanded_strides = []
if not p6:
strides = [8, 16, 32]
else:
strides = [8, 16, 32, 64]
hsizes = [img_size[0] // stride for stride in strides]
wsizes = [img_size[1] // stride for stride in strides]
for hsize, wsize, stride in zip(hsizes, wsizes, strides):
xv, yv = np.meshgrid(np.arange(wsize), np.arange(hsize))
grid = np.stack((xv, yv), 2).reshape(1, -1, 2)
grids.append(grid)
shape = grid.shape[:2]
expanded_strides.append(np.full((*shape, 1), stride))
grids = np.concatenate(grids, 1)
expanded_strides = np.concatenate(expanded_strides, 1)
outputs[..., :2] = (outputs[..., :2] + grids) * expanded_strides
outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * expanded_strides
return outputs
def preprocess(img_path, new_shape=(640, 640), data_format='NHWC'):
"""
YOLOX preprocessing function (with ImageNet normalization)
Returns: processed image (HWC format for NHWC, float32, normalized), scale, pad
"""
original_img = cv2.imread(str(img_path))
if original_img is None:
raise ValueError(f"can't read image: {img_path}")
processed_img, scale, pad = letterbox(original_img, new_shape)
rgb_img = cv2.cvtColor(processed_img, cv2.COLOR_BGR2RGB)
# Normalize to 0-1
normalized_img = rgb_img.astype(np.float32) / 255.0
# ImageNet normalization
mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
normalized_img = (normalized_img - mean) / std
if data_format == 'NCHW':
# HWC -> CHW -> BCHW
input_tensor = np.transpose(normalized_img, (2, 0, 1))
input_tensor = np.expand_dims(input_tensor, axis=0)
elif data_format == 'NHWC':
# HWC -> BHWC
input_tensor = np.expand_dims(normalized_img, axis=0)
else:
raise ValueError(f"Unsupported data format: {data_format}. Only 'NCHW' and 'NHWC' are supported.")
return input_tensor, original_img, scale, pad
def nms(boxes, scores, nms_thr):
"""Single class NMS implemented in Numpy."""
x1 = boxes[:, 0]
y1 = boxes[:, 1]
x2 = boxes[:, 2]
y2 = boxes[:, 3]
areas = (x2 - x1 + 1) * (y2 - y1 + 1)
order = scores.argsort()[::-1]
keep = []
while order.size > 0:
i = order[0]
keep.append(i)
if order.size == 1:
break
xx1 = np.maximum(x1[i], x1[order[1:]])
yy1 = np.maximum(y1[i], y1[order[1:]])
xx2 = np.minimum(x2[i], x2[order[1:]])
yy2 = np.minimum(y2[i], y2[order[1:]])
w = np.maximum(0.0, xx2 - xx1 + 1)
h = np.maximum(0.0, yy2 - yy1 + 1)
inter = w * h
ovr = inter / (areas[i] + areas[order[1:]] - inter)
inds = np.where(ovr <= nms_thr)[0]
order = order[inds + 1]
return keep
def multiclass_nms(boxes, scores, nms_thr, score_thr):
"""
YOLOX official multiclass_nms function (class-agnostic version)
"""
cls_inds = scores.argmax(1)
cls_scores = scores[np.arange(len(cls_inds)), cls_inds]
valid_score_mask = cls_scores > score_thr
if valid_score_mask.sum() == 0:
return None
valid_scores = cls_scores[valid_score_mask]
valid_boxes = boxes[valid_score_mask]
valid_cls_inds = cls_inds[valid_score_mask]
keep = nms(valid_boxes, valid_scores, nms_thr)
if keep:
dets = np.concatenate(
[valid_boxes[keep], valid_scores[keep, None], valid_cls_inds[keep, None]], 1
)
return dets
return None
def postprocess(outputs, scale, pad, img_size=(640, 640), conf_threshold=0.25, iou_threshold=0.45, p6=False):
"""
YOLOX postprocessing (based on python_x.py)
Assumes single output [1, 8400, 85] or multiple outputs that need to be concatenated
"""
# Handle multiple outputs (if AMLNNLite returns multiple scales)
if isinstance(outputs, list):
if len(outputs) == 1:
output = outputs[0]
else:
# Concatenate multiple outputs if needed
# This assumes outputs are already in the correct format
output = outputs[0] # Use first output for now
else:
output = outputs
# Ensure output is in correct format [1, N, 85]
if len(output.shape) == 2:
# [N, 85] -> [1, N, 85]
output = output[None, :, :]
elif len(output.shape) == 3:
# [1, N, 85] or [N, 1, 85]
if output.shape[0] != 1:
output = output.transpose(1, 0, 2)[None, :, :]
elif len(output.shape) == 4:
# [1, 1, N, 85] -> [1, N, 85]
output = output[0, 0]
output = output[None, :, :]
# Convert to float32 if needed (AMLNNLite might return int8)
if output.dtype != np.float32:
output = output.astype(np.float32)
# Use demo_postprocess to decode coordinates
predictions = demo_postprocess(output, img_size, p6=p6)[0] # [8400, 85]
# Extract boxes and scores
# Format after demo_postprocess: [cx, cy, w, h, obj_conf, class0, ..., class79]
boxes = predictions[:, :4] # [cx, cy, w, h] (absolute coordinates)
scores = predictions[:, 4:5] * predictions[:, 5:] # obj_conf * cls_scores
# Convert to xyxy format
boxes_xyxy = np.ones_like(boxes)
boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2] / 2.0
boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3] / 2.0
boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2] / 2.0
boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3] / 2.0
# Map coordinates back to original image
pad_x, pad_y = pad
boxes_xyxy[:, [0, 2]] = (boxes_xyxy[:, [0, 2]] - pad_x) / scale
boxes_xyxy[:, [1, 3]] = (boxes_xyxy[:, [1, 3]] - pad_y) / scale
boxes_xyxy = np.maximum(boxes_xyxy, 0)
# Multiclass NMS (class-agnostic, score_thr=0.1 as in official YOLOX)
dets = multiclass_nms(boxes_xyxy, scores, nms_thr=iou_threshold, score_thr=0.1)
if dets is None:
return []
# Convert to detection format
final_boxes = dets[:, :4]
final_scores = dets[:, 4]
final_cls_inds = dets[:, 5].astype(int)
detections = []
for i in range(len(dets)):
x1, y1, x2, y2 = final_boxes[i]
confidence = final_scores[i]
class_id = final_cls_inds[i]
if confidence >= conf_threshold:
detections.append({
'bbox': [float(x1), float(y1), float(x2), float(y2)],
'confidence': float(confidence),
'class_id': int(class_id),
'class_name': CLASS_NAMES[class_id] if class_id < len(CLASS_NAMES) else f'class_{class_id}'
})
return detections
# YOLOX color palette (consistent with python_x.py)
_COLORS = (
np.array(
[
0.000, 0.447, 0.741,
0.850, 0.325, 0.098,
0.929, 0.694, 0.125,
0.494, 0.184, 0.556,
0.466, 0.674, 0.188,
0.301, 0.745, 0.933,
0.635, 0.078, 0.184,
0.300, 0.300, 0.300,
0.600, 0.600, 0.600,
1.000, 0.000, 0.000,
1.000, 0.500, 0.000,
0.749, 0.749, 0.000,
0.000, 1.000, 0.000,
0.000, 0.000, 1.000,
0.667, 0.000, 1.000,
0.333, 0.333, 0.000,
0.333, 0.667, 0.000,
0.333, 1.000, 0.000,
0.667, 0.333, 0.000,
0.667, 0.667, 0.000,
0.667, 1.000, 0.000,
1.000, 0.333, 0.000,
1.000, 0.667, 0.000,
1.000, 1.000, 0.000,
0.000, 0.333, 0.500,
0.000, 0.667, 0.500,
0.000, 1.000, 0.500,
0.333, 0.000, 0.500,
0.333, 0.333, 0.500,
0.333, 0.667, 0.500,
0.333, 1.000, 0.500,
0.667, 0.000, 0.500,
0.667, 0.333, 0.500,
0.667, 0.667, 0.500,
0.667, 1.000, 0.500,
1.000, 0.000, 0.500,
1.000, 0.333, 0.500,
1.000, 0.667, 0.500,
1.000, 1.000, 0.500,
0.000, 0.333, 1.000,
0.000, 0.667, 1.000,
0.000, 1.000, 1.000,
0.333, 0.000, 1.000,
0.333, 0.333, 1.000,
0.333, 0.667, 1.000,
0.333, 1.000, 1.000,
0.667, 0.000, 1.000,
0.667, 0.333, 1.000,
0.667, 0.667, 1.000,
0.667, 1.000, 1.000,
1.000, 0.000, 1.000,
1.000, 0.333, 1.000,
1.000, 0.667, 1.000,
0.333, 0.000, 0.000,
0.500, 0.000, 0.000,
0.667, 0.000, 0.000,
0.833, 0.000, 0.000,
1.000, 0.000, 0.000,
0.000, 0.167, 0.000,
0.000, 0.333, 0.000,
0.000, 0.500, 0.000,
0.000, 0.667, 0.000,
0.000, 0.833, 0.000,
0.000, 1.000, 0.000,
0.000, 0.000, 0.167,
0.000, 0.000, 0.333,
0.000, 0.000, 0.500,
0.000, 0.000, 0.667,
0.000, 0.000, 0.833,
0.000, 0.000, 1.000,
0.000, 0.000, 0.000,
0.143, 0.143, 0.143,
0.857, 0.857, 0.857,
1.000, 1.000, 1.000
]
).astype(np.float32).reshape(-1, 3)
)
def vis(img, detections, conf=0.5, class_names=None):
"""
YOLOX official visualization function (based on python_x.py)
"""
if class_names is None:
class_names = CLASS_NAMES
result_img = img.copy()
# Adjust font size based on image size
img_height, img_width = img.shape[:2]
font_scale = max(0.6, min(1.2, np.sqrt(img_height * img_height + img_width * img_width) * 0.0015))
thickness = max(2, int(font_scale * 2.5))
for det in detections:
if det['confidence'] < conf:
continue
x1, y1, x2, y2 = [int(coord) for coord in det['bbox']]
confidence = det['confidence']
class_id = det['class_id']
if class_id >= len(_COLORS):
class_id = class_id % len(_COLORS)
color = (_COLORS[class_id] * 255).astype(np.uint8).tolist()
text = '{}:{:.1f}%'.format(det['class_name'], confidence * 100)
txt_color = (0, 0, 0) if np.mean(_COLORS[class_id]) > 0.5 else (255, 255, 255)
font = cv2.FONT_HERSHEY_SIMPLEX
txt_size = cv2.getTextSize(text, font, font_scale, thickness)[0]
cv2.rectangle(result_img, (x1, y1), (x2, y2), color, thickness)
txt_bk_color = (_COLORS[class_id] * 255 * 0.7).astype(np.uint8).tolist()
cv2.rectangle(
result_img,
(x1, y1 + 1),
(x1 + txt_size[0] + 1, y1 + int(1.5 * txt_size[1])),
txt_bk_color,
-1
)
cv2.putText(result_img, text, (x1, y1 + txt_size[1]), font, font_scale, txt_color, thickness=thickness)
return result_img
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--model-path', default='./yolox_s_int8_A311D2.adla')
parser.add_argument('--run-cycles', default= 1, type=int)
parser.add_argument('--input-path', default='./', help='Input image path (file or directory)')
args = parser.parse_args()
# Initialize AMLNNLite
amlnn = AMLNNLite()
amlnn.config(
model_path=args.model_path, # Model file path, Support ADLA and quantized TFlite models
run_cycles=args.run_cycles
)
amlnn.init()
# Find image files
image_files = []
if os.path.isfile(args.input_path):
# Single image file
image_files = [args.input_path]
elif os.path.isdir(args.input_path):
# Directory - find all image files
image_extensions = ["*.jpg", "*.jpeg", "*.png", "*.bmp"]
for ext in image_extensions:
image_files.extend(glob.glob(os.path.join(args.input_path, ext)))
image_files.extend(glob.glob(os.path.join(args.input_path, ext.upper())))
else:
print(f"Error: Input path '{args.input_path}' does not exist")
amlnn.uninit()
return
if not image_files:
print(f"No image files found in {args.input_path}")
amlnn.uninit()
return
print(f"Found {len(image_files)} image files to process:")
for img_file in image_files:
print(f" - {os.path.basename(img_file)}")
print()
# Process each image
for i, image_path in enumerate(image_files, 1):
print(f"=" * 60)
print(f"Processing image {i}/{len(image_files)}: {os.path.basename(image_path)}")
print(f"=" * 60)
try:
# Preprocess input
input_tensor, original_img, scale, pad = preprocess(image_path, new_shape=(640, 640), data_format='NHWC')
# Run inference
outputs = amlnn.inference(
inputs=[input_tensor]
)
# Postprocess results
detections = postprocess(outputs, scale, pad, img_size=(640, 640), conf_threshold=0.25, iou_threshold=0.45, p6=False)
# Print detection results
if detections:
print(f" Detected {len(detections)} objects:")
for i, det in enumerate(detections, 1):
print(f" {i}. {det['class_name']} ({det['confidence']:.2f})")
else:
print(" No objects detected")
# Save result image (save to current directory)
img_name = Path(image_path).stem
save_path = f"{img_name}_result.jpg"
result_img = vis(original_img, detections, conf=0.25, class_names=CLASS_NAMES)
cv2.imwrite(save_path, result_img)
print(f" Result saved to: {save_path}")
except Exception as e:
print(f"Error processing {os.path.basename(image_path)}: {e}")
print()
# Optional visualization
amlnn.visualize()
# Release resources
amlnn.uninit()
if __name__ == "__main__":
main()