# amlnn-model-playground/examples/yolox/py/yolox.py

# -*- coding: utf-8 -*-

#
# Copyright (C) 2026 Amlogic, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
import os
import glob
import argparse
import cv2
from pathlib import Path
from amlnnlite.api import AMLNNLite

# COCO class labels (80 entries). The list position is the class id used by
# the detector output, so the order must not change. Ten labels per row; the
# trailing comment gives the id range covered by that row.
CLASS_NAMES = [
    "person", "bicycle", "car", "motorcycle", "airplane",
    "bus", "train", "truck", "boat", "traffic light",                # 0-9
    "fire hydrant", "stop sign", "parking meter", "bench", "bird",
    "cat", "dog", "horse", "sheep", "cow",                           # 10-19
    "elephant", "bear", "zebra", "giraffe", "backpack",
    "umbrella", "handbag", "tie", "suitcase", "frisbee",             # 20-29
    "skis", "snowboard", "sports ball", "kite", "baseball bat",
    "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",  # 30-39
    "wine glass", "cup", "fork", "knife", "spoon",
    "bowl", "banana", "apple", "sandwich", "orange",                 # 40-49
    "broccoli", "carrot", "hot dog", "pizza", "donut",
    "cake", "chair", "couch", "potted plant", "bed",                 # 50-59
    "dining table", "toilet", "tv", "laptop", "mouse",
    "remote", "keyboard", "cell phone", "microwave", "oven",         # 60-69
    "toaster", "sink", "refrigerator", "book", "clock",
    "vase", "scissors", "teddy bear", "hair drier", "toothbrush",    # 70-79
]

def letterbox(img, new_shape=(640, 640), color=(114, 114, 114)):
    """
    Resize an image to fit inside ``new_shape`` and pad the remainder.

    The aspect ratio is preserved: the image is scaled by the largest factor
    that keeps it within ``new_shape`` and the leftover border is filled with
    ``color``, split as evenly as possible between opposite sides.

    Args:
        img: Image array whose first two dimensions are (height, width).
        new_shape: Target (height, width).
        color: Constant border value passed to cv2.copyMakeBorder.

    Returns:
        (padded_img, scale, (left, top)) where ``scale`` is the resize factor
        and ``(left, top)`` are the padding widths added on the left and top.
    """
    src_h, src_w = img.shape[:2]
    dst_h, dst_w = new_shape[0], new_shape[1]

    # Single factor for both axes so the content is not distorted
    scale = min(dst_h / src_h, dst_w / src_w)
    resized_wh = (int(round(src_w * scale)), int(round(src_h * scale)))

    # Skip the resize when the image already has the scaled size
    if (src_w, src_h) != resized_wh:
        img = cv2.resize(img, resized_wh, interpolation=cv2.INTER_LINEAR)

    half_pad_w = (dst_w - resized_wh[0]) / 2
    half_pad_h = (dst_h - resized_wh[1]) / 2

    # The -0.1 / +0.1 offsets send an odd leftover pixel to the bottom/right side
    top = int(round(half_pad_h - 0.1))
    bottom = int(round(half_pad_h + 0.1))
    left = int(round(half_pad_w - 0.1))
    right = int(round(half_pad_w + 0.1))

    img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
    return img, scale, (left, top)

def demo_postprocess(outputs, img_size, p6=False):
    """
    Decode raw YOLOX head output into absolute box coordinates.

    For every stride a grid of cell indices is generated; the first two
    channels (box centre) are offset by the cell index and multiplied by the
    stride, and the next two channels (box size) are exponentiated and
    multiplied by the stride.

    Note: ``outputs`` is modified in place and the same array is returned.

    Args:
        outputs: Array whose last axis starts with [cx, cy, w, h] and whose
            second-to-last axis enumerates all grid cells of all strides.
        img_size: Network input size as (height, width).
        p6: When True an extra stride of 64 is included.

    Returns:
        The decoded ``outputs`` array.
    """
    strides = [8, 16, 32, 64] if p6 else [8, 16, 32]

    cell_chunks = []
    stride_chunks = []
    for stride in strides:
        rows = img_size[0] // stride
        cols = img_size[1] // stride
        col_idx, row_idx = np.meshgrid(np.arange(cols), np.arange(rows))
        # (rows, cols, 2) -> (1, rows * cols, 2), x index first then y index
        cells = np.stack((col_idx, row_idx), 2).reshape(1, -1, 2)
        cell_chunks.append(cells)
        stride_chunks.append(np.full((1, cells.shape[1], 1), stride))

    all_cells = np.concatenate(cell_chunks, 1)
    all_strides = np.concatenate(stride_chunks, 1)

    # In-place decode: centres first, then sizes
    outputs[..., :2] = (outputs[..., :2] + all_cells) * all_strides
    outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * all_strides

    return outputs

def preprocess(img_path, new_shape=(640, 640), data_format='NHWC'):
    """
    Load an image and turn it into a batched network input.

    Steps: read with OpenCV, letterbox to ``new_shape``, convert BGR -> RGB,
    scale to [0, 1], apply per-channel mean/std normalization and add a batch
    axis in the requested layout.

    Args:
        img_path: Path of the image file.
        new_shape: Target (height, width) handed to letterbox.
        data_format: 'NHWC' or 'NCHW'.

    Returns:
        (input_tensor, original_img, scale, pad): the float32 batched tensor,
        the image as read from disk, and the letterbox scale and padding.

    Raises:
        ValueError: If the image cannot be read or ``data_format`` is unknown.
    """
    original_img = cv2.imread(str(img_path))
    if original_img is None:
        raise ValueError(f"can't read image: {img_path}")

    boxed_img, scale, pad = letterbox(original_img, new_shape)
    pixels = cv2.cvtColor(boxed_img, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0

    # Per-channel (R, G, B) mean / std normalization
    channel_mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
    channel_std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
    pixels = (pixels - channel_mean) / channel_std

    if data_format not in ('NCHW', 'NHWC'):
        raise ValueError(f"Unsupported data format: {data_format}. Only 'NCHW' and 'NHWC' are supported.")

    if data_format == 'NCHW':
        # Channels-first layout: HWC -> CHW
        pixels = np.transpose(pixels, (2, 0, 1))

    # Prepend the batch axis
    input_tensor = np.expand_dims(pixels, axis=0)

    return input_tensor, original_img, scale, pad

def nms(boxes, scores, nms_thr):
    """
    Greedy single-class non-maximum suppression in NumPy.

    Boxes are visited from highest to lowest score; each visited box is kept
    and every remaining box whose IoU with it exceeds ``nms_thr`` is dropped.
    Widths and heights are computed with a +1 offset (inclusive pixel
    coordinates).

    Args:
        boxes: Array of shape (N, 4) in [x1, y1, x2, y2] order.
        scores: Array of N scores.
        nms_thr: IoU threshold above which a box is suppressed.

    Returns:
        List of kept indices into ``boxes``, ordered by decreasing score.
    """
    left, top, right, bottom = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    box_area = (right - left + 1) * (bottom - top + 1)

    remaining = scores.argsort()[::-1]
    kept = []
    while remaining.size > 0:
        best, rest = remaining[0], remaining[1:]
        kept.append(best)
        if rest.size == 0:
            break

        # Intersection rectangle between the best box and every other candidate
        inter_w = np.maximum(0.0, np.minimum(right[best], right[rest]) - np.maximum(left[best], left[rest]) + 1)
        inter_h = np.maximum(0.0, np.minimum(bottom[best], bottom[rest]) - np.maximum(top[best], top[rest]) + 1)
        inter_area = inter_w * inter_h
        iou = inter_area / (box_area[best] + box_area[rest] - inter_area)

        # Only candidates that do not overlap too much survive to the next round
        remaining = rest[iou <= nms_thr]

    return kept

def multiclass_nms(boxes, scores, nms_thr, score_thr):
    """
    Class-agnostic NMS over multi-class scores.

    Each box is assigned its best-scoring class, boxes whose best score is
    not above ``score_thr`` are discarded, and a single NMS pass is run over
    the survivors regardless of class.

    Args:
        boxes: Array of shape (N, 4) in [x1, y1, x2, y2] order.
        scores: Array of shape (N, num_classes).
        nms_thr: IoU threshold forwarded to nms.
        score_thr: Minimum (exclusive) best-class score for a box to be considered.

    Returns:
        Array of shape (K, 6) with rows [x1, y1, x2, y2, score, class_index],
        or None when nothing survives.
    """
    best_cls = scores.argmax(1)
    best_score = scores[np.arange(len(best_cls)), best_cls]

    passing = best_score > score_thr
    if not passing.any():
        return None

    cand_boxes = boxes[passing]
    cand_scores = best_score[passing]
    cand_cls = best_cls[passing]

    keep = nms(cand_boxes, cand_scores, nms_thr)
    if not keep:
        return None

    # Column-stack boxes, scores and class indices of the kept candidates
    return np.concatenate(
        [cand_boxes[keep], cand_scores[keep, None], cand_cls[keep, None]], 1
    )

def postprocess(outputs, scale, pad, img_size=(640, 640), conf_threshold=0.25, iou_threshold=0.45, p6=False):
    """
    YOLOX postprocessing (based on python_x.py)

    Decodes the raw head output, converts boxes to corner format, maps them
    from the letterboxed input back to the original image and runs
    class-agnostic NMS. The caller's ``outputs`` are never modified.

    Args:
        outputs: Raw inference result: an array, or a list/tuple of arrays of
            which only the first is decoded. Accepted layouts are [N, C],
            [1, N, C], [N, 1, C] and [1, 1, N, C], with columns
            [cx, cy, w, h, obj_conf, class scores...].
        scale: Resize factor returned by letterbox.
        pad: (left, top) padding returned by letterbox.
        img_size: Network input size as (height, width).
        conf_threshold: Minimum confidence for a detection to be returned.
        iou_threshold: IoU threshold used by NMS.
        p6: Forwarded to demo_postprocess (adds the stride-64 level).

    Returns:
        List of dicts with keys 'bbox' ([x1, y1, x2, y2] in original-image
        pixels), 'confidence', 'class_id' and 'class_name'. Empty when
        nothing is detected.

    Raises:
        ValueError: If the output tensor has an unsupported shape.
    """
    # Only the first tensor is decoded; multi-tensor (per-scale) outputs are
    # not concatenated here.
    if isinstance(outputs, (list, tuple)):
        output = outputs[0]
    else:
        output = outputs

    # Always work on a float32 copy: demo_postprocess decodes in place, and
    # the inference buffer handed in by the caller must stay untouched.
    output = np.array(output, dtype=np.float32)

    # Normalize the layout to [1, N, C]
    if output.ndim == 2:
        # [N, C] -> [1, N, C]
        output = output[None, :, :]
    elif output.ndim == 3:
        if output.shape[0] != 1:
            if output.shape[1] != 1:
                raise ValueError(f"Unsupported output shape: {output.shape}")
            # [N, 1, C] -> [1, N, C]; the transpose alone yields the batch
            # axis, so no extra leading axis must be added.
            output = output.transpose(1, 0, 2)
    elif output.ndim == 4:
        # [1, 1, N, C] -> [1, N, C]
        output = output[0, 0][None, :, :]
    else:
        raise ValueError(f"Unsupported output shape: {output.shape}")

    # Decode grid-relative values to absolute input-image coordinates
    predictions = demo_postprocess(output, img_size, p6=p6)[0]  # [N, C]

    boxes = predictions[:, :4]  # [cx, cy, w, h]
    scores = predictions[:, 4:5] * predictions[:, 5:]  # obj_conf * cls_scores

    # Centre/size -> corner format
    half_wh = boxes[:, 2:4] / 2.0
    boxes_xyxy = np.concatenate((boxes[:, :2] - half_wh, boxes[:, :2] + half_wh), axis=1)

    # Undo the letterbox: remove the padding, then the resize
    pad_x, pad_y = pad
    boxes_xyxy[:, [0, 2]] = (boxes_xyxy[:, [0, 2]] - pad_x) / scale
    boxes_xyxy[:, [1, 3]] = (boxes_xyxy[:, [1, 3]] - pad_y) / scale
    boxes_xyxy = np.maximum(boxes_xyxy, 0)

    # Multiclass NMS (class-agnostic, score_thr=0.1 as in official YOLOX)
    dets = multiclass_nms(boxes_xyxy, scores, nms_thr=iou_threshold, score_thr=0.1)
    if dets is None:
        return []

    detections = []
    for x1, y1, x2, y2, confidence, cls in dets:
        if confidence < conf_threshold:
            continue
        class_id = int(cls)
        detections.append({
            'bbox': [float(x1), float(y1), float(x2), float(y2)],
            'confidence': float(confidence),
            'class_id': class_id,
            'class_name': CLASS_NAMES[class_id] if class_id < len(CLASS_NAMES) else f'class_{class_id}'
        })

    return detections

# YOLOX color palette (consistent with python_x.py).
# One row per color, three channel values in [0, 1]; vis() scales a row by 255
# and picks it with the class id.
_COLORS = np.array(
    [
        [0.000, 0.447, 0.741],
        [0.850, 0.325, 0.098],
        [0.929, 0.694, 0.125],
        [0.494, 0.184, 0.556],
        [0.466, 0.674, 0.188],
        [0.301, 0.745, 0.933],
        [0.635, 0.078, 0.184],
        [0.300, 0.300, 0.300],
        [0.600, 0.600, 0.600],
        [1.000, 0.000, 0.000],
        [1.000, 0.500, 0.000],
        [0.749, 0.749, 0.000],
        [0.000, 1.000, 0.000],
        [0.000, 0.000, 1.000],
        [0.667, 0.000, 1.000],
        [0.333, 0.333, 0.000],
        [0.333, 0.667, 0.000],
        [0.333, 1.000, 0.000],
        [0.667, 0.333, 0.000],
        [0.667, 0.667, 0.000],
        [0.667, 1.000, 0.000],
        [1.000, 0.333, 0.000],
        [1.000, 0.667, 0.000],
        [1.000, 1.000, 0.000],
        [0.000, 0.333, 0.500],
        [0.000, 0.667, 0.500],
        [0.000, 1.000, 0.500],
        [0.333, 0.000, 0.500],
        [0.333, 0.333, 0.500],
        [0.333, 0.667, 0.500],
        [0.333, 1.000, 0.500],
        [0.667, 0.000, 0.500],
        [0.667, 0.333, 0.500],
        [0.667, 0.667, 0.500],
        [0.667, 1.000, 0.500],
        [1.000, 0.000, 0.500],
        [1.000, 0.333, 0.500],
        [1.000, 0.667, 0.500],
        [1.000, 1.000, 0.500],
        [0.000, 0.333, 1.000],
        [0.000, 0.667, 1.000],
        [0.000, 1.000, 1.000],
        [0.333, 0.000, 1.000],
        [0.333, 0.333, 1.000],
        [0.333, 0.667, 1.000],
        [0.333, 1.000, 1.000],
        [0.667, 0.000, 1.000],
        [0.667, 0.333, 1.000],
        [0.667, 0.667, 1.000],
        [0.667, 1.000, 1.000],
        [1.000, 0.000, 1.000],
        [1.000, 0.333, 1.000],
        [1.000, 0.667, 1.000],
        [0.333, 0.000, 0.000],
        [0.500, 0.000, 0.000],
        [0.667, 0.000, 0.000],
        [0.833, 0.000, 0.000],
        [1.000, 0.000, 0.000],
        [0.000, 0.167, 0.000],
        [0.000, 0.333, 0.000],
        [0.000, 0.500, 0.000],
        [0.000, 0.667, 0.000],
        [0.000, 0.833, 0.000],
        [0.000, 1.000, 0.000],
        [0.000, 0.000, 0.167],
        [0.000, 0.000, 0.333],
        [0.000, 0.000, 0.500],
        [0.000, 0.000, 0.667],
        [0.000, 0.000, 0.833],
        [0.000, 0.000, 1.000],
        [0.000, 0.000, 0.000],
        [0.143, 0.143, 0.143],
        [0.857, 0.857, 0.857],
        [1.000, 1.000, 1.000],
    ],
    dtype=np.float32,
)

def vis(img, detections, conf=0.5, class_names=None):
    """
    YOLOX visualization function (based on python_x.py)

    Draws a box and a "<name>:<confidence>%" label for every detection whose
    confidence is at least ``conf``. The input image is left untouched.

    Args:
        img: Image array whose first two dimensions are (height, width).
        detections: Iterable of dicts with keys 'bbox' ([x1, y1, x2, y2]),
            'confidence', 'class_id' and, optionally, 'class_name'.
        conf: Minimum confidence for a detection to be drawn.
        class_names: Names indexed by class id, used for the label when a
            detection carries no 'class_name'. Defaults to CLASS_NAMES.

    Returns:
        An annotated copy of ``img``.
    """
    if class_names is None:
        class_names = CLASS_NAMES

    result_img = img.copy()

    # Adjust font size based on image size (diagonal length), clamped to [0.6, 1.2]
    img_height, img_width = img.shape[:2]
    font_scale = max(0.6, min(1.2, np.sqrt(img_height * img_height + img_width * img_width) * 0.0015))
    thickness = max(2, int(font_scale * 2.5))
    font = cv2.FONT_HERSHEY_SIMPLEX

    for det in detections:
        confidence = det['confidence']
        if confidence < conf:
            continue

        x1, y1, x2, y2 = [int(coord) for coord in det['bbox']]
        class_id = det['class_id']

        # Prefer the name stored with the detection; fall back to class_names
        label = det.get('class_name')
        if label is None:
            if 0 <= class_id < len(class_names):
                label = class_names[class_id]
            else:
                label = f'class_{class_id}'

        # Wrap around so class ids beyond the palette size still get a color
        palette_rgb = _COLORS[class_id % len(_COLORS)]
        color = (palette_rgb * 255).astype(np.uint8).tolist()
        txt_bk_color = (palette_rgb * 255 * 0.7).astype(np.uint8).tolist()
        # Dark text on bright colors, light text on dark ones
        txt_color = (0, 0, 0) if np.mean(palette_rgb) > 0.5 else (255, 255, 255)

        text = '{}:{:.1f}%'.format(label, confidence * 100)
        txt_size = cv2.getTextSize(text, font, font_scale, thickness)[0]

        cv2.rectangle(result_img, (x1, y1), (x2, y2), color, thickness)
        # Filled background strip behind the label, anchored at the box's top-left corner
        cv2.rectangle(
            result_img,
            (x1, y1 + 1),
            (x1 + txt_size[0] + 1, y1 + int(1.5 * txt_size[1])),
            txt_bk_color,
            -1
        )
        cv2.putText(result_img, text, (x1, y1 + txt_size[1]), font, font_scale, txt_color, thickness=thickness)

    return result_img

def main():
    """
    Command-line entry point: run YOLOX detection on one image or a directory.

    Parses the arguments, initializes AMLNNLite with the given model, runs
    preprocess -> inference -> postprocess on every image found, prints the
    detections and writes "<stem>_result.jpg" into the current directory.
    A failure on one image is reported and does not stop the remaining ones.
    The runtime is always released with uninit() once init() has succeeded.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model-path', default='./yolox_s_int8_A311D2.adla')
    parser.add_argument('--run-cycles', default=1, type=int)
    parser.add_argument('--input-path', default='./', help='Input image path (file or directory)')
    args = parser.parse_args()

    # Initialize AMLNNLite
    amlnn = AMLNNLite()
    amlnn.config(
        model_path=args.model_path,           # Model file path, Support ADLA and quantized TFlite models
        run_cycles=args.run_cycles
    )
    amlnn.init()

    try:
        # Find image files
        if os.path.isfile(args.input_path):
            # Single image file
            image_files = [args.input_path]
        elif os.path.isdir(args.input_path):
            # Directory - collect into a set so a file matched by both the
            # lower- and upper-case pattern (case-insensitive filesystems)
            # is processed only once; sort for a deterministic order.
            found = set()
            for ext in ("*.jpg", "*.jpeg", "*.png", "*.bmp"):
                found.update(glob.glob(os.path.join(args.input_path, ext)))
                found.update(glob.glob(os.path.join(args.input_path, ext.upper())))
            image_files = sorted(found)
        else:
            print(f"Error: Input path '{args.input_path}' does not exist")
            return

        if not image_files:
            print(f"No image files found in {args.input_path}")
            return

        print(f"Found {len(image_files)} image files to process:")
        for img_file in image_files:
            print(f"  - {os.path.basename(img_file)}")
        print()

        # Process each image
        for image_index, image_path in enumerate(image_files, 1):
            print("=" * 60)
            print(f"Processing image {image_index}/{len(image_files)}: {os.path.basename(image_path)}")
            print("=" * 60)

            try:
                # Preprocess input
                input_tensor, original_img, scale, pad = preprocess(image_path, new_shape=(640, 640), data_format='NHWC')

                # Run inference
                outputs = amlnn.inference(
                    inputs=[input_tensor]
                )

                # Postprocess results
                detections = postprocess(outputs, scale, pad, img_size=(640, 640), conf_threshold=0.25, iou_threshold=0.45, p6=False)

                # Print detection results
                if detections:
                    print(f"    Detected {len(detections)} objects:")
                    for det_index, det in enumerate(detections, 1):
                        print(f"      {det_index}. {det['class_name']} ({det['confidence']:.2f})")
                else:
                    print("    No objects detected")

                # Save result image (save to current directory)
                img_name = Path(image_path).stem
                save_path = f"{img_name}_result.jpg"
                result_img = vis(original_img, detections, conf=0.25, class_names=CLASS_NAMES)
                # cv2.imwrite signals failure through its return value, not an exception
                if cv2.imwrite(save_path, result_img):
                    print(f"    Result saved to: {save_path}")
                else:
                    print(f"    Failed to save result to: {save_path}")

            except Exception as e:
                # Best effort: report the failure and continue with the next image
                print(f"Error processing {os.path.basename(image_path)}: {e}")

            print()

        # Optional visualization
        amlnn.visualize()
    finally:
        # Release resources even when an error escapes the processing above
        amlnn.uninit()

if __name__ == "__main__":
    main()