# amlnn-model-playground/examples/gesture/py/gesture.py

# -*- coding: utf-8 -*-
"""
Copyright (C) 2024–2025 Amlogic, Inc. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import os
import cv2
import glob
import argparse
import numpy as np
from pathlib import Path
from amlnnlite.api import AMLNNLite


# Gesture labels. A detection's class id is its position in this list
# (looked up as NAMES[int(cid)] when printing and drawing).
NAMES = [
    'ok',
    'stop',
    'palm',
    'like',
    'dislike',
    'no_gesture',
    'call',
    'fist',
    'four',
    'mute',
    'one',
    'peace',
    'peace_inverted',
    'rock',
    'stop_inverted',
    'three',
    'three2',
    'two_up',
    'two_up_inverted',
]

# Side length, in pixels, of the square image fed to the model.
INPUT_SIZE = 640

# Per-scale constants, listed coarsest grid first. infer_bgr sorts the model
# outputs by ascending cell count so they line up with these lists.
# STRIDES: pixels covered by one grid cell at each scale.
STRIDES = [32.0, 16.0, 8.0]
# GRIDS: cells per side at each scale (INPUT_SIZE / stride -> 20, 40, 80).
GRIDS = [INPUT_SIZE // int(s) for s in STRIDES]
# ANCHOR_GRIDS: three (w, h) anchor sizes per scale, shaped (1, 3, 1, 1, 2) so
# they broadcast over the grid axes when decode_one_output scales the
# width/height terms.
ANCHOR_GRIDS = [
    np.asarray(flat_wh, dtype=np.float32).reshape(1, 3, 1, 1, 2)
    for flat_wh in (
        (116, 90, 156, 198, 373, 326),
        (30, 61, 62, 45, 59, 119),
        (10, 13, 16, 30, 33, 23),
    )
]


def preprocess_bgr(bgr: np.ndarray):
    """Turn a BGR image into the batched float32 tensor fed to the model.

    The image is converted to RGB, resized to INPUT_SIZE x INPUT_SIZE
    (a plain resize: the aspect ratio is not preserved, which is why
    scale_boxes_to_original later uses separate x and y factors), scaled
    to the [0, 1] range and given a leading batch axis.

    Args:
        bgr: Image as returned by cv2.imread.
            # assumes an H x W x 3 uint8 array -- TODO confirm for other inputs

    Returns:
        A tuple ``(tensor, orig_w, orig_h)`` where ``tensor`` has layout
        NHWC with a batch size of 1, and ``orig_w`` / ``orig_h`` are the
        width and height of the input image.
    """
    h0, w0 = bgr.shape[:2]

    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    resized = cv2.resize(rgb, (INPUT_SIZE, INPUT_SIZE))
    scaled = resized.astype(np.float32) / 255.0

    # The resized image is already HWC, so adding the batch axis yields NHWC
    # directly; no transpose to NCHW and back is needed.
    nhwc = scaled[None, ...]

    return nhwc, w0, h0


def xywh2xyxy(boxes: np.ndarray) -> np.ndarray:
    """Convert centre-format boxes to corner-format boxes.

    Args:
        boxes: 2-D array whose first four columns are
            (centre x, centre y, width, height).

    Returns:
        Array of shape (N, 4) holding (x1, y1, x2, y2) for each row.
    """
    centres = boxes[:, 0:2]
    half_extent = boxes[:, 2:4] / 2.0
    # Top-left corner is centre - half size, bottom-right is centre + half size.
    return np.concatenate([centres - half_extent, centres + half_extent], axis=1)


def box_iou_one(box: np.ndarray, boxes: np.ndarray) -> np.ndarray:
    """Compute the IoU between one box and every row of ``boxes``.

    Args:
        box: A single box as (x1, y1, x2, y2).
        boxes: 2-D array whose first four columns are (x1, y1, x2, y2).

    Returns:
        1-D array with one IoU value per row of ``boxes``.
    """
    def _rect_area(x1, y1, x2, y2):
        # Negative extents (inverted or non-overlapping rectangles) count as 0.
        return np.maximum(0.0, x2 - x1) * np.maximum(0.0, y2 - y1)

    # The intersection rectangle is bounded by the innermost edges.
    inter = _rect_area(
        np.maximum(box[0], boxes[:, 0]),
        np.maximum(box[1], boxes[:, 1]),
        np.minimum(box[2], boxes[:, 2]),
        np.minimum(box[3], boxes[:, 3]),
    )

    area_single = _rect_area(box[0], box[1], box[2], box[3])
    area_each = _rect_area(boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3])

    # The small epsilon keeps the division defined when both boxes are empty.
    return inter / (area_single + area_each - inter + 1e-6)


def nms(boxes: np.ndarray, scores: np.ndarray, iou_thres: float = 0.45):
    """Greedy non-maximum suppression.

    Repeatedly keeps the highest-scoring remaining box and drops every other
    remaining box whose IoU with it exceeds ``iou_thres``.

    Args:
        boxes: Array of (x1, y1, x2, y2) rows.
        scores: 1-D array with one score per row of ``boxes``.
        iou_thres: Boxes with IoU above this value are suppressed.

    Returns:
        List of indices into ``boxes``, in descending score order.
    """
    remaining = np.argsort(-scores)
    kept = []

    while remaining.size > 0:
        best, rest = remaining[0], remaining[1:]
        kept.append(best)
        if rest.size == 0:
            # Last candidate: nothing left to compare against.
            break
        overlap = box_iou_one(boxes[best], boxes[rest])
        remaining = rest[overlap <= iou_thres]

    return kept


def decode_one_output(y: np.ndarray, g: int, stride: float, anchor_grid: np.ndarray) -> np.ndarray:
    """Decode one detection head into absolute box predictions.

    Each prediction has 24 channels: 2 for the centre, 2 for the size,
    1 objectness value and 19 per-class values. Only the centre and size
    channels are transformed; the remaining channels are passed through.

    Args:
        y: Raw head output.
            # assumes a 4-D array with g*g*3*24 values whose last two axes,
            # once swapped, flatten to (anchor, channel) -- TODO confirm
            # against the model export
            # NOTE(review): values are used as-is (no sigmoid here), so the
            # model presumably emits activated outputs -- confirm
        g: Number of grid cells per side for this head.
        stride: Pixels per grid cell for this head.
        anchor_grid: Anchor sizes broadcastable to (1, 3, g, g, 2).

    Returns:
        Array of shape (1, 3 * g * g, 24) with rows laid out as
        (x, y, w, h, objectness, class values...), ordered anchor-major,
        then grid row, then grid column.
    """
    swapped = np.transpose(y, (0, 1, 3, 2))
    # Split the flat cell axis into (row, col), then bring the anchor axis
    # forward: (1, g, g, 3, 24) -> (1, 3, g, g, 24).
    pred = swapped.reshape(1, g, g, 3, 24).transpose(0, 3, 1, 2, 4)

    # Per-cell (x, y) offsets: x is the column index, y is the row index.
    cell = np.arange(g, dtype=np.float32)
    grid = np.empty((1, 1, g, g, 2), dtype=np.float32)
    grid[..., 0] = cell
    grid[..., 1] = cell[:, None]

    centre = (pred[..., 0:2] * 2.0 - 0.5 + grid) * stride
    size = (pred[..., 2:4] * 2.0) ** 2 * anchor_grid

    # Channels 4 onward (objectness followed by class values) are unchanged.
    merged = np.concatenate([centre, size, pred[..., 4:]], axis=-1)
    return merged.reshape(1, 3 * g * g, 24)


def decode_outputs(output_tensors):
    """Decode every detection head and merge the results.

    The i-th tensor is decoded with GRIDS[i], STRIDES[i] and ANCHOR_GRIDS[i],
    so the tensors must already be ordered to match those lists.

    Args:
        output_tensors: Sequence of raw head outputs.

    Returns:
        2-D array with one row per prediction across all heads (the batch
        axis of the concatenated result is dropped).
    """
    per_head = [
        decode_one_output(
            y=tensor,
            g=GRIDS[level],
            stride=STRIDES[level],
            anchor_grid=ANCHOR_GRIDS[level],
        )
        for level, tensor in enumerate(output_tensors)
    ]

    # Stack along the prediction axis, then strip the size-1 batch axis.
    return np.concatenate(per_head, axis=1)[0]


def postprocess(pred: np.ndarray, conf_thres: float = 0.25, nms_thres: float = 0.45):
    """Filter decoded predictions by confidence and apply per-class NMS.

    The confidence of a prediction is its objectness multiplied by its
    highest class value; the class of that highest value is its class id.

    Args:
        pred: 2-D array with rows (x, y, w, h, objectness, class values...).
        conf_thres: Predictions must score strictly above this to be kept.
        nms_thres: IoU threshold passed to ``nms`` for each class.

    Returns:
        ``(boxes, scores, class_ids)``. When nothing survives, three empty
        lists. Otherwise float32 (x1, y1, x2, y2) boxes, float32 scores and
        int32 class ids, all sorted by descending score.
    """
    objectness = pred[:, 4]
    class_values = pred[:, 5:]

    best_class = np.argmax(class_values, axis=1)
    confidence = objectness * np.max(class_values, axis=1)

    passing = confidence > conf_thres
    if not passing.any():
        return [], [], []

    cand_xyxy = xywh2xyxy(pred[:, 0:4][passing])
    cand_conf = confidence[passing]
    cand_class = best_class[passing]

    # Run NMS independently for each class and remember which candidates
    # (as indices into the cand_* arrays) survive.
    survivors = []
    for cid in np.unique(cand_class):
        members = np.flatnonzero(cand_class == cid)
        kept = nms(cand_xyxy[members], cand_conf[members], iou_thres=nms_thres)
        survivors.extend(members[k] for k in kept)

    if not survivors:
        return [], [], []

    survivors = np.asarray(survivors)
    final_boxes = cand_xyxy[survivors].astype(np.float32)
    final_scores = cand_conf[survivors].astype(np.float32)
    final_class_ids = cand_class[survivors].astype(np.int32)

    ranking = np.argsort(-final_scores)
    return final_boxes[ranking], final_scores[ranking], final_class_ids[ranking]


def scale_boxes_to_original(boxes_xyxy: np.ndarray, orig_w: int, orig_h: int):
    """Map boxes from model-input coordinates back to the original image.

    Args:
        boxes_xyxy: (x1, y1, x2, y2) boxes in INPUT_SIZE x INPUT_SIZE space.
        orig_w: Width of the original image in pixels.
        orig_h: Height of the original image in pixels.

    Returns:
        The input object unchanged when it is empty; otherwise a scaled copy
        with x clipped to [0, orig_w - 1] and y clipped to [0, orig_h - 1].
    """
    if len(boxes_xyxy) == 0:
        return boxes_xyxy

    x_cols, y_cols = [0, 2], [1, 3]
    ratio_x = orig_w / float(INPUT_SIZE)
    ratio_y = orig_h / float(INPUT_SIZE)

    scaled = boxes_xyxy.copy()
    # Independent x and y factors, because the input was resized without
    # preserving its aspect ratio.
    scaled[:, x_cols] *= ratio_x
    scaled[:, y_cols] *= ratio_y

    # Keep every coordinate on a valid pixel of the original image.
    scaled[:, x_cols] = np.clip(scaled[:, x_cols], 0, orig_w - 1)
    scaled[:, y_cols] = np.clip(scaled[:, y_cols], 0, orig_h - 1)

    return scaled


def draw_detections(bgr: np.ndarray, boxes, scores, class_ids):
    """Draw labelled green boxes for each detection on a copy of the image.

    Font scale and line thicknesses grow with the image's shorter side so
    the annotations stay readable on large images.

    Args:
        bgr: Image to annotate; it is not modified.
        boxes: Iterable of array-like (x1, y1, x2, y2) boxes supporting
            ``.astype``.
        scores: Iterable of detection scores, parallel to ``boxes``.
        class_ids: Iterable of indices into NAMES, parallel to ``boxes``.

    Returns:
        The annotated copy of ``bgr``.
    """
    canvas = bgr.copy()
    short_side = min(canvas.shape[:2])

    font_scale = max(0.8, short_side / 600.0)
    font_thickness = max(2, int(short_side / 300))
    box_thickness = max(2, int(short_side / 250))
    green = (0, 255, 0)

    for box, score, cid in zip(boxes, scores, class_ids):
        left, top, right, bottom = box.astype(int)
        caption = f'{NAMES[int(cid)]} {float(score):.2f}'

        cv2.rectangle(canvas, (left, top), (right, bottom), green, box_thickness)

        # Place the caption just above the box, but never above y = 30 so it
        # is not cut off at the top of the image.
        caption_y = max(30, top - 10)
        cv2.putText(
            canvas,
            caption,
            (left, caption_y),
            cv2.FONT_HERSHEY_SIMPLEX,
            font_scale,
            green,
            font_thickness,
            cv2.LINE_AA,
        )

    return canvas


def infer_bgr(amlnn, bgr, conf_thresh=0.25, nms_thresh=0.45):
    """Run the full detection pipeline on one BGR image.

    Steps: preprocess, run inference, order the heads by grid size, decode,
    filter + NMS, and rescale the boxes to the original image.

    Args:
        amlnn: Initialised inference object exposing ``inference(...)``.
        bgr: Input image in BGR channel order.
        conf_thresh: Confidence threshold forwarded to ``postprocess``.
        nms_thresh: IoU threshold forwarded to ``postprocess``.

    Returns:
        ``(boxes, scores, class_ids)`` as plain Python lists: boxes are
        tuples of four ints, scores are floats and class ids are ints.

    Raises:
        ValueError: If any model output is not 4-dimensional.
    """
    tensor, orig_w, orig_h = preprocess_bgr(bgr)

    raw_outputs = amlnn.inference(tensor, inputs_data_format='NHWC')
    heads = [np.asarray(out) for out in raw_outputs]

    for head in heads:
        if head.ndim != 4:
            raise ValueError(f"Unexpected output shape: {head.shape}")

    # Ascending size of axis 1 (400, 1600, 6400) matches the order of the
    # per-scale constants used by decode_outputs.
    heads.sort(key=lambda head: int(head.shape[1]))

    pred = decode_outputs(heads)
    boxes, scores, class_ids = postprocess(pred, conf_thres=conf_thresh, nms_thres=nms_thresh)
    boxes = scale_boxes_to_original(boxes, orig_w, orig_h)

    # Convert NumPy values to built-in types for the caller.
    return (
        [tuple(int(v) for v in box) for box in boxes],
        [float(s) for s in scores],
        [int(c) for c in class_ids],
    )


def main():
    """Command-line entry point: detect gestures in every image of a folder.

    Parses the command-line options, initialises the AMLNNLite runtime, runs
    detection on each ``.jpg`` / ``.jpeg`` / ``.png`` file (extension matched
    case-insensitively) found directly inside ``--image-dir``, prints the
    detections and writes an annotated copy of each image into the
    ``gesture_result`` directory under the current working directory.

    The runtime is always released with ``uninit()``, including when an
    error interrupts processing.
    """
    parser = argparse.ArgumentParser(description="Gesture AMLNNLite Demo")
    parser.add_argument('--board-work-path', type=str, default='/data/local/tmp')
    parser.add_argument('--model-path', required=True, help='Path to .adla model')
    parser.add_argument('--image-dir', required=True, help='Directory of test images')
    parser.add_argument('--run-cycles', type=int, default=1, help='Inference cycles')
    parser.add_argument('--loglevel', type=str, default='WARNING',
                        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'])
    parser.add_argument('--conf-thres', type=float, default=0.25)
    parser.add_argument('--nms-thres', type=float, default=0.3)
    parser.add_argument('--top1-only', action='store_true', help='Only keep the highest score detection')
    args = parser.parse_args()

    amlnn = AMLNNLite()
    amlnn.config(
        board_work_path=args.board_work_path,
        model_path=args.model_path,
        run_cycles=args.run_cycles,
        loglevel=args.loglevel
    )
    amlnn.init()

    # Everything after init() runs under try/finally so the runtime is
    # released even if reading, inference or saving raises.
    try:
        # Match by explicit extension rather than a character-class glob: the
        # pattern "*.[jp][pn][g]" also matched ".jng"/".ppg" and missed
        # ".jpeg" and upper-case extensions.
        image_exts = {'.jpg', '.jpeg', '.png'}
        image_files = sorted(
            path for path in glob.glob(os.path.join(args.image_dir, "*"))
            if Path(path).suffix.lower() in image_exts
        )
        if not image_files:
            print(f"No images found in {args.image_dir}")
            return

        res_dir = "gesture_result"
        os.makedirs(res_dir, exist_ok=True)

        for idx, img_path in enumerate(image_files, start=1):
            print("=" * 60)
            print(f"Processing image {idx}/{len(image_files)}: {Path(img_path).name}")
            print("=" * 60)

            img = cv2.imread(img_path)
            if img is None:
                print(f"Failed to read: {img_path}")
                continue

            boxes, scores, class_ids = infer_bgr(
                amlnn, img,
                conf_thresh=args.conf_thres,
                nms_thresh=args.nms_thres
            )

            if args.top1_only and boxes:
                # Reduce the result to the single best-scoring detection.
                max_idx = int(np.argmax(np.array(scores)))
                boxes = [boxes[max_idx]]
                scores = [scores[max_idx]]
                class_ids = [class_ids[max_idx]]

            if not boxes:
                print("    No objects detected")
                vis = img.copy()
            else:
                print(f"    Detected {len(boxes)} objects:")
                for i, (box, score, cid) in enumerate(zip(boxes, scores, class_ids), 1):
                    print(f"      {i}. class={NAMES[int(cid)]}")
                    print(f"         score={float(score):.3f}")
                    print(f"         box={list(map(int, box))}")
                vis = draw_detections(img, np.array(boxes), np.array(scores), np.array(class_ids))

            save_path = os.path.join(res_dir, Path(img_path).name)
            # cv2.imwrite reports failure through its return value, so only
            # claim success when it actually wrote the file.
            if cv2.imwrite(save_path, vis):
                print(f"    Result saved to: {save_path}")
            else:
                print(f"    Failed to save: {save_path}")

        if args.loglevel == 'INFO':
            print("\nPerformance analysis visualization starting...")

        # NOTE(review): visualize() runs for every log level; only the banner
        # above is limited to INFO -- confirm this is intended.
        amlnn.visualize()
    finally:
        amlnn.uninit()


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()