# amlnn-model-playground/examples/clip/py/clip.py

# -*- coding: utf-8 -*-
"""
Copyright (C) 2024–2025 Amlogic, Inc. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

# Inference script for the CLIP model, run through AMLNNLite.

import os
import argparse
import numpy as np
from PIL import Image
from transformers import CLIPTokenizer
from amlnnlite.api import AMLNNLite

# ==================== Utility Functions ====================

def softmax(x: np.ndarray, axis: int = -1) -> np.ndarray:
    """
    Compute a numerically stable softmax of x along the given axis.

    Args:
        x (np.ndarray): Input scores
        axis (int): Axis along which the outputs sum to 1 (default: -1)

    Returns:
        np.ndarray: Array with the same shape as x
    """
    # Shifting by the per-axis maximum does not change the result but keeps
    # exp() from overflowing on large scores.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    denom = np.sum(exps, axis=axis, keepdims=True)
    return exps / denom


def l2_normalize(x: np.ndarray, axis: int = -1, eps: float = 1e-12) -> np.ndarray:
    """
    Scale x to unit L2 norm along the given axis.

    Args:
        x (np.ndarray): Input array
        axis (int): Axis along which the norm is taken (default: -1)
        eps (float): Small constant added to the norm so zero vectors do not divide by zero

    Returns:
        np.ndarray: Array with the same shape as x
    """
    norms = np.linalg.norm(x, axis=axis, keepdims=True)
    return x / (norms + eps)

# ==================== Vision Preprocessing ====================

def preprocess_image(image_path: str, target_size: int = 224) -> np.ndarray:
    """
    Preprocess image for CLIP model.

    The image is converted to RGB, resized with bicubic resampling so that its
    shorter side equals target_size, center-cropped to a square, scaled to
    [0, 1] and normalized with the CLIP per-channel mean and std.

    Args:
        image_path (str): Path to input image
        target_size (int): Target image size (default: 224)

    Returns:
        np.ndarray: Preprocessed float32 image data with shape
            (1, target_size, target_size, 3) in NHWC format

    Raises:
        ValueError: If target_size is not positive
    """
    if target_size <= 0:
        raise ValueError(f"target_size must be positive, got {target_size}")

    # Open inside a context manager so the underlying file is closed as soon as
    # the pixel data has been copied out by convert().
    with Image.open(image_path) as source:
        image = source.convert("RGB")
    width, height = image.size

    # Resize so the shorter side is exactly target_size. Computing it as
    # int(side * (target_size / side)) can round just below target_size and
    # truncate to target_size - 1, which would make the crop below reach past
    # the image edge and pad with black pixels.
    if width <= height:
        new_width = target_size
        new_height = int(target_size * height / width)
    else:
        new_width = int(target_size * width / height)
        new_height = target_size
    image_resized = image.resize((new_width, new_height), resample=Image.BICUBIC)

    # Center crop
    left = (new_width - target_size) // 2
    top = (new_height - target_size) // 2
    crop_box = (left, top, left + target_size, top + target_size)
    image_cropped = image_resized.crop(crop_box)

    # Convert to numpy array and normalize to [0, 1]
    image_np = np.array(image_cropped).astype(np.float32) / 255.0

    # CLIP normalization (per-channel, broadcast over the trailing RGB axis)
    mean = np.array([0.48145466, 0.4578275, 0.40821073], dtype=np.float32)
    std = np.array([0.26862954, 0.26130258, 0.27577711], dtype=np.float32)
    image_np = (image_np - mean) / std

    # Add batch dimension: HWC -> NHWC
    image_np = np.expand_dims(image_np, axis=0)

    return image_np.astype(np.float32)  # [1, target_size, target_size, 3]

# ==================== Text Preprocessing ====================

def preprocess_text(tokenizer: CLIPTokenizer, text: str, max_len: int = 64) -> np.ndarray:
    """
    Tokenize text for the CLIP text model using CLIPTokenizer.

    The text is padded to, and truncated at, max_len tokens.

    Args:
        tokenizer: CLIPTokenizer instance
        text (str): Input text string
        max_len (int): Maximum sequence length (default: 64)

    Returns:
        np.ndarray: Token ids with shape (1, max_len) as int64
    """
    tokenizer_kwargs = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_len,
        "return_tensors": "np",
    }
    encoded = tokenizer(text, **tokenizer_kwargs)

    # The text model input is int64[1, max_len]
    return encoded["input_ids"].astype(np.int64)

# ==================== Model Inference ====================

def compute_image_embedding(vision_amlnn: AMLNNLite, image_path: str) -> np.ndarray:
    """
    Run the vision model on one image and return its normalized embedding.

    Args:
        vision_amlnn: AMLNNLite instance for vision model
        image_path (str): Path to input image

    Returns:
        np.ndarray: L2-normalized image embedding with shape (1, embed_dim)
    """
    pixels = preprocess_image(image_path)  # NHWC, batch of one

    raw_outputs = vision_amlnn.inference(
        inputs=[pixels],
        inputs_data_format='NHWC',
        outputs_data_format='NHWC'
    )

    # Only the first output is used; flatten it to a single row [1, embed_dim]
    embedding = raw_outputs[0].astype(np.float32).reshape(1, -1)
    return l2_normalize(embedding, axis=1)

def compute_text_embedding(text_amlnn: AMLNNLite, tokenizer: CLIPTokenizer, text: str, max_len: int = 64) -> np.ndarray:
    """
    Run the text model on one string and return its normalized embedding.

    Args:
        text_amlnn: AMLNNLite instance for text model
        tokenizer: CLIPTokenizer instance
        text (str): Input text string
        max_len (int): Maximum sequence length

    Returns:
        np.ndarray: L2-normalized text embedding with shape (1, embed_dim)
    """
    token_ids = preprocess_text(tokenizer, text, max_len)  # [1, max_len]
    print(f"input_ids: {token_ids}")

    # AMLNNLite requires 4D input: insert two singleton axes -> (1, 1, 1, max_len)
    token_ids_4d = token_ids[:, np.newaxis, np.newaxis, :]

    raw_outputs = text_amlnn.inference(
        inputs=[token_ids_4d],
        inputs_data_format='NHWC',
        outputs_data_format='NHWC'
    )

    # Only the first output is used; flatten it to a single row [1, embed_dim]
    embedding = raw_outputs[0].astype(np.float32).reshape(1, -1)
    return l2_normalize(embedding, axis=1)

def compute_text_embeddings_batch(text_amlnn: AMLNNLite, tokenizer: CLIPTokenizer, texts: list, max_len: int = 64) -> np.ndarray:
    """
    Compute one embedding per text by running the text model on each string in turn.

    Args:
        text_amlnn: AMLNNLite instance for text model
        tokenizer: CLIPTokenizer instance
        texts (list): List of input text strings
        max_len (int): Maximum sequence length

    Returns:
        np.ndarray: L2-normalized text embeddings with shape (num_texts, embed_dim)
    """
    # Each call returns [1, embed_dim]; index 0 drops the batch axis before stacking.
    rows = [
        compute_text_embedding(text_amlnn, tokenizer, text, max_len)[0]
        for text in texts
    ]
    return np.stack(rows, axis=0)  # [num_texts, embed_dim]

# ==================== Similarity Calculation ====================

def compute_similarity(image_embedding: np.ndarray, text_embeddings: np.ndarray, logit_scale: float = 100.0) -> tuple:
    """
    Score every text embedding against a single image embedding.

    Args:
        image_embedding (np.ndarray): Image embedding with shape (1, embed_dim)
        text_embeddings (np.ndarray): Text embeddings with shape (num_texts, embed_dim)
        logit_scale (float): Scale factor for logits

    Returns:
        tuple: (similarities, logits, probabilities), each with shape (num_texts,)
    """
    image_vector = image_embedding[0]  # [embed_dim]

    # The dot product equals cosine similarity when both inputs are L2-normalized.
    sims = np.matmul(text_embeddings, image_vector)  # [num_texts]
    logits = logit_scale * sims  # [num_texts]
    probs = softmax(logits, axis=0)  # [num_texts]

    return sims, logits, probs

# ==================== Main Function ====================

def main():
    """
    Run the interactive CLIP image-text matching demo.

    Parses the command-line arguments, loads the tokenizer and both AMLNNLite
    models, then loops: obtain an image path and candidate texts (from the
    command line on the first pass, from stdin afterwards), compute the
    embeddings and print the texts ranked by softmax probability. The loop
    ends when the user types 'exit', presses Ctrl+C, or stdin reaches
    end-of-input.

    Returns:
        int: 0 on normal exit, -1 if a model file does not exist
    """
    import traceback

    parser = argparse.ArgumentParser(description='CLIP Image-Text Matching Demo using AMLNNLite')
    parser.add_argument('--vision-model', required=True, help='Path to vision model (.adla)')
    parser.add_argument('--text-model', required=True, help='Path to text model (.adla)')
    parser.add_argument('--tokenizer-dir', required=True, help='Path to CLIPTokenizer directory')
    parser.add_argument('--image-path', default=None, help='Path to input image (optional, will prompt if not provided)')
    parser.add_argument('--texts', nargs='+', default=None, help='List of text descriptions to compare')
    parser.add_argument('--max-len', type=int, default=64, help='Maximum token sequence length (default: 64)')
    parser.add_argument('--logit-scale', type=float, default=100.0, help='Logit scale factor (default: 100.0)')

    args = parser.parse_args()

    # Validate model paths
    if not os.path.exists(args.vision_model):
        print(f"[Error] Vision model not found: {args.vision_model}")
        return -1

    if not os.path.exists(args.text_model):
        print(f"[Error] Text model not found: {args.text_model}")
        return -1

    # Load tokenizer
    print(f"[Info] Loading CLIPTokenizer from: {args.tokenizer_dir}")
    tokenizer = CLIPTokenizer.from_pretrained(args.tokenizer_dir)

    # Initialize vision model
    print(f"[Info] Initializing vision model: {args.vision_model}")
    vision_amlnn = AMLNNLite()
    vision_amlnn.config(model_path=args.vision_model, run_cycles=1)
    vision_amlnn.init()

    # Initialize text model
    print(f"[Info] Initializing text model: {args.text_model}")
    text_amlnn = AMLNNLite()
    try:
        text_amlnn.config(model_path=args.text_model, run_cycles=1)
        text_amlnn.init()
    except Exception:
        # The vision model is already initialized at this point; release it
        # before propagating the failure so it is not left behind.
        vision_amlnn.uninit()
        raise

    print("[Info] Models initialized successfully.\n")

    try:
        # Interactive loop
        while True:
            # Get image path: the command-line value is used once, then stdin
            if args.image_path:
                image_path = args.image_path
                args.image_path = None  # Clear for next iteration
            else:
                print("=" * 60)
                print("[Info] Image Path (or 'exit' to quit):")
                image_path = input().strip()

            # Check for exit
            if image_path.lower() == 'exit':
                print("[Info] Exiting...")
                break

            # Validate image path
            if not image_path:
                print("[Warning] Please enter an image path.")
                continue

            if not os.path.exists(image_path):
                print(f"[Error] Image not found: {image_path}")
                continue

            # Get texts to compare: the command-line value is used once, then stdin
            if args.texts:
                texts = args.texts
                args.texts = None  # Clear for next iteration
            else:
                print("[Info] Enter text descriptions (comma-separated, or 'skip' to use defaults):")
                text_input = input().strip()

                if text_input.lower() == 'skip' or not text_input:
                    # Default texts for demo
                    texts = [
                        "a red handbag",
                        "a blue jacket",
                        "a red bus",
                    ]
                    print(f"[Info] Using default texts: {texts}")
                else:
                    texts = [t.strip() for t in text_input.split(',') if t.strip()]

            # Input such as "," leaves nothing after stripping empty entries
            if not texts:
                print("[Warning] No texts provided.")
                continue

            try:
                # Compute image embedding
                print(f"\n[Info] Processing image: {image_path}")
                image_embedding = compute_image_embedding(vision_amlnn, image_path)
                print(f"[Info] Image embedding shape: {image_embedding.shape}")

                # Compute text embeddings
                print(f"[Info] Processing {len(texts)} text(s)...")
                text_embeddings = compute_text_embeddings_batch(text_amlnn, tokenizer, texts, args.max_len)
                print(f"[Info] Text embeddings shape: {text_embeddings.shape}")

                # Compute similarity
                sims, logits, probs = compute_similarity(image_embedding, text_embeddings, args.logit_scale)

                # Print results
                print("\n" + "=" * 60)
                print("CLIP Image-Text Matching Results")
                print("=" * 60)
                print(f"Image: {image_path}")
                print(f"logit_scale: {args.logit_scale:.6f}")
                print("-" * 60)

                # Sort by probability (descending)
                sorted_indices = np.argsort(probs)[::-1]
                for rank, i in enumerate(sorted_indices):
                    print(f"[{rank + 1}] prob={probs[i]:.6f}  sim={float(sims[i]):.6f}  text='{texts[i]}'")

                print("=" * 60 + "\n")

            except Exception as e:
                # A failure on one image/text set should not end the session;
                # report it and go back to prompting.
                print(f"[Error] Processing failed: {e}")
                traceback.print_exc()

    except KeyboardInterrupt:
        print("\n\n[Info] Interrupted by user. Exiting...")

    except EOFError:
        # input() raises EOFError when stdin is closed or exhausted (piped
        # input, Ctrl+D). Treat it as a normal way to end the session rather
        # than crashing with a traceback.
        print("\n[Info] End of input reached. Exiting...")

    finally:
        # Cleanup
        vision_amlnn.uninit()
        text_amlnn.uninit()

    print("[Info] Done.")
    return 0

if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())