# amlnn-model-playground/examples/clip/py/clip.py

#
# Copyright (C) 2026 Amlogic, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import argparse
import numpy as np
from PIL import Image
from transformers import CLIPTokenizer
from amlnnlite.api import AMLNNLite

# ==================== Utility Functions ====================

def softmax(x: np.ndarray, axis: int = -1) -> np.ndarray:
    """
    Return the softmax of x along the given axis.

    The maximum along the axis is subtracted before exponentiating, so the
    largest exponent evaluated is exp(0); the result is unchanged by this
    shift.

    Args:
        x (np.ndarray): Input scores
        axis (int): Axis along which the outputs sum to 1 (default: -1)

    Returns:
        np.ndarray: Softmax values with the same shape as x
    """
    peak = np.max(x, axis=axis, keepdims=True)
    exp_shifted = np.exp(x - peak)
    total = np.sum(exp_shifted, axis=axis, keepdims=True)
    return exp_shifted / total


def l2_normalize(x: np.ndarray, axis: int = -1, eps: float = 1e-12) -> np.ndarray:
    """
    Scale x so that its L2 norm along the given axis is (approximately) 1.

    Args:
        x (np.ndarray): Input array
        axis (int): Axis along which the norm is taken (default: -1)
        eps (float): Added to the norm so an all-zero slice divides by eps
            instead of by zero (default: 1e-12)

    Returns:
        np.ndarray: Normalized array with the same shape as x
    """
    norms = np.linalg.norm(x, axis=axis, keepdims=True)
    return x / (norms + eps)

# ==================== Vision Preprocessing ====================

def preprocess_image(image_path: str, target_size: int = 224) -> np.ndarray:
    """
    Preprocess image for CLIP model.

    The image is converted to RGB, resized with bicubic resampling so that its
    shorter side equals target_size, center-cropped to a square, scaled to
    [0, 1] and normalized per channel with the CLIP mean/std.

    Args:
        image_path (str): Path to input image
        target_size (int): Target image size (default: 224)

    Returns:
        np.ndarray: Preprocessed float32 image data with shape
            (1, target_size, target_size, 3) in NHWC format

    Raises:
        ValueError: If target_size is not positive
    """
    if target_size <= 0:
        raise ValueError(f"target_size must be positive, got {target_size}")

    # Close the file handle deterministically; convert() returns an
    # independent in-memory image that remains usable afterwards.
    with Image.open(image_path) as source:
        image = source.convert("RGB")
    width, height = image.size

    # Scale the shorter side to exactly target_size. Integer arithmetic is
    # used on purpose: int(side * (target_size / shorter)) can truncate to
    # target_size - 1 through float rounding, which would make the crop box
    # extend past the image and pad the result with black pixels.
    shorter = min(width, height)
    new_width = (width * target_size) // shorter
    new_height = (height * target_size) // shorter
    image_resized = image.resize((new_width, new_height), resample=Image.BICUBIC)

    # Center crop
    left = (new_width - target_size) // 2
    top = (new_height - target_size) // 2
    image_cropped = image_resized.crop((left, top, left + target_size, top + target_size))

    # Convert to numpy array and normalize to [0, 1]
    image_np = np.asarray(image_cropped, dtype=np.float32) / 255.0

    # CLIP normalization (per-channel, broadcast over the last axis)
    mean = np.array([0.48145466, 0.4578275, 0.40821073], dtype=np.float32)
    std = np.array([0.26862954, 0.26130258, 0.27577711], dtype=np.float32)
    image_np = (image_np - mean) / std

    # Add batch dimension: HWC -> NHWC (all operands are float32 already)
    return image_np[np.newaxis, ...]  # [1, target_size, target_size, 3]

# ==================== Text Preprocessing ====================

def preprocess_text(tokenizer: CLIPTokenizer, text: str, max_len: int = 64) -> np.ndarray:
    """
    Tokenize text for the CLIP text model.

    The tokenizer is asked to pad to max_len, truncate longer inputs and
    return NumPy tensors; only the "input_ids" entry is used.

    Args:
        tokenizer: CLIPTokenizer instance
        text (str): Input text string
        max_len (int): Maximum sequence length (default: 64)

    Returns:
        np.ndarray: Token ids as int64, expected shape (1, max_len)
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_len,
        "return_tensors": "np",
    }
    encoded = tokenizer(text, **tokenizer_options)

    # The text model takes int64 ids
    return encoded["input_ids"].astype(np.int64)

# ==================== Model Inference ====================

def compute_image_embedding(vision_amlnn: AMLNNLite, image_path: str) -> np.ndarray:
    """
    Run the vision model on one image and return its normalized embedding.

    Args:
        vision_amlnn: AMLNNLite instance for vision model
        image_path (str): Path to input image

    Returns:
        np.ndarray: L2-normalized image embedding with shape (1, embed_dim)
    """
    pixels = preprocess_image(image_path)  # NHWC, default target size

    raw_outputs = vision_amlnn.inference(
        inputs=[pixels],
        inputs_data_format='NHWC',
        outputs_data_format='NHWC'
    )

    # Only the first output is used; flatten it into a single row
    embedding = raw_outputs[0].astype(np.float32).reshape(1, -1)
    return l2_normalize(embedding, axis=1)

def compute_text_embedding(text_amlnn: AMLNNLite, tokenizer: CLIPTokenizer, text: str, max_len: int = 64) -> np.ndarray:
    """
    Run the text model on one string and return its normalized embedding.

    Args:
        text_amlnn: AMLNNLite instance for text model
        tokenizer: CLIPTokenizer instance
        text (str): Input text string
        max_len (int): Maximum sequence length

    Returns:
        np.ndarray: L2-normalized text embedding with shape (1, embed_dim)
    """
    input_ids = preprocess_text(tokenizer, text, max_len)
    print(f"input_ids: {input_ids}")

    # The model is fed a 4D tensor: insert two singleton axes after the batch
    # axis, i.e. [1, max_len] -> [1, 1, 1, max_len]
    model_input = input_ids[:, np.newaxis, np.newaxis, :]

    raw_outputs = text_amlnn.inference(
        inputs=[model_input],
        inputs_data_format='NHWC',
        outputs_data_format='NHWC'
    )

    # Only the first output is used; flatten it into a single row
    embedding = raw_outputs[0].astype(np.float32).reshape(1, -1)
    return l2_normalize(embedding, axis=1)

def compute_text_embeddings_batch(text_amlnn: AMLNNLite, tokenizer: CLIPTokenizer, texts: list, max_len: int = 64) -> np.ndarray:
    """
    Embed several texts, one model invocation per text, and stack the results.

    Args:
        text_amlnn: AMLNNLite instance for text model
        tokenizer: CLIPTokenizer instance
        texts (list): List of input text strings
        max_len (int): Maximum sequence length

    Returns:
        np.ndarray: L2-normalized text embeddings with shape (num_texts, embed_dim)
    """
    # Each per-text result is [1, embed_dim]; take row 0 to drop the batch axis
    rows = [
        compute_text_embedding(text_amlnn, tokenizer, text, max_len)[0]
        for text in texts
    ]
    return np.stack(rows, axis=0)

# ==================== Similarity Calculation ====================

def compute_similarity(image_embedding: np.ndarray, text_embeddings: np.ndarray, logit_scale: float = 100.0) -> tuple:
    """
    Score every text embedding against a single image embedding.

    Args:
        image_embedding (np.ndarray): Image embedding with shape (1, embed_dim)
        text_embeddings (np.ndarray): Text embeddings with shape (num_texts, embed_dim)
        logit_scale (float): Scale factor for logits

    Returns:
        tuple: (similarities, logits, probabilities), each with shape (num_texts,)
    """
    image_vector = image_embedding[0]  # drop the batch axis

    # Dot product; this is the cosine similarity when both inputs are
    # L2-normalized, as the callers in this file provide them
    similarities = text_embeddings @ image_vector
    scaled_logits = similarities * logit_scale
    probabilities = softmax(scaled_logits, axis=0)

    return similarities, scaled_logits, probabilities

# ==================== Main Function ====================

def _build_arg_parser():
    """Build the command-line parser for the CLIP image-text matching demo."""
    parser = argparse.ArgumentParser(description='CLIP Image-Text Matching Demo using AMLNNLite')
    parser.add_argument('--vision-model', required=True, help='Path to vision model (.adla)')
    parser.add_argument('--text-model', required=True, help='Path to text model (.adla)')
    parser.add_argument('--tokenizer-dir', required=True, help='Path to CLIPTokenizer directory')
    parser.add_argument('--image-path', default=None, help='Path to input image (optional, will prompt if not provided)')
    parser.add_argument('--texts', nargs='+', default=None, help='List of text descriptions to compare')
    parser.add_argument('--max-len', type=int, default=64, help='Maximum token sequence length (default: 64)')
    parser.add_argument('--logit-scale', type=float, default=100.0, help='Logit scale factor (default: 100.0)')
    return parser


def _init_model(model_path):
    """Create, configure and initialize an AMLNNLite instance for model_path."""
    model = AMLNNLite()
    model.config(model_path=model_path, run_cycles=1)
    model.init()
    return model


def _read_texts(args):
    """
    Return the texts to compare against the current image.

    Texts passed via --texts are used once and then cleared so that later
    iterations prompt on stdin. An empty line or 'skip' selects the built-in
    defaults. The result may be empty (e.g. the user typed only commas).
    """
    if args.texts:
        texts = args.texts
        args.texts = None  # Clear for next iteration
        return texts

    print("[Info] Enter text descriptions (comma-separated, or 'skip' to use defaults):")
    text_input = input().strip()

    if text_input.lower() == 'skip' or not text_input:
        # Default texts for demo
        texts = [
            "a red handbag",
            "a blue jacket",
            "a red bus",
        ]
        print(f"[Info] Using default texts: {texts}")
        return texts

    return [t.strip() for t in text_input.split(',') if t.strip()]


def _match_and_report(args, tokenizer, vision_amlnn, text_amlnn, image_path, texts):
    """Embed the image and the texts, then print the texts ranked by probability."""
    # Compute image embedding
    print(f"\n[Info] Processing image: {image_path}")
    image_embedding = compute_image_embedding(vision_amlnn, image_path)
    print(f"[Info] Image embedding shape: {image_embedding.shape}")

    # Compute text embeddings
    print(f"[Info] Processing {len(texts)} text(s)...")
    text_embeddings = compute_text_embeddings_batch(text_amlnn, tokenizer, texts, args.max_len)
    print(f"[Info] Text embeddings shape: {text_embeddings.shape}")

    # Compute similarity
    sims, logits, probs = compute_similarity(image_embedding, text_embeddings, args.logit_scale)

    # Print results
    print("\n" + "=" * 60)
    print("CLIP Image-Text Matching Results")
    print("=" * 60)
    print(f"Image: {image_path}")
    print(f"logit_scale: {args.logit_scale:.6f}")
    print("-" * 60)

    # Sort by probability (descending)
    sorted_indices = np.argsort(probs)[::-1]
    for rank, i in enumerate(sorted_indices):
        print(f"[{rank + 1}] prob={probs[i]:.6f}  sim={float(sims[i]):.6f}  text='{texts[i]}'")

    print("=" * 60 + "\n")


def _interactive_loop(args, tokenizer, vision_amlnn, text_amlnn):
    """
    Repeatedly match an image against texts until the user types 'exit'.

    The first iteration uses --image-path / --texts when given; afterwards
    (or when they were omitted) the values are read from stdin.
    KeyboardInterrupt and EOFError from input() propagate to the caller.
    """
    import traceback

    while True:
        # Get image path
        if args.image_path:
            image_path = args.image_path
            args.image_path = None  # Clear for next iteration
        else:
            print("=" * 60)
            print("[Info] Image Path (or 'exit' to quit):")
            image_path = input().strip()

        # Check for exit
        if image_path.lower() == 'exit':
            print("[Info] Exiting...")
            return

        # Validate image path
        if not image_path:
            print("[Warning] Please enter an image path.")
            continue

        if not os.path.exists(image_path):
            print(f"[Error] Image not found: {image_path}")
            continue

        texts = _read_texts(args)
        if not texts:
            print("[Warning] No texts provided.")
            continue

        try:
            _match_and_report(args, tokenizer, vision_amlnn, text_amlnn, image_path, texts)
        except Exception as e:
            # Top-level boundary of the demo: report the failure and keep the
            # interactive session alive for the next image.
            print(f"[Error] Processing failed: {e}")
            traceback.print_exc()


def main():
    """
    Run the interactive CLIP image-text matching demo.

    Parses the command line, loads the tokenizer and both models, then loops
    over images/texts taken from the arguments or from stdin. Every model
    that was successfully initialized is uninitialized before returning,
    including when a later initialization step fails.

    Returns:
        int: 0 on normal exit (including Ctrl-C or end of input),
            -1 if a model file does not exist
    """
    args = _build_arg_parser().parse_args()

    # Validate model paths
    if not os.path.exists(args.vision_model):
        print(f"[Error] Vision model not found: {args.vision_model}")
        return -1

    if not os.path.exists(args.text_model):
        print(f"[Error] Text model not found: {args.text_model}")
        return -1

    # Load tokenizer
    print(f"[Info] Loading CLIPTokenizer from: {args.tokenizer_dir}")
    tokenizer = CLIPTokenizer.from_pretrained(args.tokenizer_dir)

    # Only models whose init() succeeded are recorded here, so cleanup never
    # calls uninit() on a half-constructed instance and never skips one that
    # was initialized before a later step failed.
    initialized_models = []
    try:
        # Initialize vision model
        print(f"[Info] Initializing vision model: {args.vision_model}")
        vision_amlnn = _init_model(args.vision_model)
        initialized_models.append(vision_amlnn)

        # Initialize text model
        print(f"[Info] Initializing text model: {args.text_model}")
        text_amlnn = _init_model(args.text_model)
        initialized_models.append(text_amlnn)

        print("[Info] Models initialized successfully.\n")

        try:
            _interactive_loop(args, tokenizer, vision_amlnn, text_amlnn)
        except KeyboardInterrupt:
            print("\n\n[Info] Interrupted by user. Exiting...")
        except EOFError:
            # input() raises EOFError when stdin is closed or exhausted
            # (e.g. piped input); treat it as a request to quit.
            print("\n[Info] End of input. Exiting...")

    finally:
        # Cleanup
        for model in initialized_models:
            model.uninit()

    print("[Info] Done.")
    return 0

if __name__ == "__main__":
    # Script entry point: propagate main()'s return value as the exit status
    import sys

    exit_code = main()
    sys.exit(exit_code)