diff --git a/dependency/nnsdk/include/nn_sdk.h b/dependency/nnsdk/include/nn_sdk.h
new file mode 100755
index 0000000..0f350a3
--- /dev/null
+++ b/dependency/nnsdk/include/nn_sdk.h
@@ -0,0 +1,1103 @@
+/*
+ * Copyright (C) 2024–2025 Amlogic, Inc. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _NN_SDK_H
+#define _NN_SDK_H
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*=============================================================
+                    Macro Definitions
+==============================================================*/
+#define AML_NN_SDK_VERSION          "2.8.5"
+
+#define MAX_NAME_LENGTH             64
+#define INPUT_MAX_NUM               64
+#define OUTPUT_MAX_NUM              64
+#define INPUT_CHANNEL               3
+#define MAX_DETECT_NUM              230
+#define SUPPORT_NET_NUM             60
+#define ADDRESS_MAX_NUM             64
+#define MAX_TENSOR_NUM_DIMS         6
+#define INPUT_META_NUM              1
+#define DELEGATE_MAX_INPUT_NUM      128
+#define DELEGATE_MAX_OUTPUT_NUM     128
+
+/*=====================================================
+the common type for sdk api
+======================================================*/
+typedef enum {
+    AML_IN_PICTURE            = 0,
+    AML_IN_VIDEO              = 1,
+    AML_IN_CAMERA             = 2
+} amlnn_input_mode_t;
+
+typedef enum _amlnn_model_ {
+    CAFFE                     = 0,
+    TENSORFLOW,
+    TENSORFLOWLITE,
+    DARKNET,
+    ONNX,
+    KERAS,
+    PYTORCH,
+    ADLA_LOADABLE,
+    MEDEL_MAX
+} amlnn_model_type;
+
+/*=================================================================================
+load adla type
+==================================================================================*/
+typedef enum _amlnn_nbg_type_ {
+    NN_NBG_FILE               = 0,
+    NN_NBG_MEMORY             = 1,
+    NN_RUNTIME_FILE           = 2,
+    NN_RUNTIME_MEMORY         = 3,
+    NN_ADLA_FILE              = 4,
+    NN_ADLA_MEMORY            = 5
+} amlnn_nbg_type;
+
+typedef enum _amlnn_input_ {
+    RGB24_RAW_DATA = 0,
+    TENSOR_RAW_DATA,
+    QTENSOR_RAW_DATA,
+    BINARY_RAW_DATA,
+    INPUT_DMA_DATA,
+    INPUT_DMA_SECURE_DATA,
+    NV12_RAW_DATA,
+    RAW_DATA_MAX
+} amlnn_input_type;
+
+typedef enum _nn_buffer_format_e
+{
+    /*! \brief A float type of buffer data */
+    NN_BUFFER_FORMAT_FP32       = 0,
+    /*! \brief A half float type of buffer data */
+    NN_BUFFER_FORMAT_FP16       = 1,
+    /*! \brief A 8 bit unsigned integer type of buffer data */
+    NN_BUFFER_FORMAT_UINT8      = 2,
+    /*! \brief A 8 bit signed integer type of buffer data */
+    NN_BUFFER_FORMAT_INT8       = 3,
+    /*! \brief A 16 bit unsigned integer type of buffer data */
+    NN_BUFFER_FORMAT_UINT16     = 4,
+    /*! \brief A 16 signed integer type of buffer data */
+    NN_BUFFER_FORMAT_INT16      = 5,
+    /*! \brief A 32 signed integer type of buffer data */
+    NN_BUFFER_FORMAT_INT32      = 8,
+    /*! \brief A 64 signed integer type of buffer data */
+    NN_BUFFER_FORMAT_INT64      = 10,
+    /*! \brief A bool type of buffer data */
+    NN_BUFFER_FORMAT_BOOL       = 11,
+} nn_buffer_format_e;
+
+typedef enum _nn_buffer_quantize_format_e
+{
+    /*! \brief Not quantized format */
+    NN_BUFFER_QUANTIZE_NONE                    = 0,
+    /*! \brief The data is quantized with dynamic fixed point */
+    NN_BUFFER_QUANTIZE_DYNAMIC_FIXED_POINT     = 1,
+    /*! \brief The data is quantized with TF asymmetric format */
+    NN_BUFFER_QUANTIZE_TF_ASYMM                = 2
+} nn_buffer_quantize_format_e;
+
+typedef enum {
+    IMAGE_CLASSIFY      = 0,
+    OBJECT_DETECT       = 1,    /// object_detection
+    FACE_DETECTION      = 2,    ///
+    FACE_LANDMARK_5     = 3,    /// landmark 5 points
+    FACE_LANDMARK_68    = 4,    ///< Face Landmark (68 KeyPoints)
+    FACE_RECOGNIZE      = 5,    ///< Face Recognition
+    FACE_COMPARISON     = 6,    ///< Face compare,whether same person
+    FACE_AGE            = 7,    ///< Face age
+    FACE_GENDER         = 8,    ///< Face gender
+    FACE_EMOTION        = 9,    ///< Face emotion
+    BODY_POSE           = 10,   ///< Body Pose
+    FINGER_POSE         = 11,   ///< Finger Landmark(21 KeyPoint)
+    HEAD_DETECTION      = 12,   ///< Head Detection
+    CARPLATE_DETECTION  = 13,   ///< Car Plate Detection
+    CARPLATE_RECOG      = 14,   ///< Car Plate Recognition
+    TEXT_DETECTION      = 15,   ///< text region detect
+    IMAGE_SR            = 16,   ///< image SR
+    IMAGE_SEGMENTATION  = 17,   ///< image segment, based on icnet
+    PERSON_DETECT       = 18,   ///< person detect
+    YOLOFACE_V2         = 19,
+    YOLO_V2             = 20,
+    YOLO_V3             = 21,
+    FACE_NET            = 22,
+    FACE_RECOG_U        = 23,
+    FACE_RFB_DETECTION  = 24,
+    AML_PERSON_DETECT   = 25,
+    AML_BODY_POSE       = 26,
+    mobilenetv2         = 30,
+    centerface          = 31,
+    ALPD                = 32,
+    ALPR                = 33,
+    face_blur_v2        = 34,
+    face_recog_small    = 35,
+    face_reid_v2        = 36,
+    landmark2d_106      = 37,
+    new_car             = 38,
+    old_person          = 39,
+    CUSTOM_NETWORK      = 99,   ///< custom network, for user development
+    MODEL_MAX           = 100    ///< max model number
+} aml_module_t;
+
+typedef enum {
+    AML_OUTDATA_FLOAT32      = 0,
+    AML_OUTDATA_RAW          = 1,
+    AML_OUTDATA_DMA          = 2,
+    AML_OUTDATA_DMA_SECURE   = 3
+} aml_output_format_t;
+
+typedef enum {
+    AML_NO_PERF                  = 0,
+    AML_PERF_INFERENCE           = 1,
+    AML_PERF_OUTPUT_GET          = 2,
+    AML_PERF_OUTPUT_SET          = 3,
+    AML_PERF_RESET_TRANSFORMER   = 4,
+    AML_PERF_BREAK_TRANSFORMER   = 5
+} aml_perf_mode_t;
+
+typedef enum {
+    AML_PROFILE_NONE         = 0,
+    AML_PROFILE_PERFORMANCE  = 1,
+    AML_PROFILE_BANDWIDTH    = 2,
+    AML_PROFILE_MEMORY       = 3,
+    AML_PERLAYER_RUNTIME     = 4,
+    AML_PERLAYER_BANDWIDTH   = 5,
+    AML_PERLAYER_OUTPUT      = 6,
+    AML_PERLAYER_INPUT       = 7
+} aml_profile_type_t;
+
+typedef enum {
+    AML_PERFORMANCE_MODE         = 1,
+    AML_POWER_SAVE_MODE          = 2,
+    AML_MINIMUM_POWER_MODE       = 3
+} aml_policy_type_t;
+
+typedef enum {
+    AML_IO_VIRTUAL          = 0,
+    AML_IO_PHYS             = 1,
+    AML_IO_VIRTUAL_SECURE   = 2,
+    AML_IO_PHYS_SECURE      = 3
+} aml_io_format_t;
+
+typedef enum {
+    AML_OUTPUT_ORDER_DEFAULT      = 0,    //output format:  nhwc for adla, nchw for galcore
+    AML_OUTPUT_ORDER_NHWC         = 1,    //output format: nhwc
+    AML_OUTPUT_ORDER_NCHW         = 2,    //output format: nchw
+} aml_output_order_t;
+
+typedef struct __aml_kvcache_dynamic_val_t
+{
+    int32_t current_mask;
+} aml_kvcache_dynamic_val_t;
+
+typedef struct __kvCacheDynamicInfo_t
+{
+    bool                          update_kvcache_info_flag;
+    aml_kvcache_dynamic_val_t     kvcache_dynamic_val;
+} kvCacheDynamicInfo_t;
+
+typedef struct __aml_transformer_model_info
+{
+    uint64_t eos_token;
+    int32_t max_sequence_length;
+} aml_transformer_model_info;
+
+typedef  struct __aml_invoke_info_t
+{
+    int typeSize;
+    int invoke_type; // 1: invoke_no_wait, 2: waitwithid
+    int32_t timeout; //ms
+    int64_t invoke_id;
+    kvCacheDynamicInfo_t kvcache_dynamic_info;
+} aml_invoke_info_t;
+
+typedef  struct __amlnn_module_out_data_t
+{
+    int typeSize;
+    aml_module_t mdType;
+    aml_perf_mode_t perfMode;
+    aml_output_format_t format;
+    aml_output_order_t order;
+    aml_invoke_info_t invoke;
+} aml_output_config_t;
+
+typedef struct _nn_buffer_create_params_t
+{
+    unsigned int     num_of_dims; /*!< \brief The number of dimensions specified in *sizes*/
+    unsigned int      sizes[4];       /*!< \brief The pointer to an array of dimension */
+    nn_buffer_format_e          data_format; /*!< \brief Data format for the tensor, see <tt>\ref nn_buffer_format_e </tt> */
+    nn_buffer_quantize_format_e quant_format; /*!< \brief Quantized format see <tt>\ref nn_buffer_quantize_format_e </tt>. */
+    union {
+        struct {
+            unsigned char fixed_point_pos; /*!< \brief Specifies the fixed point position when the input element type is int16, if 0 calculations are performed in integer math */
+        } dfp;
+
+        struct {
+            float        scale;       /*!< \brief Scale value for the quantized value */
+            unsigned int zeroPoint;  /*!< \brief  A 32 bit integer, in range [0, 255] */
+        } affine;
+     }
+     quant_data; /*<! \brief The union of quantization information */
+} nn_buffer_params_t;
+
+typedef enum {
+    AML_INPUT_DEFAULT      = 0,    //channel format: caffe 2 1 0 ,others 0 1 2
+    AML_INPUT_MODEL_NHWC   = 1,    //channel format: 0 1 2
+    AML_INPUT_MODEL_NCHW   = 2,    //channel format: 2 1 0
+} aml_input_format_t;
+
+typedef struct out_buf
+{
+    unsigned int size;
+    char  name[MAX_NAME_LENGTH];     //output tensor name
+    unsigned char *buf;
+    nn_buffer_params_t *param;
+    aml_output_format_t out_format;
+    int output_valid_length;
+} outBuf_t;
+
+typedef struct __nnout
+{
+    int typeSize;
+    unsigned int num;   /*===========output tensor number============*/
+    outBuf_t out[OUTPUT_MAX_NUM];
+} nn_output;
+
+typedef enum {
+    AML_INPUT_U8,
+    AML_INPUT_I8,
+    AML_INPUT_FP32,
+} aml_input_data_type_t;
+
+typedef struct
+{
+    int valid;
+    int int16_type;
+    int preprocess_debug;
+    float mean[INPUT_CHANNEL];
+    float scale;
+    aml_input_format_t input_format;
+    aml_input_data_type_t input_data_type;
+}input_info;
+
+typedef struct __nn_input
+{
+    int  typeSize;
+    int  input_index;
+    int  size;
+    unsigned char* input;
+    amlnn_input_type input_type;
+    input_info info;
+    int subgraph_index;
+    int input_valid_length;
+}nn_input;
+
+typedef struct __assign_address
+{
+    unsigned int inAddr_size;
+    unsigned int outAddr_size;
+    aml_io_format_t io_type;
+    unsigned char* inAddr[ADDRESS_MAX_NUM];
+    unsigned char* outAddr[ADDRESS_MAX_NUM];
+}assign_user_address_t;
+
+typedef enum __enCoreId
+{
+    AML_ID_0       = 0,
+    AML_ID_1       = 1,
+    AML_ID_2       = 2,
+    AML_ID_3       = 3,
+    AML_ID_4       = 4,
+    AML_ID_BUTT
+} aml_encore_id;
+
+typedef enum __aml_runtime_mode_t
+{
+    AML_RUNTIME_NNRT       = 0,
+    AML_RUNTIME_OVX        = 1,
+    AML_RUNTIME_TIM_VX     = 2,
+    RUN_TIME_MODE_BUTT
+} aml_runtime_mode_t;
+
+typedef enum {
+    AML_WITH_CACHE            = 0,
+    AML_WITHOUT_CACHE         = 1,
+} aml_cache_type_t;
+
+typedef enum {
+    AML_Add = 0,
+    AML_AveragePool2d = 1,
+    AML_Concatenation = 2,
+    AML_Conv2d = 3,
+    AML_DepthwiseConv2d = 4,
+    AML_DepthToSpace = 5,
+    AML_Dequantize = 6,
+    AML_EmbeddingLookup = 7,
+    AML_Floor = 8,
+    AML_FullyConnected = 9,
+    AML_HashtableLookup = 10,
+    AML_L2Normalization = 11,
+    AML_L2Pool2d = 12,
+    AML_LocalResponseNormalization = 13,
+    AML_Logistic = 14,
+    AML_LshProjection = 15,
+    AML_Lstm = 16,
+    AML_MaxPool2d = 17,
+    AML_Mul = 18,
+    AML_Relu = 19,
+    AML_ReluN1To1 = 20,
+    AML_Relu6 = 21,
+    AML_Reshape = 22,
+    AML_ResizeBilinear = 23,
+    AML_Rnn = 24,
+    AML_Softmax = 25,
+    AML_SpaceToDepth = 26,
+    AML_Svdf = 27,
+    AML_Tanh = 28,
+    AML_ConcatEmbeddings = 29,
+    AML_SkipGram = 30,
+    AML_Call = 31,
+    AML_Custom = 32,
+    AML_EmbeddingLookupSparse = 33,
+    AML_Pad = 34,
+    AML_UnidirectionalSequenceRnn = 35,
+    AML_Gather = 36,
+    AML_BatchToSpaceNd = 37,
+    AML_SpaceToBatchNd = 38,
+    AML_Transpose = 39,
+    AML_Mean = 40,
+    AML_Sub = 41,
+    AML_Div = 42,
+    AML_Squeeze = 43,
+    AML_UnidirectionalSequenceLstm = 44,
+    AML_StridedSlice = 45,
+    AML_BidirectionalSequenceRnn = 46,
+    AML_Exp = 47,
+    AML_TopkV2 = 48,
+    AML_Split = 49,
+    AML_LogSoftmax = 50,
+    AML_Delegate = 51,
+    AML_BidirectionalSequenceLstm = 52,
+    AML_Cast = 53,
+    AML_Prelu = 54,
+    AML_Maximum = 55,
+    AML_ArgMax = 56,
+    AML_Minimum = 57,
+    AML_Less = 58,
+    AML_Neg = 59,
+    AML_PadV2 = 60,
+    AML_Greater = 61,
+    AML_GreaterEqual = 62,
+    AML_LessEqual = 63,
+    AML_Select = 64,
+    AML_Slice = 65,
+    AML_Sin = 66,
+    AML_TransposeConv = 67,
+    AML_SparseToDense = 68,
+    AML_Tile = 69,
+    AML_ExpandDims = 70,
+    AML_Equal = 71,
+    AML_NotEqual = 72,
+    AML_Log = 73,
+    AML_Sum = 74,
+    AML_Sqrt = 75,
+    AML_Rsqrt = 76,
+    AML_Shape = 77,
+    AML_Pow = 78,
+    AML_ArgMin = 79,
+    AML_FakeQuant = 80,
+    AML_ReduceProd = 81,
+    AML_ReduceMax = 82,
+    AML_Pack = 83,
+    AML_LogicalOr = 84,
+    AML_OneHot = 85,
+    AML_LogicalAnd = 86,
+    AML_LogicalNot = 87,
+    AML_Unpack = 88,
+    AML_ReduceMin = 89,
+    AML_FloorDiv = 90,
+    AML_ReduceAny = 91,
+    AML_Square = 92,
+    AML_ZerosLike = 93,
+    AML_Fill = 94,
+    AML_FloorMod = 95,
+    AML_Range = 96,
+    AML_ResizeNearestNeighbor = 97,
+    AML_LeakyRelu = 98,
+    AML_SquaredDifference = 99,
+    AML_MirrorPad = 100,
+    AML_Abs = 101,
+    AML_SplitV = 102,
+    AML_Unique = 103,
+    AML_Ceil = 104,
+    AML_ReverseV2 = 105,
+    AML_AddN = 106,
+    AML_GatherNd = 107,
+    AML_Cos = 108,
+    AML_Where = 109,
+    AML_Rank = 110,
+    AML_Elu = 111,
+    AML_ReverseSequence = 112,
+    AML_MatrixDiag = 113,
+    AML_Quantize = 114,
+    AML_MatrixSetDiag = 115,
+    AML_Round = 116,
+    AML_HardSwish = 117,
+    AML_If = 118,
+    AML_While = 119,
+    AML_NonMaxSuppressionV4 = 120,
+    AML_NonMaxSuppressionV5 = 121,
+    AML_ScatterNd = 122,
+    AML_SelectV2 = 123,
+    AML_Densify = 124,
+    AML_SegmentSum = 125,
+    AML_BatchMatmul = 126,
+    AML_PlaceholderForGreaterOpCodes = 127,
+    AML_Cumsum = 128,
+    AML_CallOnce = 129,
+    AML_BroadcastTo = 130,
+    AML_Rfft2d = 131,
+    AML_Conv3d = 132,
+    AML_Imag = 133,
+    AML_Real = 134,
+    AML_ComplexAbs = 135,
+    AML_Hashtable = 136,
+    AML_HashtableFind = 137,
+    AML_HashtableImport = 138,
+    AML_HashtableSize = 139,
+    AML_ReduceAll = 140,
+    AML_Conv3dTranspose = 141,
+    AML_VarHandle = 142,
+    AML_ReadVariable = 143,
+    AML_AssignVariable = 144,
+    AML_BroadcastArgs = 145,
+    AML_RandomStandardNormal = 146,
+    AML_Bucketize = 147,
+    AML_RandomUniform = 148,
+    AML_Multinomial = 149,
+    AML_Gelu = 150,
+    AML_DynamicUpdateSlice = 151,
+    AML_Relu0To1 = 152,
+    AML_UnsortedSegmentProd = 153,
+    AML_UnsortedSegmentMax = 154,
+    AML_UnsortedSegmentSum = 155,
+    AML_Atan2 = 156,
+    AML_UnsortedSegmentMin = 157,
+    AML_Sign = 158,
+    AML_Bitcast = 159,
+    AML_BitwiseXor = 160,
+    AML_RightShift = 161,
+    AML_DetectionPostProcess = 256,
+    AML_Erf = 260,
+    AML_Hardware = 511,
+    AML_Unknown = 2147483647,
+    AML_MIN = AML_Add,
+    AML_MAX = AML_Unknown
+} aml_operator_t;
+
+typedef struct __aml_openmp_opt_t {
+    aml_operator_t     operator_type;
+    bool               enable_openmp;
+    bool               involve_all_ops;  // enable openmp for all operators.
+    int8_t             openmp_num;
+} aml_openmp_opt_t;
+
+typedef struct __aml_neon_opt_t {
+    aml_operator_t     operator_type;
+    bool               enable_neon;
+    bool               involve_all_ops;
+} aml_neon_opt_t;
+
+typedef struct __softOpInfo_t {
+    bool                  set_openmp_opt_flag;
+    int                   openmp_opt_num;
+    aml_openmp_opt_t*     openmp_opt;
+    bool                  set_neon_opt_flag;
+    int                   neon_opt_num;
+    aml_neon_opt_t*       neon_opt;
+} softOpInfo_t;
+
+typedef enum __aml_kvcache_type_t
+{
+    KVCompute_Prune = 1,
+    KVTransformer_Accel = 2
+} aml_kvcache_type_t;
+
+typedef struct __aml_kvcache_opt_t {
+    int32_t            operator_index;
+    bool               enable_kvcache; // enable skipping invalid vector computations outside the range of ADLA_KVCACHE_DYNAMIC_VAL.current_mask.
+    bool               zero_out_invalid_value; // set output tensors partial values to zero outside the range of ADLA_KVCACHE_DYNAMIC_VAL.current_mask,
+                                               // When the software operator(enable skip) is followed by a operator(disable skip), it must be set to true to ensure that the result is correct.
+    int8_t             active_axis;
+    int32_t            active_axis_size;
+} aml_kvcache_opt_t;
+
+typedef struct __kvCacheInfo_t {
+    bool                  set_kvcache_opt_flag;
+    int                   kvcache_opt_num;
+    aml_kvcache_opt_t*    kvcache_opt;
+} kvCacheInfo_t;
+
+typedef struct __aml_forward_ctrl_t
+{
+    aml_encore_id              enCoreId;       /* device target which running the seg*/
+    int64_t                    invoke_id;
+    int32_t                    timeout_ms;
+    softOpInfo_t               softop_info;
+    aml_kvcache_type_t         kvcache_type;
+    kvCacheInfo_t              kvcache_info;
+} aml_forward_ctrl_t;
+
+typedef enum __aml_model_type_t
+{
+    AML_MODEL_TYPE_ADLA_LOADABLE = 0,
+    AML_MODEL_TYPE_TENSORFLOW,
+    AML_MODEL_TYPE_TENSORFLOW_LITE
+} aml_model_type_t;
+
+typedef enum __aml_model_in_out_type_t
+{
+    AML_MODEL_IN_OUT_TYPE_MEMORY = 0,
+    AML_MODEL_IN_OUT_TYPE_FILE
+} aml_model_in_out_type_t;
+
+typedef enum __aml_compiler_optimization_mode_t
+{
+    AML_COMPILER_OPTIMIZATION_MODE_FAST = 0,
+    AML_COMPILER_OPTIMIZATION_MODE_PRECISE
+} aml_compiler_optimization_mode_t;
+
+typedef struct __aml_compiler_input_t
+{
+    aml_model_type_t model_type;
+    aml_model_in_out_type_t input_type;
+
+    const void* model_data;
+    size_t model_size;
+
+    const char* model_path;
+} aml_compiler_input_t;
+
+typedef struct __aml_compiler_config_t
+{
+    const char* hw_version;
+    int32_t axi_sram_size;
+    int32_t batch_multiplier;
+    aml_compiler_optimization_mode_t optimization_mode;
+} aml_compiler_config_t;
+
+typedef struct __aml_compiler_allocator_t
+{
+    uint8_t* (*allocate)(size_t size);
+    void (*deallocate)(uint8_t *p, size_t size);
+} aml_compiler_allocator_t;
+
+typedef struct __aml_compiler_metadata_t
+{
+    int32_t count;
+    const char** names;
+    const char** data;
+} aml_compiler_metadata_t;
+
+typedef struct __aml_compiler_debug_options_t
+{
+    bool dump_model_info;
+    bool disable_fusion;
+    bool disable_compression;
+    bool disable_memory_optimization;
+} aml_compiler_debug_options_t;
+
+typedef struct __aml_compiler_transformer_config
+{
+    uint64_t eos_token;
+    int32_t max_sequence_length;
+    int32_t freq_base;
+    float factor;
+    int32_t low_freq_factor;
+    int32_t high_freq_factor;
+    int32_t max_position_embeddings;
+} aml_compiler_transformer_config;
+
+typedef struct __aml_compiler_args_t
+{
+    // int32_t batch_multiplier;
+    int32_t compiler_only;
+    int32_t set_compiler_args_flag;
+    aml_compiler_input_t input;
+    aml_compiler_config_t config;
+    const aml_compiler_allocator_t* allocator;
+    const aml_compiler_metadata_t* metadata;
+    const aml_compiler_debug_options_t* debug_options;
+    const aml_compiler_transformer_config* transformer_config;
+    const char* custom_option_path;
+} aml_compiler_args_t;
+
+typedef enum __aml_hw_flag_t
+{
+    AML_HW_NPU     = 1,
+    AML_HW_GPU     = 2,
+    AML_HW_CPU     = 3
+} aml_hw_flag_t;
+
+typedef struct __aml_nn_config
+{
+    bool secure_config;
+    int typeSize;
+    int length;
+    const char *path;
+    const char *pdata;
+    amlnn_model_type modelType;
+    amlnn_nbg_type nbgType;
+    assign_user_address_t inOut;
+    aml_forward_ctrl_t forward_ctrl;
+    aml_compiler_args_t compiler_args;
+    aml_hw_flag_t hw_flag;
+    int timeout_ms;
+    const char *on_path;
+} aml_config;
+
+typedef struct {
+    unsigned int dim_count;       /*dim count*/
+    unsigned int sizes_of_dim[MAX_TENSOR_NUM_DIMS]; /*dim value,just support 4-d dim*/
+    unsigned int data_format;     /*see as nn_buffer_format_e*/
+    unsigned int data_type;       /*not use*/
+    unsigned int quantization_format; /*see as nn_buffer_quantize_format_e*/
+    int fixed_point_pos;          /*for int8/int16 QUANTIZE_DYNAMIC_FIXED_POINT*/
+    float TF_scale;               /*as tf define,scale*/
+    int TF_zeropoint;             /*as tf define,zeropoint*/
+    char name[MAX_NAME_LENGTH];    /*not use,will used in future*/
+} info_t;
+
+typedef struct {
+    unsigned int valid;
+    unsigned int num;
+    info_t *info;
+} tensor_info;
+
+typedef enum {
+    AML_INPUT_TENSOR      = 0,
+    AML_OUTPUT_TENSOR     = 1,
+} aml_flush_type_t;
+
+typedef enum {
+    AML_HARDWARE_INVALID             = -1,
+    AML_HARDWARE_VSI_UNIFY           = 0,
+    AML_HARDWARE_VIPLITE             = 1,
+    AML_HARDWARE_ADLA                = 2,
+    AML_HARDWARE_DELEGATE            = 3,
+    AML_HARDWARE_MAX                 = 100,
+} aml_hw_type_t;
+
+typedef struct {
+    unsigned int core_num;
+    unsigned int num;
+    float flops;  // TOps
+} aml_npu_hw_info_t;
+
+typedef struct {
+    char hw_version[10];
+    int i8_mac_cnt;
+    int max_clk;
+    int Gops;
+    bool kernel_vlc;
+    bool feature_vlc;
+    int cur_clk;
+    unsigned long sram_base;
+    int sram_size;
+} aml_hw_info_t;
+
+typedef struct {
+    unsigned int         cpuinfo;
+    char*                sdk_version;
+    char*                ddk_version;
+    char*                hw_version;
+    aml_hw_type_t        hw_type;
+    aml_npu_hw_info_t    npu_hw_info;
+    unsigned int         platform_type;
+    aml_hw_info_t        hw_info;
+} aml_platform_info_t;
+
+typedef struct {
+    int64_t memory_size;    // memory size used by the loaded model
+    int32_t axi_sram_size;  // axi sram size required by the loaded model
+    int32_t num_layers;
+    int64_t macc_count;
+    double  sum_macc; //Gops
+} aml_ctx_info_t;
+
+typedef struct {
+    int32_t working_frequency;      // in MHz
+    int32_t axi_working_frequency;  // in MHz
+    int64_t memory_size;            // memory size used by the device driver
+    int32_t axi_sram_size;          // axi sram size allocated for the device
+    float tops;                     // TOPS
+} aml_dev_memory_info_t;
+
+typedef struct {
+    aml_ctx_info_t               ctx_info;
+    aml_dev_memory_info_t        dev_info;
+} aml_context_info_t;
+
+typedef struct aml_profiling_ext_data
+{
+    uint64_t axi_freq_cur;   // adla axi clock frequency currently
+    uint64_t core_freq_cur;  // adla core clock frequency currently
+    uint64_t mem_alloced_base;
+    uint64_t mem_alloced_umd;
+    int64_t  mem_pool_size;  //-1:the limit base on the system
+    uint64_t mem_pool_used;
+    int32_t us_elapsed_in_fixup_cmq;
+    int32_t us_elapsed_in_hw_op;
+    int32_t us_elapsed_in_sw_op;
+    int32_t invoke_has_error;
+} aml_profiling_ext_data_t;
+
+typedef struct aml_profiling_data
+{
+    uint64_t inference_time_us;
+    uint64_t memory_usage_bytes;
+    uint64_t dram_read_bytes;
+    uint64_t dram_write_bytes;
+    uint64_t sram_read_bytes;
+    uint64_t sram_write_bytes;
+    aml_profiling_ext_data_t ext;
+} aml_profiling_data_t;
+
+typedef struct aml_work_frequency
+{
+    float                         *curr_freq;
+    float                         *min_freq;
+} aml_work_frequency_t;
+
+typedef struct aml_clk_core
+{
+    int32_t                        set_clk_core_switch;
+    int32_t                        clk;
+} aml_clk_core_t;
+
+typedef struct aml_dpm_period
+{
+    int32_t                        set_dpm_period_switch;
+    int32_t                        dpm;
+} aml_dpm_period_t;
+
+typedef struct {
+    unsigned int                   suspend_time;
+    float                          freq;
+    aml_profile_type_t             profile_type;
+    aml_policy_type_t              policy_type;
+    aml_profiling_data_t           profiling_data;
+    aml_work_frequency_t           frequency_data;
+    aml_context_info_t             context_info;
+    aml_clk_core_t                 clk_core;
+    aml_dpm_period_t               dpm_period;
+} aml_profile_config_t;
+
+typedef enum {
+    AML_WITH_MMU              = 0,
+    AML_WITHOUT_MMU           = 1,
+} aml_mmu_type_t;
+
+typedef enum {
+    AML_VIRTUAL_ADDR              = 0,
+    AML_PHYS_ADDR                 = 1,
+    AML_VIRTUAL_SECURE_ADDR       = 2,
+    AML_PHYS_SECURE_ADDR          = 3
+} aml_memory_type_t;
+
+typedef  struct __aml_memory_data_t
+{
+    uint32_t                typeSize;
+    void                   *memory;
+    void                   *viraddr;
+    uint64_t                phyaddr;
+} aml_memory_data_t;
+
+typedef enum {
+    AML_MEM_DIRECTION_READ_WRITE  = 0,
+    AML_MEM_DIRECTION_READ_ONLY   = 1,
+    AML_MEM_DIRECTION_WRITE_ONLY  = 2,
+} aml_mem_direction_t;
+
+typedef  struct __aml_memory_config_t
+{
+    uint32_t               typeSize;
+    int64_t                mem_size;
+    uint32_t               index;
+    aml_mmu_type_t         mmu_type;
+    aml_cache_type_t       cache_type;
+    aml_memory_type_t      memory_type;
+    aml_mem_direction_t    direction;
+} aml_memory_config_t;
+
+
+/*=============================================================
+                     NNSDK main api
+==============================================================*/
+/**
+ * @brief Create aml network module
+ * @param config    Module configuration
+ * @return void*    Context handle
+ */
+void* aml_module_create(aml_config* config);
+
+/**
+ * @brief Set network input
+ * @param context   Context handle
+ * @param pInput    Input structure
+ * @return int      Status code
+ */
+int aml_module_input_set(void* context, nn_input *pInput);
+
+/**
+ * @brief Run and get output
+ * @param context   Context handle
+ * @param outconfig Output configuration
+ * @return void*    Output handle or data
+ */
+void* aml_module_output_get(void* context, aml_output_config_t outconfig);
+
+/**
+ * @brief Run and get output simply, for custom network
+ * @param context   Context handle
+ * @return void*    Output handle
+ */
+void* aml_module_output_get_simple(void* context);
+
+/**
+ * @brief Destroy network environment, free the alloced buffer
+ * @param context   Context handle
+ * @return int      Status code
+ */
+int aml_module_destroy(void* context);
+
+/*============================================================
+             NNSDK debug api
+==============================================================*/
+/**
+ * @brief Set profile type
+ * @param type      Profile type
+ * @param savepath  Path to save profile data
+ * @return int      Status code
+ */
+int aml_util_setProfile(aml_profile_type_t type, const char *savepath);
+
+/**
+ * @brief Set power policy
+ * @param type      Power policy type
+ * @return int      Status code
+ */
+int aml_util_setPowerPolicy(aml_policy_type_t type);
+
+/**
+ * @brief Get hardware info
+ * @param customID    Pointer to custom ID
+ * @param powerStatus Pointer to power status
+ * @param version     Pointer to version
+ * @return int        Status code
+ */
+int aml_util_getHardwareStatus(int* customID, int *powerStatus, int* version);
+
+/**
+ * @brief Set auto suspend timeout
+ * @param timeout   Timeout in milliseconds
+ * @return int      Status code
+ */
+int aml_util_setAutoSuspend(int timeout);
+
+/**
+ * @brief Enable profiling
+ * @param context       Context handle
+ * @param profile_data  Pointer to profile configuration
+ * @return int          Status code
+ */
+int aml_util_enableProfile(void *context, aml_profile_config_t* profile_data);
+
+/**
+ * @brief Get profile info
+ * @param context       Context handle
+ * @param profile_data  Pointer to profile configuration
+ * @return int          Status code
+ */
+int aml_util_getProfileInfo(void *context, aml_profile_config_t* profile_data);
+
+/**
+ * @brief Disable profiling
+ * @param context       Context handle
+ * @param profile_data  Pointer to profile configuration
+ * @return int          Status code
+ */
+int aml_util_disableProfile(void *context, aml_profile_config_t* profile_data);
+
+/**
+ * @brief Read chip info
+ * @param platform_info Pointer to platform info structure
+ * @return int          Status code
+ */
+int aml_read_chip_info(aml_platform_info_t* platform_info);
+
+/*============================================================
+             NNSDK dma api
+==============================================================*/
+/**
+ * @brief Malloc 4k align buffer for dma IO
+ * @param context    Context handle
+ * @param mem_size   Size of memory
+ * @param mem_config Memory configuration
+ * @return unsigned char* Pointer to allocated buffer
+ */
+unsigned char *aml_util_mallocAlignedBuffer(void* context, int mem_size, aml_memory_config_t* mem_config);
+
+/**
+ * @brief Free buffer alloced by aml_util_mallocAlignedBuffer
+ * @param context    Context handle
+ * @param addr       Address to free
+ */
+void aml_util_freeAlignedBuffer(void* context, unsigned char *addr);
+
+/**
+ * @brief Swap input buffer. The inputId (for multi-number input) is ordered as aml_util_getInputTensorInfo array
+ * @param context    Context handle
+ * @param newBuffer  Pointer to new buffer
+ * @param inputId    Index of input
+ * @return int       Status code
+ */
+int aml_util_swapInputBuffer(void *context, void *newBuffer, unsigned int inputId);
+
+/**
+ * @brief Swap output buffer
+ * @param context    Context handle
+ * @param newBuffer  Pointer to new buffer
+ * @param outputId   Index of output
+ * @return int       Status code
+ */
+int aml_util_swapOutputBuffer(void *context, void *newBuffer, unsigned int outputId);
+
+/**
+ * @brief Switch input buffer
+ * @param context    Context handle
+ * @param newBuffer  Pointer to new buffer
+ * @param inputId    Index of input
+ * @return int       Status code
+ */
+int aml_util_switchInputBuffer(void *context, void *newBuffer, unsigned int inputId);
+
+/**
+ * @brief Switch output buffer
+ * @param context    Context handle
+ * @param newBuffer  Pointer to new buffer
+ * @param outputId   Index of output
+ * @return int       Status code
+ */
+int aml_util_switchOutputBuffer(void *context, void *newBuffer, unsigned int outputId);
+
+/**
+ * @brief Flush tensor cache memory
+ * @param context    Context handle
+ * @param type       Flush type
+ * @return int       Status code
+ */
+int aml_util_flushTensorHandle(void *context, aml_flush_type_t type);
+
+/**
+ * @brief Get model input tensor information list
+ * @param adladata   Pointer to adla data
+ * @return tensor_info* Pointer to tensor info structure
+ */
+tensor_info* aml_util_getInputTensorInfo(const char* adladata);
+
+/**
+ * @brief Get model output tensor information list
+ * @param adladata   Pointer to adla data
+ * @return tensor_info* Pointer to tensor info structure
+ */
+tensor_info* aml_util_getOutputTensorInfo(const char* adladata);
+
+/**
+ * @brief Malloc buffer
+ * @param context    Context handle
+ * @param mem_config Memory configuration
+ * @param mem_data   Memory data structure to fill
+ * @return int       Status code
+ */
+int aml_util_mallocBuffer(void* context, aml_memory_config_t* mem_config, aml_memory_data_t* mem_data);
+
+/**
+ * @brief Free buffer
+ * @param context    Context handle
+ * @param mem_config Memory configuration
+ * @param mem_data   Memory data structure
+ * @return int       Status code
+ */
+int aml_util_freeBuffer(void* context, aml_memory_config_t* mem_config, aml_memory_data_t* mem_data);
+
+/**
+ * @brief Flush buffer
+ * @param context    Context handle
+ * @param mem_config Memory configuration
+ * @param mem_data   Memory data structure
+ * @return int       Status code
+ */
+int aml_util_flushBuffer(void* context, aml_memory_config_t* mem_config, aml_memory_data_t* mem_data);
+
+/**
+ * @brief Swap external input buffer
+ * @param context    Context handle
+ * @param mem_config Memory configuration
+ * @param mem_data   Memory data structure
+ * @return int       Status code
+ */
+int aml_util_swapExternalInputBuffer(void *context, aml_memory_config_t* mem_config, aml_memory_data_t* mem_data);
+
+/**
+ * @brief Swap external output buffer
+ * @param context    Context handle
+ * @param mem_config Memory configuration
+ * @param mem_data   Memory data structure
+ * @return int       Status code
+ */
+int aml_util_swapExternalOutputBuffer(void *context, aml_memory_config_t* mem_config, aml_memory_data_t* mem_data);
+
+/**
+ * @brief Get tensor info
+ * @param context    Context handle
+ * @param model_data Model data
+ * @param in_tInfo   Pointer to input tensor info pointer
+ * @param out_tInfo  Pointer to output tensor info pointer
+ * @return int       Status code
+ */
+int aml_util_getTensorInfo(void *context, const char* model_data, tensor_info** in_tInfo, tensor_info** out_tInfo);
+
+/**
+ * @brief Free the tensor_info memory
+ * @param tinfo      Pointer to tensor_info
+ * @return int       Status code
+ */
+int aml_util_freeTensorInfo(tensor_info* tinfo);
+
+#ifdef __cplusplus
+} //extern "C"
+#endif
+
+#endif // _NN_SDK_H
\ No newline at end of file
diff --git a/dependency/nnsdk/lib/android/arm64-v8a/libjpeg.so b/dependency/nnsdk/lib/android/arm64-v8a/libjpeg.so
new file mode 100755
index 0000000..0d80399
Binary files /dev/null and b/dependency/nnsdk/lib/android/arm64-v8a/libjpeg.so differ
diff --git a/dependency/nnsdk/lib/android/arm64-v8a/libjpeg_t.so b/dependency/nnsdk/lib/android/arm64-v8a/libjpeg_t.so
new file mode 100755
index 0000000..0d80399
Binary files /dev/null and b/dependency/nnsdk/lib/android/arm64-v8a/libjpeg_t.so differ
diff --git a/dependency/nnsdk/lib/android/arm64-v8a/libnnsdk.so b/dependency/nnsdk/lib/android/arm64-v8a/libnnsdk.so
new file mode 100755
index 0000000..20ff9c9
Binary files /dev/null and b/dependency/nnsdk/lib/android/arm64-v8a/libnnsdk.so differ
diff --git a/dependency/nnsdk/lib/android/armeabi-v7a/libjpeg.so b/dependency/nnsdk/lib/android/armeabi-v7a/libjpeg.so
new file mode 100755
index 0000000..f1e4a0f
Binary files /dev/null and b/dependency/nnsdk/lib/android/armeabi-v7a/libjpeg.so differ
diff --git a/dependency/nnsdk/lib/android/armeabi-v7a/libjpeg_t.so b/dependency/nnsdk/lib/android/armeabi-v7a/libjpeg_t.so
new file mode 100755
index 0000000..f1e4a0f
Binary files /dev/null and b/dependency/nnsdk/lib/android/armeabi-v7a/libjpeg_t.so differ
diff --git a/dependency/nnsdk/lib/android/armeabi-v7a/libnnsdk.so b/dependency/nnsdk/lib/android/armeabi-v7a/libnnsdk.so
new file mode 100755
index 0000000..0eda7e7
Binary files /dev/null and b/dependency/nnsdk/lib/android/armeabi-v7a/libnnsdk.so differ
diff --git a/dependency/nnsdk/lib/linux/libnnsdk.so b/dependency/nnsdk/lib/linux/libnnsdk.so
new file mode 100755
index 0000000..ea271e7
Binary files /dev/null and b/dependency/nnsdk/lib/linux/libnnsdk.so differ
diff --git a/examples/LLMs/README.md b/examples/LLMs/README.md
index e625c95..c3bf6b0 100644
--- a/examples/LLMs/README.md
+++ b/examples/LLMs/README.md
@@ -92,6 +92,18 @@ To compile the CPP project using Android NDK, follow these steps:
 
 ### Python
 
+**System Requirements**:
+- OS: Ubuntu 22.04
+- Python: 3.10
+
+**Verify NPU Driver Version**:
+Execute the following commands in the serial console to check the NPU driver version:
+```bash
+dmesg | grep adla
+strings /usr/lib/libadla.so | grep LIBADLA
+```
+The driver version must be 1.7.x or higher.
+
 1. **Install Dependencies**:
    Ensure the`amlllm`Python package is installed:
    ```bash