diff --git a/examples/LLMs/README.md b/examples/LLMs/README.md
new file mode 100644
index 0000000..71a0317
--- /dev/null
+++ b/examples/LLMs/README.md
@@ -0,0 +1,136 @@
+# LLM Examples
+
+## Resource Requirements
+
+| Model | CPU | NPU | GPU |
+| :--- | :--- | :--- | :--- |
+| Qwen(0.5B) | Minimum cores: 4<br>DDR: 4G (2G reserved for NN) | At least 3.2T | NO |
+| Qwen(1.8B) | Minimum cores: 4<br>DDR: 8G (6G~6.5G reserved for NN) | At least 3.2T | NO |
+| Gemma(2B) | Minimum cores: 4<br>DDR: 8G (5.5G~6G reserved for NN) | At least 3.2T | NO |
+
+
+## Performance
+
+ADLA2: A311D2_3.2T / S905X5_4T
+
+| LLM Model | SOC | Dtype | Seqlen | Max_Context | New_Tokens | TTFT(ms) | Tokens/s | memory(G) |
+| :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- |
+| DeepSeek-R1 | A311D2 | w8a8 | 64 | 320 | 256 | 927.79 | 4.95 | 1.99 |
+| DeepSeek-R1 | S905X5 | w8a8 | 64 | 320 | 256 | 514.86 | 4.47 | 1.73 |
+| Gemma-2B | A311D2 | w8a8 | 64 | 320 | 256 | 846.66 | 2.64 | 3.93 |
+| Gemma-2B | S905X5 | w8a8 | 64 | 320 | 256 | 482.92 | 3.08 | 2.77 |
+| Gemma-3-1B | A311D2 | w8a8 | 64 | 320 | 256 | 702.88 | 5.08 | 1.9 |
+| Gemma-3-1B | S905X5 | w8a8 | 64 | 320 | 256 | 468.97 | 6.44 | 1.38 |
+| Llama3.2_1B | A311D2 | w8a8 | 64 | 320 | 256 | 711.64 | 5.92 | 1.69 |
+| Llama3.2_1B | S905X5 | w8a8 | 64 | 320 | 256 | 695.92 | 5.42 | 1.5 |
+| Qwen1.5_1.8B | A311D2 | w8a8 | 64 | 320 | 256 | 794.50 | 4.52 | 2.2 |
+| Qwen1.5_1.8B | S905X5 | w8a8 | 64 | 320 | 256 | 983.93 | 4.47 | 1.9 |
+| Qwen2.5_0.5B | A311D2 | w8a8 | 64 | 320 | 256 | 400.44 | 10.50 | 0.88 |
+| Qwen2.5_0.5B | S905X5 | w8a8 | 64 | 320 | 256 | 400.37 | 10.97 | 0.66 |
+| Qwen2.5_1.5B | A311D2 | w8a8 | 64 | 320 | 256 | 882.49 | 3.94 | 2.37 |
+| Qwen2.5_1.5B | S905X5 | w8a8 | 64 | 320 | 256 | 874.06 | 4.16 | 1.76 |
+| TinyLlama-1.1B-Chat-v1.0 | A311D2 | w8a8 | 64 | 320 | 256 | 763.07 | 6.51 | 1.31 |
+| TinyLlama-1.1B-Chat-v1.0 | S905X5 | w8a8 | 64 | 320 | 256 | 1161.82 | 5.85 | 1.15 |
+| TinyLlama-1.1B-Chat-v0.4 | A311D2 | w8a8 | 64 | 320 | 256 | 740.02 | 6.38 | 1.31 |
+| TinyLlama-1.1B-Chat-v0.4 | S905X5 | w8a8 | 64 | 320 | 256 | 733.01 | 6.28 | 1.11 |
+
+
+## Compile
+
+### CPP
+To compile the CPP project using Android NDK, follow these steps:
+
+1. **Get the llmsdk library and header files**:
+ Clone the `amlnn-toolkit` repository to get the necessary libraries for compilation.
+ ```bash
+ # Clone to the parent directory of amlnn-model-playground
+ git clone https://github.com/Amlogic-NN/amlnn-toolkit.git
+ ```
+
+2. **Set the NDK path**:
+ ```bash
+ export NDK_PATH=/your/ndk/path/android-ndk-r25c
+ ```
+
+3. **Add NDK to your PATH**:
+ ```bash
+ export PATH=$NDK_PATH:$PATH
+ ```
+
+4. **Compile**:
+ Navigate to the `cpp` directory and run `build-android.sh`:
+ ```bash
+ cd examples/LLMs/cpp
+ ./build-android.sh
+ ```
+
+5. **Run**:
+ Push the compiled executable, model, and tokenizer to your Android device.
+
+ Optional configuration:
+ - **Push `llmsdk.so`**: If not already present on the device, push it to `/data/local/tmp`.
+ - **Set permissions**:
+ ```bash
+ chmod +x demo_llm_main
+ ```
+ - **Set environment variable**:
+ ```bash
+ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/vendor/lib64/:/data/local/tmp
+ ```
+
+ Then execute:
+ ```bash
+ ./demo_llm_main Qwen2.5-1.5B-Instruct-F16_quant_i8_t7c.adla tokenizer.json
+ ```
+
+### Python
+
+1. **Install Dependencies**:
+ Ensure the `amlllm` Python package is installed:
+ ```bash
+ pip install amlllm-1.0.0-cp310-cp310-linux_aarch64.whl
+ ```
+
+2. **Run**:
+ Navigate to the `py` directory and run `simple_chat.py`:
+ ```bash
+ cd examples/LLMs/py
+ python simple_chat.py --model <model_path> --tokenizer <tokenizer_path> [options]
+ ```
+
+3. **Parameters**:
+ - `--model`: (Required) Path to LLM model file
+ - `--tokenizer`: (Required) Path to tokenizer resources
+ - `--sampling-mode`: Sampling mode, options: `argmax`, `top_p`, `top_k`, default: `argmax`
+ - `--top-k`: Top-K parameter, default: 3
+ - `--top-p`: Top-P parameter, default: 0.9
+ - `--temperature`: Softmax temperature parameter, default: 1.0
+ - `--repeat-penalty`: Repeat penalty factor, default: 1.1
+ - `--loglevel`: Log level, options: `DEBUG`, `INFO`, `WARNING`, `ERROR`, default: `ERROR`
+ - `--model-type`: Model type template, options: `none`, `qwen`, `deepseek`, `gemma`, `gemma3`, `llama`, `tiny_llama`, `tiny_llama_v0_4`, `phi_1_5`, `phi_2`, default: `none`
+
+4. **Usage Examples**:
+ ```bash
+ # Using Qwen model
+ python simple_chat.py --model Qwen2.5-1.5B-Instruct-F16_quant_i8_t7c.adla --tokenizer tokenizer.json --model-type qwen
+
+ # Using Top-P sampling mode
+ python simple_chat.py --model model.adla --tokenizer tokenizer.json --sampling-mode top_p --top-p 0.9 --temperature 0.8
+
+ # Using Top-K sampling mode
+ python simple_chat.py --model model.adla --tokenizer tokenizer.json --sampling-mode top_k --top-k 5
+ ```
+
+5. **Interactive Commands**:
+ After the program starts, you enter an interactive interface that supports the following commands:
+ - Direct input: Enter text and press Enter; the model generates a response (streaming output)
+ - `exit`: Exit the program
+ - `new_talk`: Clear conversation history and start a new conversation
+ - `break`: Interrupt the currently generating response
+ - `Ctrl+C`: Send interrupt signal
+
+## Result
+
+| Banner | Inference Result |
+| :---: | :---: |
+|  |  |
\ No newline at end of file
diff --git a/examples/LLMs/py/simple_chat.py b/examples/LLMs/py/simple_chat.py
new file mode 100755
index 0000000..86c82b7
--- /dev/null
+++ b/examples/LLMs/py/simple_chat.py
@@ -0,0 +1,159 @@
+# -*- coding: utf-8 -*-
+
+import argparse
+import sys
+from datetime import datetime
+
+from amlllm.api import AMLLLM
+from amlllm.backend import RunStatus
+
+
+def stream_callback(token, userdata=None):
+    """Echo one streamed token event; print the request banner before the first."""
+    piece, state = token.get("text", ""), token.get("status")
+    banner_due = bool(userdata) and not userdata.get("printed")
+    if banner_due:
+        print(f"[Request #{userdata.get('request_id', 0)}]")
+        userdata["printed"] = True  # suppress the banner for later tokens
+    if state == RunStatus.FINISH:
+        print()  # close the streamed line
+    elif state == RunStatus.ERROR:
+        print("\n[Generation error]")
+    elif piece:
+        print(piece, end="", flush=True)
+
+
+def apply_model_template(amlllm: AMLLLM, model_type: str):
+    """Set the built-in chat template for ``model_type`` on ``amlllm``.
+
+    ``set_chat_template`` is called only when at least one of the three parts
+    is non-empty, so ``tiny_llama_v0_4`` and unrecognized types change nothing.
+    """
+    system_prompt = ""
+    prompt_prefix = ""
+    prompt_postfix = ""
+
+    if model_type == "qwen":
+        system_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+        prompt_prefix = "<|im_start|>user\n"
+        prompt_postfix = "<|im_end|>\n<|im_start|>assistant\n"
+    elif model_type == "deepseek":
+        system_prompt = "<|begin_of_sentence|>"
+        prompt_prefix = "<|User|>"
+        prompt_postfix = "<|Assistant|>please don't include tags in your answers\n"
+    elif model_type in ("gemma", "gemma3"):
+        # Gemma turns need <start_of_turn>/<end_of_turn>; bare "user\n" is plain text.
+        prompt_prefix = "<start_of_turn>user\n"
+        prompt_postfix = "<end_of_turn>\n<start_of_turn>model\n"
+    elif model_type == "llama":
+        date_str = datetime.now().strftime("%d %b %Y")
+        system_prompt = (
+            "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
+            "Cutting Knowledge Date: December 2023\n"
+            f"Today Date: {date_str}\n\n"
+            "<|eot_id|>"
+        )
+        prompt_prefix = "<|start_header_id|>user<|end_header_id|>\n\n"
+        prompt_postfix = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
+    elif model_type == "tiny_llama":
+        system_prompt = "<|im_start|>system\nYou are a friendly chatbot.<|im_end|>\n"
+        prompt_prefix = "<|im_start|>user\n"
+        prompt_postfix = "<|im_end|>\n<|im_start|>assistant\n"
+    elif model_type == "phi_1_5":
+        prompt_postfix = "\nAnswer:"
+    elif model_type == "phi_2":
+        prompt_prefix = "Instruct: "
+        prompt_postfix = "\nOutput:"
+
+    if system_prompt or prompt_prefix or prompt_postfix:
+        amlllm.set_chat_template(system_prompt, prompt_prefix, prompt_postfix)
+
+
+def parse_args():
+    """Build the demo's option parser and parse ``sys.argv`` into a namespace."""
+    cli = argparse.ArgumentParser(description="Amlogic LLM interactive demo (Python)")
+    templates = ["none", "qwen", "deepseek", "gemma", "gemma3", "llama", "tiny_llama", "tiny_llama_v0_4", "phi_1_5", "phi_2"]
+    cli.add_argument("--model", help="Path to LLM model file", required=True)
+    cli.add_argument("--tokenizer", help="Path to tokenizer resources", required=True)
+    cli.add_argument("--sampling-mode", choices=["argmax", "top_p", "top_k"], default="argmax", help="Sampling mode")
+    cli.add_argument("--top-k", dest="top_k", type=int, default=3, help="Top-K parameter")
+    cli.add_argument("--top-p", dest="top_p", type=float, default=0.9, help="Top-P parameter")
+    cli.add_argument("--temperature", default=1.0, type=float, help="Softmax temperature")
+    cli.add_argument("--repeat-penalty", dest="repeat_penalty", type=float, default=1.1, help="Repeat penalty factor")
+    cli.add_argument("--loglevel", choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="ERROR")
+    cli.add_argument("--model-type", dest="model_type", choices=templates, default="none", help="Optional builtin model template")
+    return cli.parse_args()
+
+def main():
+    """Run the interactive chat demo until ``exit`` or end of input.
+
+    All work after ``init()`` sits inside ``try``/``finally`` so ``uninit()``
+    runs even when template setup or the welcome banner raises.
+    """
+    args = parse_args()
+    amlllm = AMLLLM()
+    amlllm.config(
+        model_path=args.model,
+        tokenizer_path=args.tokenizer,
+        sampling_mode=args.sampling_mode,
+        top_k=args.top_k,
+        top_p=args.top_p,
+        temperature=args.temperature,
+        repeat_penalty=args.repeat_penalty,
+        loglevel=args.loglevel,
+        on_token=stream_callback,
+    )
+    amlllm.init()
+    try:
+        if args.model_type != "none":
+            apply_model_template(amlllm, args.model_type)
+        print("Welcome to Amlogic LLM interactive demo (Python).")
+        print("Commands: exit | new_talk | break")
+
+        # Passed to run() as user_data; presumably handed to stream_callback.
+        user_state = {"request_id": 0, "printed": False}
+        while True:
+            try:
+                user_input = input("\nLLM@Amlogic>>> ").strip()
+            except EOFError:
+                print("\nExit")
+                break
+
+            if not user_input:
+                print("Please enter a non-empty prompt.")
+                continue
+            if user_input == "exit":
+                break
+            if user_input == "new_talk":
+                amlllm.reset_session()
+                print("Conversation state cleared.")
+                continue
+            if user_input == "break":
+                amlllm.break_generation()
+                print("Stop signal sent.")
+                continue
+
+            try:
+                user_state["request_id"] += 1
+                user_state["printed"] = False  # re-arm the per-request banner
+                result = amlllm.run(
+                    prompt=user_input,
+                    input_type="prompt",
+                    run_mode="generate",
+                    retain_history=False,  # NOTE(review): confirm intended alongside new_talk
+                    user_data=user_state,
+                )
+                if not result["text"].endswith("\n"):
+                    print()
+                print(f"Tokens generated: {result['token_count']}")
+            except KeyboardInterrupt:
+                print("\nKeyboardInterrupt received. Sending break...")
+                amlllm.break_generation()
+            except Exception as exc:  # keep the session usable after a failed turn
+                print(f"\nGeneration failed: {exc}")
+    finally:
+        amlllm.uninit()
+
+
+if __name__ == "__main__":
+ sys.exit(main())  # main() returns None on normal completion, i.e. exit status 0