From d9ad2db4d7aa350c6541dbde92a311b8b2c2f168 Mon Sep 17 00:00:00 2001 From: "dian.yuan" Date: Thu, 15 Jan 2026 15:13:39 +0800 Subject: [PATCH] upload llm python demo --- examples/LLMs/README.md | 136 +++++++++++++++++++++++++++ examples/LLMs/py/simple_chat.py | 159 ++++++++++++++++++++++++++++++++ 2 files changed, 295 insertions(+) create mode 100644 examples/LLMs/README.md create mode 100755 examples/LLMs/py/simple_chat.py diff --git a/examples/LLMs/README.md b/examples/LLMs/README.md new file mode 100644 index 0000000..71a0317 --- /dev/null +++ b/examples/LLMs/README.md @@ -0,0 +1,136 @@ +# LLM Examples + +## Resource Requirements + +| Model | CPU | NPU | GPU | +| :--- | :--- | :--- | :--- | +| Qwen(0.5B) | Minimum cores: 4
DDR: 4G (2G reserved for NN) | At least 3.2T | NO | +| Qwen(1.8B) | Minimum cores: 4
DDR: 8G (6G~6.5G reserved for NN) | At least 3.2T | NO | +| Gemma(2B) | Minimum cores: 4
DDR: 8G (5.5G~6G reserved for NN) | At least 3.2T | NO | + + + ## Performance + +ADLA2: A311D2_3.2T / S905X5_4T + +| LLM Model | SOC | Dtype | Seqlen | Max_Context | New_Tokens | TTFT(ms) | Tokens/s | memory(G) | +| :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | +| DeepSeek-R1 | A311D2 | w8a8 | 64 | 320 | 256 | 927.79 | 4.95 | 1.99 | +| DeepSeek-R1 | S905X5 | w8a8 | 64 | 320 | 256 | 514.86 | 4.47 | 1.73 | +| Gemma-2B | A311D2 | w8a8 | 64 | 320 | 256 | 846.66 | 2.64 | 3.93 | +| Gemma-2B | S905X5 | w8a8 | 64 | 320 | 256 | 482.92 | 3.08 | 2.77 | +| Gemma-3-1B | A311D2 | w8a8 | 64 | 320 | 256 | 702.88 | 5.08 | 1.9 | +| Gemma-3-1B | S905X5 | w8a8 | 64 | 320 | 256 | 468.97 | 6.44 | 1.38 | +| Llama3.2_1B | A311D2 | w8a8 | 64 | 320 | 256 | 711.64 | 5.92 | 1.69 | +| Llama3.2_1B | S905X5 | w8a8 | 64 | 320 | 256 | 695.92 | 5.42 | 1.5 | +| Qwen1.5_1.8B | A311D2 | w8a8 | 64 | 320 | 256 | 794.50 | 4.52 | 2.2 | +| Qwen1.5_1.8B | S905X5 | w8a8 | 64 | 320 | 256 | 983.93 | 4.47 | 1.9 | +| Qwen2.5_0.5B | A311D2 | w8a8 | 64 | 320 | 256 | 400.44 | 10.50 | 0.88 | +| Qwen2.5_0.5B | S905X5 | w8a8 | 64 | 320 | 256 | 400.37 | 10.97 | 0.66 | +| Qwen2.5_1.5B | A311D2 | w8a8 | 64 | 320 | 256 | 882.49 | 3.94 | 2.37 | +| Qwen2.5_1.5B | S905X5 | w8a8 | 64 | 320 | 256 | 874.06 | 4.16 | 1.76 | +| TinyLlama-1.1B-Chat-v1.0 | A311D2 | w8a8 | 64 | 320 | 256 | 763.07 | 6.51 | 1.31 | +| TinyLlama-1.1B-Chat-v1.0 | S905X5 | w8a8 | 64 | 320 | 256 | 1161.82 | 5.85 | 1.15 | +| TinyLlama-1.1B-Chat-v0.4 | A311D2 | w8a8 | 64 | 320 | 256 | 740.02 | 6.38 | 1.31 | +| TinyLlama-1.1B-Chat-v0.4 | S905X5 | w8a8 | 64 | 320 | 256 | 733.01 | 6.28 | 1.11 | + + +## Compile + +### CPP +To compile the CPP project using Android NDK, follow these steps: + +1. **Get the llmsdk library and header files**: + Clone the `amlnn-toolkit` repository to get the necessary libraries for compilation. 
+ ```bash + # Clone to the parent directory of amlnn-model-playground + git clone https://github.com/Amlogic-NN/amlnn-toolkit.git + ``` + +2. **Set the NDK path**: + ```bash + export NDK_PATH=/your/ndk/path/android-ndk-r25c + ``` + +3. **Add NDK to your PATH**: + ```bash + export PATH=$NDK_PATH:$PATH + ``` + +4. **Compile**: + Navigate to the `cpp` directory and run `build-android.sh`: + ```bash + cd examples/LLMs/cpp + ./build-android.sh + ``` + +5. **Run**: + Push the compiled executable, model, and tokenizer to your Android device. + + Optional configuration: + - **Push `llmsdk.so`**: If not already present on the device, push it to `/data/local/tmp`. + - **Set permissions**: + ```bash + chmod +x demo_llm_main + ``` + - **Set environment variable**: + ```bash + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/vendor/lib64/:/data/local/tmp + ``` + + Then execute: + ```bash + ./demo_llm_main Qwen2.5-1.5B-Instruct-F16_quant_i8_t7c.adla tokenizer.json + ``` + +### Python + +1. **Install Dependencies**: + Ensure the `amlllm` Python package is installed: + ```bash + pip install amlllm-1.0.0-cp310-cp310-linux_aarch64.whl + ``` + +2. **Run**: + Navigate to the `py` directory and run `simple_chat.py`: + ```bash + cd examples/LLMs/py + python simple_chat.py --model <model_path> --tokenizer <tokenizer_path> [options] + ``` + +3. 
**Parameters**: + - `--model`: (Required) Path to LLM model file + - `--tokenizer`: (Required) Path to tokenizer resources + - `--sampling-mode`: Sampling mode, options: `argmax`, `top_p`, `top_k`, default: `argmax` + - `--top-k`: Top-K parameter, default: 3 + - `--top-p`: Top-P parameter, default: 0.9 + - `--temperature`: Softmax temperature parameter, default: 1.0 + - `--repeat-penalty`: Repeat penalty factor, default: 1.1 + - `--loglevel`: Log level, options: `DEBUG`, `INFO`, `WARNING`, `ERROR`, default: `ERROR` + - `--model-type`: Model type template, options: `none`, `qwen`, `deepseek`, `gemma`, `gemma3`, `llama`, `tiny_llama`, `tiny_llama_v0_4`, `phi_1_5`, `phi_2`, default: `none` + +4. **Usage Examples**: + ```bash + # Using Qwen model + python simple_chat.py --model Qwen2.5-1.5B-Instruct-F16_quant_i8_t7c.adla --tokenizer tokenizer.json --model-type qwen + + # Using Top-P sampling mode + python simple_chat.py --model model.adla --tokenizer tokenizer.json --sampling-mode top_p --top-p 0.9 --temperature 0.8 + + # Using Top-K sampling mode + python simple_chat.py --model model.adla --tokenizer tokenizer.json --sampling-mode top_k --top-k 5 + ``` + +5. 
# -*- coding: utf-8 -*-
"""Interactive, streaming chat demo for the Amlogic LLM Python SDK (``amlllm``).

Usage::

    python simple_chat.py --model MODEL --tokenizer TOKENIZER [options]

At the ``LLM@Amlogic>>>`` prompt the following inputs are recognised:

* ``exit``      -- leave the program.
* ``new_talk``  -- call ``reset_session()`` on the SDK handle.
* ``break``     -- call ``break_generation()`` on the SDK handle.
* any other non-empty text -- sent to the model as a prompt; tokens are
  printed by :func:`stream_callback` as the SDK delivers them.
* ``Ctrl+C``    -- while a request is running: call ``break_generation()``;
  at the prompt: leave the program.
* end of input (EOF) -- leave the program.
"""

import argparse
import sys
from datetime import datetime

from amlllm.api import AMLLLM
from amlllm.backend import RunStatus


SAMPLING_MODES = ("argmax", "top_p", "top_k")
LOG_LEVELS = ("DEBUG", "INFO", "WARNING", "ERROR")
MODEL_TYPES = (
    "none",
    "qwen",
    "deepseek",
    "gemma",
    "gemma3",
    "llama",
    "tiny_llama",
    "tiny_llama_v0_4",
    "phi_1_5",
    "phi_2",
)

# NOTE(review): Gemma's published prompt format delimits turns with
# <start_of_turn>/<end_of_turn>; the strings below carry no such markers.
# They are kept exactly as found -- confirm against the C demo.
_GEMMA_TEMPLATE = ("", "user\n", "\nmodel\n")

# model type -> (system_prompt, prompt_prefix, prompt_postfix).
# "llama" is absent on purpose: its system prompt embeds today's date and is
# therefore built on demand by _llama_template().
_STATIC_TEMPLATES = {
    "qwen": (
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n",
        "<|im_start|>user\n",
        "<|im_end|>\n<|im_start|>assistant\n",
    ),
    # NOTE(review): "include tags" reads as if a tag name was lost from the
    # postfix; kept exactly as found -- confirm against the C demo.
    "deepseek": (
        "<|begin_of_sentence|>",
        "<|User|>",
        "<|Assistant|>please don't include tags in your answers\n",
    ),
    "gemma": _GEMMA_TEMPLATE,
    "gemma3": _GEMMA_TEMPLATE,
    "tiny_llama": (
        "<|im_start|>system\nYou are a friendly chatbot.<|im_end|>\n",
        "<|im_start|>user\n",
        "<|im_end|>\n<|im_start|>assistant\n",
    ),
    "tiny_llama_v0_4": ("", "", ""),
    "phi_1_5": ("", "", "\nAnswer:"),
    "phi_2": ("", "Instruct: ", "\nOutput:"),
}


def stream_callback(token, userdata=None):
    """Print tokens as they arrive (mimic C demo callback behavior).

    Args:
        token: Mapping read with ``.get``; the keys used are ``"text"`` and
            ``"status"`` (compared against ``RunStatus.FINISH`` and
            ``RunStatus.ERROR``).
        userdata: Optional mutable mapping. When given and its ``"printed"``
            entry is falsy, a ``[Request #N]`` header is printed once (``N``
            is ``userdata["request_id"]``, default 0) and ``"printed"`` is
            set to ``True``.
    """
    text = token.get("text", "")
    status = token.get("status")

    if userdata and not userdata.get("printed"):
        print(f"[Request #{userdata.get('request_id', 0)}]")
        userdata["printed"] = True

    if status == RunStatus.FINISH:
        print()
    elif status == RunStatus.ERROR:
        print("\n[Generation error]")
    elif text:
        # flush so partial output becomes visible immediately
        print(text, end="", flush=True)


def _llama_template():
    """Build the ``llama`` template; its system prompt embeds today's date."""
    date_str = datetime.now().strftime("%d %b %Y")
    system_prompt = (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        "Cutting Knowledge Date: December 2023\n"
        f"Today Date: {date_str}\n\n"
        "<|eot_id|>"
    )
    prompt_prefix = "<|start_header_id|>user<|end_header_id|>\n\n"
    prompt_postfix = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
    return system_prompt, prompt_prefix, prompt_postfix


def _chat_template_for(model_type):
    """Return ``(system_prompt, prompt_prefix, prompt_postfix)`` for a type.

    Unknown types yield three empty strings.
    """
    if model_type == "llama":
        return _llama_template()
    return _STATIC_TEMPLATES.get(model_type, ("", "", ""))


def apply_model_template(amlllm: AMLLLM, model_type: str):
    """Set chat templates using the same defaults as the C demo.

    Args:
        amlllm: SDK handle; ``set_chat_template(system_prompt, prompt_prefix,
            prompt_postfix)`` is called on it.
        model_type: One of ``MODEL_TYPES``. Types whose three template parts
            are all empty leave the handle untouched.
    """
    template = _chat_template_for(model_type)
    if any(template):
        amlllm.set_chat_template(*template)


def parse_args(argv=None):
    """Parse the command line.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Returns:
        The populated ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser(description="Amlogic LLM interactive demo (Python)")
    parser.add_argument("--model", required=True, help="Path to LLM model file")
    parser.add_argument("--tokenizer", required=True, help="Path to tokenizer resources")
    parser.add_argument("--sampling-mode", default="argmax", choices=SAMPLING_MODES, help="Sampling mode")
    parser.add_argument("--top-k", type=int, default=3, dest="top_k", help="Top-K parameter")
    parser.add_argument("--top-p", type=float, default=0.9, dest="top_p", help="Top-P parameter")
    parser.add_argument("--temperature", type=float, default=1.0, help="Softmax temperature")
    parser.add_argument("--repeat-penalty", type=float, default=1.1, dest="repeat_penalty", help="Repeat penalty factor")
    parser.add_argument("--loglevel", default="ERROR", choices=LOG_LEVELS, help="Log level")
    parser.add_argument("--model-type", default="none", dest="model_type",
                        choices=MODEL_TYPES,
                        help="Optional builtin model template")
    return parser.parse_args(argv)


def _read_command():
    """Read one stripped line from the prompt; return ``None`` to quit.

    Both end-of-input and Ctrl+C at the prompt end the session cleanly
    instead of escaping as a traceback.
    """
    try:
        return input("\nLLM@Amlogic>>> ").strip()
    except (EOFError, KeyboardInterrupt):
        print("\nExit")
        return None


def _run_request(amlllm, prompt, user_state):
    """Send one prompt to the SDK and report the outcome on stdout."""
    try:
        user_state["request_id"] += 1
        # re-arm the "[Request #N]" header printed by stream_callback
        user_state["printed"] = False
        result = amlllm.run(
            prompt=prompt,
            input_type="prompt",
            run_mode="generate",
            retain_history=False,
            user_data=user_state,
        )
        if not result["text"].endswith("\n"):
            print()
        print(f"Tokens generated: {result['token_count']}")
    except KeyboardInterrupt:
        print("\nKeyboardInterrupt received. Sending break...")
        amlllm.break_generation()
    except Exception as exc:
        # Top-level boundary of the demo: report and keep the session alive.
        print(f"\nGeneration failed: {exc}")


def _chat_loop(amlllm):
    """Run the read/dispatch loop until the user quits."""
    user_state = {"request_id": 0, "printed": False}

    while True:
        user_input = _read_command()
        if user_input is None or user_input == "exit":
            return

        if not user_input:
            print("Please enter a non-empty prompt.")
        elif user_input == "new_talk":
            amlllm.reset_session()
            print("Conversation state cleared.")
        elif user_input == "break":
            amlllm.break_generation()
            print("Stop signal sent.")
        else:
            _run_request(amlllm, user_input, user_state)


def main(argv=None):
    """Configure the SDK from the command line and run the chat session.

    Args:
        argv: Optional argument list forwarded to :func:`parse_args`.

    Everything after a successful ``init()`` runs inside ``try/finally`` so
    that ``uninit()`` is called even if applying the template or the chat
    loop raises.
    """
    args = parse_args(argv)
    amlllm = AMLLLM()
    amlllm.config(
        model_path=args.model,
        tokenizer_path=args.tokenizer,
        sampling_mode=args.sampling_mode,
        top_k=args.top_k,
        top_p=args.top_p,
        temperature=args.temperature,
        repeat_penalty=args.repeat_penalty,
        loglevel=args.loglevel,
        on_token=stream_callback,
    )
    amlllm.init()

    try:
        if args.model_type != "none":
            apply_model_template(amlllm, args.model_type)

        print("Welcome to Amlogic LLM interactive demo (Python).")
        print("Commands: exit | new_talk | break")

        _chat_loop(amlllm)
    finally:
        amlllm.uninit()


if __name__ == "__main__":
    sys.exit(main())