From a7b611bc933060cf2c051051cf5001c8f30cac36 Mon Sep 17 00:00:00 2001
From: "Yu, Zijun"
Date: Thu, 14 Aug 2025 16:52:29 +0800
Subject: [PATCH] Minor updates for raising PR

---
 CMakePresets.json                       | 20 --------------------
 docs/build.md                           | 21 +++------------------
 ggml/src/ggml-openvino/ggml-decoder.cpp |  3 +--
 3 files changed, 4 insertions(+), 40 deletions(-)

diff --git a/CMakePresets.json b/CMakePresets.json
index 392c357f37..b5afeb3c0f 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -1,26 +1,6 @@
 {
   "version": 4,
   "configurePresets": [
-    {
-      "name": "ReleaseOV",
-      "generator": "Ninja",
-      "binaryDir": "${sourceDir}/build/${presetName}",
-      "installDir": "${sourceDir}/build/install/${presetName}",
-      "cacheVariables": {
-        "CMAKE_BUILD_TYPE": "Release",
-        "GGML_OPENVINO": true,
-        "OpenVINO_DIR": "$env{OPENVINO_LLAMA_PATH}/build/Release"
-      }
-    },
-    {
-      "name": "ReleaseCPU",
-      "generator": "Ninja",
-      "binaryDir": "${sourceDir}/build/${presetName}",
-      "installDir": "${sourceDir}/build/install/${presetName}",
-      "cacheVariables": {
-        "CMAKE_BUILD_TYPE": "Release"
-      }
-    },
     {
       "name": "base",
       "hidden": true,
diff --git a/docs/build.md b/docs/build.md
index 1424a06508..9e44f18eae 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -698,7 +698,7 @@ To read documentation for how to build on IBM Z & LinuxONE, [click here](./build
 
 ## OpenVINO
 
-[OpenVINO](https://docs.openvino.ai/2025/index.html) is an open-source toolkit for optimizing and deploying high-performance AI inference, specifically designed for Intel hardware, including CPUs, GPUs, and NPUs, in the cloud, on-premises, and on the edge.
+[OpenVINO](https://docs.openvino.ai/2025/index.html) is an open-source toolkit for optimizing and deploying high-performance AI inference, specifically designed for Intel hardware, including CPUs, GPUs, and NPUs, in the cloud, on-premises, and on the edge. The OpenVINO backend enhances performance by leveraging hardware-specific optimizations and can be enabled for use with llama.cpp.
 
 Follow the instructions below to install OpenVINO runtime and build llama.cpp with OpenVINO support.
 
@@ -800,9 +800,8 @@ export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
 
 Control OpenVINO behavior using these environment variables:
 
-- **`GGML_OPENVINO_DEVICE`**: Specify the target device for OpenVINO inference. If not set, automatically selects the first available device in priority order: GPU, CPU, NPU. When set to `NPU` to use Intel NPUs, it enables static compilation mode for optimal performance. 
-- **`GGML_OPENVINO_CACHE_DIR`**: Directory for model caching (recommended: `/tmp/ov_cache`). If set, enables model caching in OpenVINO. Note: Not supported when using NPU devices yet. 
-- **`GGML_OPENVINO_WEIGHT_AS_INPUT`**: Pass the weights as input to the OpenVINO model instead of creating Constant nodes for them.
+- **`GGML_OPENVINO_DEVICE`**: Specify the target device for OpenVINO inference. If not set, automatically selects the first available device in priority order: GPU, CPU, NPU. When set to `NPU` to use Intel NPUs, it enables static compilation mode for optimal performance.
+- **`GGML_OPENVINO_CACHE_DIR`**: Directory for model caching (recommended: `/tmp/ov_cache`). If set, enables model caching in OpenVINO. Note: Not supported when using NPU devices yet.
 - **`GGML_OPENVINO_PROFILING`**: Enable execution time profiling.
 - **`GGML_OPENVINO_DUMP_CGRAPH`**: Save compute graph to `cgraph.txt`.
 - **`GGML_OPENVINO_DUMP_IR`**: Export OpenVINO IR files with timestamps.
@@ -817,20 +816,6 @@ export GGML_OPENVINO_PROFILING=1
 ./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is "
 ```
 
-> **Note:** To apply your code changes, clear the `GGML_OPENVINO_CACHE_DIR` directory and rebuild the project.
-
-### Using Llama.cpp's Built-in CPU Backend (for Comparison)
-
-To compare performance with the default CPU backend:
-
-```bash
-# Build CPU-only version
-cmake --preset ReleaseCPU
-cmake --build build/ReleaseCPU --parallel
-
-# Run with the default CPU backend
-./build/ReleaseCPU/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct.fp16.gguf -n 50 "The story of AI is "
-```
 
 ## Notes about GPU-accelerated backends
 
diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 6bc2c253e8..09919c8505 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -57,8 +57,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_cgraph* cgraph,
     }
 
     if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
-        auto timestamp = (long long) ggml_time_us();
-        std::string filename = "cgraph_" + std::to_string(timestamp) + ".txt";
+        std::string filename = "cgraph.txt";
         dump_cgraph(cgraph, filename);
     }