Merge branch 'master' into dev-refactoring

# Conflicts:
#	ggml/CMakeLists.txt
#	ggml/src/CMakeLists.txt
hongruichen 2024-12-14 15:38:16 +08:00
commit 6d3267aad1
188 changed files with 22360 additions and 32028 deletions

View File

@ -3,23 +3,36 @@ ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION AS build
RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
apt-get install -y build-essential git cmake libcurl4-openssl-dev
WORKDIR /app
COPY . .
ENV LLAMA_CURL=1
RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
cmake --build build -j $(nproc) && \
mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib/ \;
FROM ubuntu:$UBUNTU_VERSION as runtime
RUN make -j$(nproc)
WORKDIR /app
RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
COPY requirements.txt /app/requirements.txt
COPY requirements /app/requirements
COPY .devops/tools.sh /app/tools.sh
RUN pip install --upgrade pip setuptools wheel && \
pip install -r /app/requirements.txt
COPY --from=build /app/build/bin/ /app/
COPY --from=build /app/lib/ /app/
COPY --from=build /app/convert_hf_to_gguf.py /app/
COPY --from=build /app/gguf-py /app/gguf-py
ENV LC_ALL=C.utf8
ENTRYPOINT ["/app/.devops/tools.sh"]
ENTRYPOINT ["/app/tools.sh"]

View File

@ -3,21 +3,27 @@ ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION AS build
RUN apt-get update && \
apt-get install -y build-essential git
apt-get install -y build-essential git cmake libcurl4-openssl-dev
WORKDIR /app
COPY . .
RUN make -j$(nproc) llama-cli
RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
cmake --build build -j $(nproc) && \
mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib/ \;
FROM ubuntu:$UBUNTU_VERSION AS runtime
RUN apt-get update && \
apt-get install -y libgomp1
WORKDIR /app
COPY --from=build /app/llama-cli /llama-cli
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1 curl
COPY --from=build /app/build/bin/llama-cli /app/
COPY --from=build /app/lib/ /app/
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/llama-cli" ]
ENTRYPOINT [ "/app/llama-cli" ]

View File

@ -9,28 +9,20 @@ WORKDIR /app
COPY . .
RUN \
# Build multiple versions of the CPU backend
scripts/build-cpu.sh avx -DGGML_AVX=ON -DGGML_AVX2=OFF && \
scripts/build-cpu.sh avx2 -DGGML_AVX=ON -DGGML_AVX2=ON && \
scripts/build-cpu.sh avx512 -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON && \
scripts/build-cpu.sh amx -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON -DGGML_AVX_VNNI=ON -DGGML_AVX512_VNNI=ON -DGGML_AMX_TILE=ON -DGGML_AMX_INT8=ON && \
# Build llama-server
cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
cmake --build build --target llama-server -j $(nproc) && \
# Copy the built libraries to /app/lib
RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
cmake --build build -j $(nproc) && \
mkdir -p /app/lib && \
mv libggml-cpu* /app/lib/ && \
find build -name "*.so" -exec cp {} /app/lib/ \;
FROM ubuntu:$UBUNTU_VERSION AS runtime
WORKDIR /app
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1 curl
COPY --from=build /app/build/bin/llama-server /llama-server
COPY --from=build /app/lib/ /
COPY --from=build /app/build/bin/llama-server /app/
COPY --from=build /app/lib/ /app/
ENV LC_ALL=C.utf8
# Must be set to 0.0.0.0 so it can listen to requests from host machine
@ -38,4 +30,4 @@ ENV LLAMA_ARG_HOST=0.0.0.0
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/llama-server" ]
ENTRYPOINT [ "/app/llama-server" ]

View File

@ -8,11 +8,11 @@ arg1="$1"
shift
if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
python3 ./convert_hf_to_gguf.py "$@"
exec python3 ./convert_hf_to_gguf.py "$@"
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
./llama-quantize "$@"
exec ./llama-quantize "$@"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
./llama-cli "$@"
exec ./llama-cli "$@"
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
echo "Converting PTH to GGML..."
for i in `ls $1/$2/ggml-model-f16.bin*`; do
@ -20,11 +20,11 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
else
echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
./llama-quantize "$i" "${i/f16/q4_0}" q4_0
exec ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
fi
done
elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
./llama-server "$@"
exec ./llama-server "$@"
else
echo "Unknown command: $arg1"
echo "Available commands: "

View File

@ -552,35 +552,44 @@ jobs:
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
# TODO: tmp disabled. see for possible re-enable:
# https://github.com/ggerganov/llama.cpp/pull/10525
# macOS-latest-swift:
# runs-on: macos-latest
#
# strategy:
# matrix:
# destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
#
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v4
#
# - name: Dependencies
# id: depends
# continue-on-error: true
# run: |
# brew update
#
# - name: xcodebuild for swift package
# id: xcodebuild
# run: |
# xcodebuild -scheme llama -destination "${{ matrix.destination }}"
#
# - name: Build Swift Example
# id: make_build_swift_example
# run: |
# make swift
macOS-latest-swift:
runs-on: macos-latest
strategy:
matrix:
destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
- name: Build llama.cpp with CMake
id: cmake_build
run: |
sysctl -a
mkdir build
cd build
cmake -G Xcode .. \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
sudo cmake --install . --config Release
- name: xcodebuild for swift package
id: xcodebuild
run: |
xcodebuild -scheme llama-Package -destination "${{ matrix.destination }}"
windows-msys2:
runs-on: windows-latest
@ -653,6 +662,8 @@ jobs:
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
- build: 'msvc-arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
- build: 'llvm-arm64-opencl-adreno'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
steps:
- name: Clone
@ -694,6 +705,28 @@ jobs:
run: |
choco install ninja
- name: Install OpenCL Headers and Libs
id: install_opencl
if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }}
run: |
git clone https://github.com/KhronosGroup/OpenCL-Headers
cd OpenCL-Headers
mkdir build && cd build
cmake .. `
-DBUILD_TESTING=OFF `
-DOPENCL_HEADERS_BUILD_TESTING=OFF `
-DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
cmake --build . --target install
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
cd OpenCL-ICD-Loader
mkdir build-arm64-release && cd build-arm64-release
cmake .. `
-A arm64 `
-DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
cmake --build . --target install --config release
- name: Build
id: cmake_build
run: |
@ -723,7 +756,7 @@ jobs:
- name: Test
id: cmake_test
# not all machines have native AVX-512
if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
run: |
cd build
ctest -L main -C Release --verbose --timeout 900
@ -1104,6 +1137,29 @@ jobs:
- name: Checkout code
uses: actions/checkout@v4
- name: Build
id: cmake_build
run: |
sysctl -a
mkdir build
cd build
cmake -G Xcode .. \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DCMAKE_SYSTEM_NAME=iOS \
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
sudo cmake --install . --config Release
- name: xcodebuild for swift package
id: xcodebuild
run: |
xcodebuild -scheme llama-Package -destination 'generic/platform=iOS'
- name: Build Xcode project
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
@ -1131,23 +1187,6 @@ jobs:
./gradlew build --no-daemon
# freeBSD-latest:
# runs-on: macos-12
# steps:
# - name: Clone
# uses: actions/checkout@v4
#
# - name: Build
# uses: cross-platform-actions/action@v0.19.0
# with:
# operating_system: freebsd
# version: '13.2'
# hypervisor: 'qemu'
# run: |
# sudo pkg update
# sudo pkg install -y gmake automake autoconf pkgconf llvm15 openblas
# gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j `sysctl -n hw.ncpu`
release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

View File

@ -76,20 +76,26 @@ jobs:
run: |
pip install -r examples/server/tests/requirements.txt
- name: Verify server deps
id: verify_server_deps
# Setup nodejs (to be used for verifying bundled index.html)
- uses: actions/setup-node@v4
with:
node-version: '22.11.0'
- name: Verify bundled index.html
id: verify_server_index_html
run: |
git config --global --add safe.directory $(realpath .)
cd examples/server
git ls-files --others --modified
cd examples/server/webui
git status
./deps.sh
npm ci
npm run build
git status
not_ignored_files="$(git ls-files --others --modified)"
echo "Modified files: ${not_ignored_files}"
if [ -n "${not_ignored_files}" ]; then
echo "Repository is dirty or server deps are not built as expected"
echo "${not_ignored_files}"
modified_files="$(git status -s)"
echo "Modified files: ${modified_files}"
if [ -n "${modified_files}" ]; then
echo "Repository is dirty or server/webui is not built as expected"
echo "Hint: You may need to follow Web UI build guide in server/README.md"
echo "${modified_files}"
exit 1
fi

.gitignore vendored
View File

@ -104,6 +104,10 @@ examples/server/*.mjs.hpp
!examples/sycl/*.bat
!examples/sycl/*.sh
# Server Web UI temporary files
node_modules
examples/server/webui/dist
# Python
/.venv

View File

@ -46,11 +46,9 @@ if (WIN32)
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
endif()
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
add_compile_options("$<$<COMPILE_LANGUAGE:C>:/source-charset:utf-8>")
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/source-charset:utf-8>")
add_compile_options("$<$<COMPILE_LANGUAGE:C>:/execution-charset:utf-8>")
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/execution-charset:utf-8>")
if (MSVC)
add_compile_options("$<$<COMPILE_LANGUAGE:C>:/utf-8>")
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/utf-8>")
endif()
#

View File

@ -31,6 +31,13 @@
{ "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },
{ "name": "vulkan", "hidden": true, "cacheVariables": { "GGML_VULKAN": "ON" } },
{
"name": "x64-windows-llvm", "hidden": true,
"cacheVariables": {
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/x64-windows-llvm.cmake"
}
},
{
"name": "arm64-windows-msvc", "hidden": true,
"architecture": { "value": "arm64", "strategy": "external" },
@ -70,6 +77,11 @@
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },
{ "name": "x64-windows-llvm-debug", "inherits": [ "base", "x64-windows-llvm", "debug" ] },
{ "name": "x64-windows-llvm-release", "inherits": [ "base", "x64-windows-llvm", "release" ] },
{ "name": "x64-windows-llvm-reldbg", "inherits": [ "base", "x64-windows-llvm", "reldbg" ] },
{ "name": "x64-windows-llvm+static-release", "inherits": [ "base", "x64-windows-llvm", "reldbg", "static" ] },
{ "name": "x64-windows-msvc-debug", "inherits": [ "base", "debug" ] },
{ "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },

View File

@ -1,3 +1,5 @@
# collaborators can optionally add themselves here to indicate their availability for reviewing related PRs
ci/ @ggerganov
/ci/ @ggerganov
/.devops/ @ngxson
/examples/server/ @ngxson

View File

@ -445,6 +445,10 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
MK_CFLAGS += -march=native -mtune=native
HOST_CXXFLAGS += -march=native -mtune=native
# Usage AMX build test
#MK_CFLAGS += -march=graniterapids -mtune=graniterapids
#HOST_CXXFLAGS += -march=graniterapids -mtune=graniterapids
# Usage AVX-only
#MK_CFLAGS += -mfma -mf16c -mavx
#MK_CXXFLAGS += -mfma -mf16c -mavx
@ -948,7 +952,6 @@ DIR_COMMON = common
OBJ_GGML = \
$(DIR_GGML)/src/ggml.o \
$(DIR_GGML)/src/ggml-aarch64.o \
$(DIR_GGML)/src/ggml-alloc.o \
$(DIR_GGML)/src/ggml-backend.o \
$(DIR_GGML)/src/ggml-backend-reg.o \
@ -956,9 +959,11 @@ OBJ_GGML = \
$(DIR_GGML)/src/ggml-quants.o \
$(DIR_GGML)/src/ggml-threading.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-hbm.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-traits.o \
$(OBJ_GGML_EXT)
OBJ_LLAMA = \
@ -1098,17 +1103,10 @@ DEP_FILES = $(OBJ_GGML:.o=.d) $(OBJ_LLAMA:.o=.d) $(OBJ_COMMON:.o=.d)
# Default target
all: $(BUILD_TARGETS)
# force c++ build for source file that have same name as c file
# Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files
# g++ -M -I ./ggml/include/ -I ./ggml/src ggml/src/ggml-cpu/ggml-cpu.cpp | grep ggml
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o: \
ggml/src/ggml-cpu/ggml-cpu.cpp \
ggml/include/ggml-backend.h \
ggml/include/ggml.h \
ggml/include/ggml-alloc.h \
ggml/src/ggml-backend-impl.h \
ggml/include/ggml-cpu.h \
ggml/src/ggml-impl.h
$(CXX) $(CXXFLAGS) -c $< -o $@
$(DIR_GGML)/%_cpp.o: $(DIR_GGML)/%.cpp
$(CXX) $(CXXFLAGS) -MMD -c $< -o $@
# Rules for building object files
$(DIR_GGML)/%.o: $(DIR_GGML)/%.c
@ -1145,8 +1143,15 @@ $(LIB_COMMON_S): $(OBJ_COMMON)
# Include dependency files
-include $(DEP_FILES)
# Clean generated server assets
clean-server-assets:
find examples/server -type f -name "*.js.hpp" -delete
find examples/server -type f -name "*.mjs.hpp" -delete
find examples/server -type f -name "*.css.hpp" -delete
find examples/server -type f -name "*.html.hpp" -delete
# Clean rule
clean:
clean: clean-server-assets
rm -vrf $(BUILD_TARGETS) $(TEST_TARGETS)
rm -rvf *.a *.dll *.so *.dot
find ggml src common tests examples pocs -type f -name "*.o" -delete
@ -1354,20 +1359,14 @@ llama-server: \
examples/server/utils.hpp \
examples/server/httplib.h \
examples/server/index.html.hpp \
examples/server/completion.js.hpp \
examples/server/loading.html.hpp \
examples/server/deps_daisyui.min.css.hpp \
examples/server/deps_markdown-it.js.hpp \
examples/server/deps_tailwindcss.js.hpp \
examples/server/deps_vue.esm-browser.js.hpp \
common/json.hpp \
common/stb_image.h \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
examples/server/%.hpp: examples/server/public/% Makefile
examples/server/%.hpp: examples/server/public/% FORCE Makefile
@( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
echo "unsigned char $${NAME}[] = {" && \
cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
@ -1542,7 +1541,7 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
# Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed.
#
# Mark legacy binary targets as .PHONY so that they are always checked.
.PHONY: main quantize perplexity embedding server
.PHONY: FORCE main quantize perplexity embedding server
# Define the object file target
examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning/deprecation-warning.cpp

View File

@ -2,59 +2,6 @@
import PackageDescription
var sources = [
"src/llama.cpp",
"src/llama-vocab.cpp",
"src/llama-grammar.cpp",
"src/llama-sampling.cpp",
"src/unicode.cpp",
"src/unicode-data.cpp",
"ggml/src/ggml.c",
"ggml/src/ggml-aarch64.c",
"ggml/src/ggml-alloc.c",
"ggml/src/ggml-backend.cpp",
"ggml/src/ggml-backend-reg.cpp",
"ggml/src/ggml-cpu/ggml-cpu.c",
"ggml/src/ggml-cpu/ggml-cpu.cpp",
"ggml/src/ggml-cpu/ggml-cpu-aarch64.c",
"ggml/src/ggml-cpu/ggml-cpu-quants.c",
"ggml/src/ggml-threading.cpp",
"ggml/src/ggml-quants.c",
]
var resources: [Resource] = []
var linkerSettings: [LinkerSetting] = []
var cSettings: [CSetting] = [
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
.unsafeFlags(["-fno-objc-arc"]),
.headerSearchPath("ggml/src"),
.headerSearchPath("ggml/src/ggml-cpu"),
// NOTE: NEW_LAPACK will required iOS version 16.4+
// We should consider add this in the future when we drop support for iOS 14
// (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
// .define("ACCELERATE_NEW_LAPACK"),
// .define("ACCELERATE_LAPACK_ILP64")
.define("GGML_USE_CPU"),
]
#if canImport(Darwin)
sources.append("ggml/src/ggml-common.h")
sources.append("ggml/src/ggml-metal/ggml-metal.m")
resources.append(.process("ggml/src/ggml-metal/ggml-metal.metal"))
linkerSettings.append(.linkedFramework("Accelerate"))
cSettings.append(
contentsOf: [
.define("GGML_USE_ACCELERATE"),
.define("GGML_USE_METAL"),
]
)
#endif
#if os(Linux)
cSettings.append(.define("_GNU_SOURCE"))
#endif
let package = Package(
name: "llama",
platforms: [
@ -67,26 +14,6 @@ let package = Package(
.library(name: "llama", targets: ["llama"]),
],
targets: [
.target(
name: "llama",
path: ".",
exclude: [
"build",
"cmake",
"examples",
"scripts",
"models",
"tests",
"CMakeLists.txt",
"Makefile",
"ggml/src/ggml-metal-embed.metal"
],
sources: sources,
resources: resources,
publicHeadersPath: "spm-headers",
cSettings: cSettings,
linkerSettings: linkerSettings
)
],
cxxLanguageStandard: .cxx17
.systemLibrary(name: "llama", pkgConfig: "llama"),
]
)

View File

@ -433,6 +433,20 @@ To learn more about model quantization, [read this documentation](examples/quant
</details>
## [`llama-run`](examples/run)
#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].
- <details>
<summary>Run a model with a specific prompt (by default it's pulled from Ollama registry)</summary>
```bash
llama-run granite-code
```
</details>
[^3]: [RamaLama](https://github.com/containers/ramalama)
## [`llama-simple`](examples/simple)

Sources/llama/llama.h Normal file
View File

@ -0,0 +1,4 @@
#pragma once
#include <llama.h>

View File

@ -0,0 +1,5 @@
module llama [system] {
header "llama.h"
link "llama"
export *
}

View File

@ -6,5 +6,5 @@ includedir=${prefix}/include
Name: llama
Description: Port of Facebook's LLaMA model in C/C++
Version: @PROJECT_VERSION@
Libs: -L${libdir} -lllama
Libs: -L${libdir} -lggml -lggml-base -lllama
Cflags: -I${includedir}
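The llama.pc change reflects the library split: a pkg-config consumer now links `-lggml -lggml-base -lllama` rather than `-lllama` alone, matching the new SwiftPM system-library module above. A minimal sketch of a consumer built through pkg-config; the three `llama.h` calls are assumed to be the usual init/info/free entry points:

```cpp
// Build (assuming the package is installed and llama.pc is on PKG_CONFIG_PATH):
//   c++ -std=c++17 hello_llama.cpp $(pkg-config --cflags --libs llama)
// With the change above, --libs now expands to -lggml -lggml-base -lllama.
#include <cstdio>
#include <llama.h>

int main() {
    llama_backend_init();
    printf("%s\n", llama_print_system_info());
    llama_backend_free();
    return 0;
}
```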

View File

@ -0,0 +1,11 @@
set( CMAKE_SYSTEM_NAME Windows )
set( CMAKE_SYSTEM_PROCESSOR x86_64 )
set( CMAKE_C_COMPILER clang )
set( CMAKE_CXX_COMPILER clang++ )
set( arch_c_flags "-march=native" )
set( CMAKE_C_FLAGS_INIT "${arch_c_flags}" )
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags}" )

View File

@ -81,7 +81,7 @@ set(LLAMA_COMMON_EXTRA_LIBS build_info)
# Use curl to download model url
if (LLAMA_CURL)
find_package(CURL REQUIRED)
add_definitions(-DLLAMA_USE_CURL)
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
include_directories(${CURL_INCLUDE_DIRS})
find_library(CURL_LIBRARY curl REQUIRED)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})

View File

@ -145,6 +145,35 @@ static void common_params_handle_model_default(common_params & params) {
}
}
const std::vector<ggml_type> kv_cache_types = {
GGML_TYPE_F32,
GGML_TYPE_F16,
GGML_TYPE_BF16,
GGML_TYPE_Q8_0,
GGML_TYPE_Q4_0,
GGML_TYPE_Q4_1,
GGML_TYPE_IQ4_NL,
GGML_TYPE_Q5_0,
GGML_TYPE_Q5_1,
};
static ggml_type kv_cache_type_from_str(const std::string & s) {
for (const auto & type : kv_cache_types) {
if (ggml_type_name(type) == s) {
return type;
}
}
throw std::runtime_error("Unsupported cache type: " + s);
}
static std::string get_all_kv_cache_types() {
std::ostringstream msg;
for (const auto & type : kv_cache_types) {
msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
}
return msg.str();
}
//
// CLI argument parsing functions
//
@ -591,7 +620,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.ctx_shift = false;
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
add_opt(common_arg(
{"--chunks"}, "N",
string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@ -786,7 +815,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.warmup = false;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--spm-infill"},
string_format(
@ -1174,18 +1203,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
add_opt(common_arg(
{"-ctk", "--cache-type-k"}, "TYPE",
string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
string_format(
"KV cache data type for K\n"
"allowed values: %s\n"
"(default: %s)",
get_all_kv_cache_types().c_str(),
ggml_type_name(params.cache_type_k)
),
[](common_params & params, const std::string & value) {
// TODO: get the type right here
params.cache_type_k = value;
params.cache_type_k = kv_cache_type_from_str(value);
}
).set_env("LLAMA_ARG_CACHE_TYPE_K"));
add_opt(common_arg(
{"-ctv", "--cache-type-v"}, "TYPE",
string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
string_format(
"KV cache data type for V\n"
"allowed values: %s\n"
"(default: %s)",
get_all_kv_cache_types().c_str(),
ggml_type_name(params.cache_type_v)
),
[](common_params & params, const std::string & value) {
// TODO: get the type right here
params.cache_type_v = value;
params.cache_type_v = kv_cache_type_from_str(value);
}
).set_env("LLAMA_ARG_CACHE_TYPE_V"));
add_opt(common_arg(
@ -1711,6 +1750,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.public_path = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
add_opt(common_arg(
{"--no-webui"},
string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
[](common_params & params) {
params.webui = false;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI"));
add_opt(common_arg(
{"--embedding", "--embeddings"},
string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
@ -2076,35 +2122,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.speculative.n_max = value;
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
add_opt(common_arg(
{"--draft-min", "--draft-n-min"}, "N",
string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
[](common_params & params, int value) {
params.speculative.n_min = value;
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
add_opt(common_arg(
{"--draft-p-split"}, "P",
string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
[](common_params & params, const std::string & value) {
params.speculative.p_split = std::stof(value);
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT"));
add_opt(common_arg(
{"--draft-p-min"}, "P",
string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
[](common_params & params, const std::string & value) {
params.speculative.p_min = std::stof(value);
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
add_opt(common_arg(
{"-cd", "--ctx-size-draft"}, "N",
string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
[](common_params & params, int value) {
params.speculative.n_ctx = value;
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
add_opt(common_arg(
{"-devd", "--device-draft"}, "<dev1,dev2,..>",
"comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@ -2124,14 +2170,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
}
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
add_opt(common_arg(
{"-md", "--model-draft"}, "FNAME",
"draft model for speculative decoding (default: unused)",
[](common_params & params, const std::string & value) {
params.speculative.model = value;
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
return ctx_arg;
}
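For reference, the `-ctk`/`-ctv` handling above now validates the value against the `kv_cache_types` table and stores a `ggml_type` at argument-parse time; the old string-based converter is deleted from common.cpp in a later hunk. A standalone sketch of the same table-driven pattern, with stand-in types instead of the real `GGML_TYPE_*` values and `ggml_type_name()`:

```cpp
#include <cstdio>
#include <sstream>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

// Stand-ins for GGML_TYPE_* and ggml_type_name(); the real code iterates a
// std::vector<ggml_type> and calls ggml_type_name(type) instead.
enum class cache_type { f16, bf16, q8_0, q4_0 };

static const std::vector<std::pair<cache_type, std::string>> kv_cache_types = {
    { cache_type::f16,  "f16"  },
    { cache_type::bf16, "bf16" },
    { cache_type::q8_0, "q8_0" },
    { cache_type::q4_0, "q4_0" },
};

// -ctk/-ctv values are rejected here, at argument-parse time, instead of deep
// inside context creation as before.
static cache_type kv_cache_type_from_str(const std::string & s) {
    for (const auto & [type, name] : kv_cache_types) {
        if (name == s) {
            return type;
        }
    }
    throw std::runtime_error("Unsupported cache type: " + s);
}

// Builds the "allowed values: ..." part of the --help text.
static std::string get_all_kv_cache_types() {
    std::ostringstream msg;
    for (size_t i = 0; i < kv_cache_types.size(); ++i) {
        msg << kv_cache_types[i].second << (i + 1 < kv_cache_types.size() ? ", " : "");
    }
    return msg.str();
}

int main() {
    printf("allowed values: %s\n", get_all_kv_cache_types().c_str());
    kv_cache_type_from_str("q8_0");  // resolves; an unknown name would throw
    return 0;
}
```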

View File

@ -1015,38 +1015,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
return mparams;
}
static ggml_type kv_cache_type_from_str(const std::string & s) {
if (s == "f32") {
return GGML_TYPE_F32;
}
if (s == "f16") {
return GGML_TYPE_F16;
}
if (s == "bf16") {
return GGML_TYPE_BF16;
}
if (s == "q8_0") {
return GGML_TYPE_Q8_0;
}
if (s == "q4_0") {
return GGML_TYPE_Q4_0;
}
if (s == "q4_1") {
return GGML_TYPE_Q4_1;
}
if (s == "iq4_nl") {
return GGML_TYPE_IQ4_NL;
}
if (s == "q5_0") {
return GGML_TYPE_Q5_0;
}
if (s == "q5_1") {
return GGML_TYPE_Q5_1;
}
throw std::runtime_error("Unsupported cache type: " + s);
}
struct llama_context_params common_context_params_to_llama(const common_params & params) {
auto cparams = llama_context_default_params();
@ -1081,8 +1049,8 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
}
cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
cparams.type_k = params.cache_type_k;
cparams.type_v = params.cache_type_v;
return cparams;
}
@ -1108,12 +1076,6 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
#define CURL_MAX_RETRY 3
#define CURL_RETRY_DELAY_SECONDS 2
static bool starts_with(const std::string & str, const std::string & prefix) {
// While we wait for C++20's std::string::starts_with...
return str.rfind(prefix, 0) == 0;
}
static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
int remaining_attempts = max_attempts;

View File

@ -37,9 +37,9 @@ using llama_tokens = std::vector<llama_token>;
// build info
extern int LLAMA_BUILD_NUMBER;
extern char const * LLAMA_COMMIT;
extern char const * LLAMA_COMPILER;
extern char const * LLAMA_BUILD_TARGET;
extern const char * LLAMA_COMMIT;
extern const char * LLAMA_COMPILER;
extern const char * LLAMA_BUILD_TARGET;
struct common_control_vector_load_info;
@ -215,7 +215,7 @@ struct common_params {
struct common_params_speculative speculative;
std::string model = ""; // model path // NOLINT
std::string model_alias = "unknown"; // model alias // NOLINT
std::string model_alias = ""; // model alias // NOLINT
std::string model_url = ""; // model url to download // NOLINT
std::string hf_token = ""; // HF token // NOLINT
std::string hf_repo = ""; // HF repo // NOLINT
@ -286,8 +286,8 @@ struct common_params {
bool warmup = true; // warmup run
bool check_tensors = false; // validate tensor data
std::string cache_type_k = "f16"; // KV cache data type for the K
std::string cache_type_v = "f16"; // KV cache data type for the V
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector // NOLINT
@ -437,6 +437,11 @@ std::vector<std::string> string_split<std::string>(const std::string & input, ch
return parts;
}
static bool string_starts_with(const std::string & str,
const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
return str.rfind(prefix, 0) == 0;
}
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);
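A side note on the `string_starts_with` helper added above: `rfind(prefix, 0)` only considers a match that begins at index 0, so it acts as an efficient starts-with check without scanning the rest of the string, until C++20's `std::string::starts_with` is available. A small illustration:

```cpp
// rfind(prefix, 0) restricts the search to matches starting at position 0,
// so it returns 0 when the prefix is present and npos otherwise.
#include <cassert>
#include <string>

int main() {
    std::string url = "https://huggingface.co/model.gguf";
    assert(url.rfind("https://", 0) == 0);                   // prefix present
    assert(url.rfind("file://", 0) == std::string::npos);    // prefix absent
    return 0;
}
```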

View File

@ -62,6 +62,10 @@ struct common_speculative * common_speculative_init(
}
void common_speculative_free(struct common_speculative * spec) {
if (spec == nullptr) {
return;
}
common_sampler_free(spec->smpl);
llama_batch_free(spec->batch);
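The early return added to `common_speculative_free` makes it safe to call with a null pointer, mirroring `free()`/`delete` semantics so call sites can free unconditionally. A minimal sketch of the idiom with hypothetical stand-in names:

```cpp
// The free function is a no-op for a null handle, so callers need no guard.
struct spec_state { int dummy = 0; };

void spec_free(spec_state * s) {
    if (s == nullptr) {
        return;  // nothing to release
    }
    delete s;
}

int main() {
    spec_free(nullptr);           // safe
    spec_free(new spec_state());  // releases the state
    return 0;
}
```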

View File

@ -658,6 +658,12 @@ class Model:
if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
# ref: https://huggingface.co/facebook/chameleon-7b
res = "chameleon"
if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
# ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
res = "minerva-7b"
if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
# ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
res = "roberta-bpe"
if res is None:
logger.warning("\n")
@ -1831,29 +1837,40 @@ class MiniCPMModel(Model):
model_arch = gguf.MODEL_ARCH.MINICPM
def set_gguf_parameters(self):
block_count = self.hparams["num_hidden_layers"]
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
self.gguf_writer.add_file_type(self.ftype)
super().set_gguf_parameters()
embedding_scale = float(self.hparams["scale_emb"])
self.gguf_writer.add_embedding_scale(embedding_scale)
logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}")
residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5
self.gguf_writer.add_residual_scale(residual_scale)
logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}")
logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"]
self.gguf_writer.add_logit_scale(logit_scale)
logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}")
if self.hparams.get("rope_scaling") is not None:
if self.hparams["rope_scaling"].get("type") == "longrope":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
rope_scaling = self.find_hparam(['rope_scaling'], True)
if rope_scaling is not None:
long_factors = rope_scaling.get('long_factor', None)
short_factors = rope_scaling.get('short_factor', None)
if long_factors is None or short_factors is None:
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling.short_factor')
if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
def set_vocab(self):
self._set_vocab_llama_hf()
def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
if n_kv_head is not None and n_head != n_kv_head:
n_head //= n_kv_head
return (
weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2)
.reshape(weights.shape)
)
self._set_vocab_sentencepiece()
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
@ -1863,9 +1880,9 @@ class MiniCPMModel(Model):
# HF models permute some of the tensors, so we need to undo that
if name.endswith(("q_proj.weight")):
data_torch = self._reverse_hf_permute(data_torch, n_head, n_head)
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
if name.endswith(("k_proj.weight")):
data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head)
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
return [(self.map_tensor_name(name), data_torch)]
@ -1975,6 +1992,14 @@ class Qwen2Model(Model):
except FileNotFoundError:
self._set_vocab_gpt2()
def set_gguf_parameters(self):
super().set_gguf_parameters()
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "yarn":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
@Model.register("Qwen2MoeForCausalLM")
class Qwen2MoeModel(Model):
@ -2519,7 +2544,7 @@ class InternLM2Model(Model):
return [(self.map_tensor_name(name), data_torch)]
@Model.register("BertModel", "CamembertModel")
@Model.register("BertModel", "CamembertModel", "RobertaModel")
class BertModel(Model):
model_arch = gguf.MODEL_ARCH.BERT
@ -2560,7 +2585,8 @@ class BertModel(Model):
# we need this to validate the size of the token_type embeddings
# though currently we are passing all zeros to the token_type embeddings
self.gguf_writer.add_token_type_count(2) # "Sequence A" or "Sequence B"
# "Sequence A" or "Sequence B"
self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
# convert to phantom space vocab
def phantom(tok):

View File

@ -17,7 +17,7 @@
#
# python3 convert_hf_to_gguf_update.py <huggingface_token>
#
# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py
# - The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated
# - Update llama.cpp with the new pre-tokenizer if necessary
#
# TODO: generate tokenizer tests for llama.cpp
@ -102,6 +102,8 @@ models = [
{"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
{"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
{"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
{"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
{"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
]

View File

@ -26,19 +26,24 @@ cmake --build build --config Release
1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
```bash
cmake -B build -DCMAKE_BUILD_TYPE=Debug
cmake --build build
```
```bash
cmake -B build -DCMAKE_BUILD_TYPE=Debug
cmake --build build
```
2. Multi-config generators (`-G` param set to Visual Studio, XCode...):
```bash
cmake -B build -G "Xcode"
cmake --build build --config Debug
```
```bash
cmake -B build -G "Xcode"
cmake --build build --config Debug
```
For more details and a list of supported generators, see the [CMake documentation](https://cmake.org/cmake/help/latest/manual/cmake-generators.7.html).
- For static builds, add `-DBUILD_SHARED_LIBS=OFF`:
```
cmake -B build -DBUILD_SHARED_LIBS=OFF
cmake --build build --config Release
```
- Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
- Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
@ -50,7 +55,14 @@ cmake --build build --config Release
cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
cmake --build build-arm64-windows-llvm-release
```
Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_4_8 CPU kernels.
Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_N_M CPU kernels.
For building with the Ninja generator and clang as the default compiler, first set the library search path:
set LIB=C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\x64;C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.41.34120\lib\x64\uwp;C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\x64
```bash
cmake --preset x64-windows-llvm-release
cmake --build build-x64-windows-llvm-release
```
## BLAS Build

View File

@ -20,7 +20,12 @@ else()
add_subdirectory(batched)
add_subdirectory(embedding)
add_subdirectory(eval-callback)
add_subdirectory(gbnf-validator)
if (NOT WIN32)
# disabled on Windows because it uses internal functions not exported with LLAMA_API
add_subdirectory(gbnf-validator)
endif()
add_subdirectory(gguf-hash)
add_subdirectory(gguf-split)
add_subdirectory(gguf)
@ -46,12 +51,16 @@ else()
add_subdirectory(speculative)
add_subdirectory(speculative-simple)
add_subdirectory(tokenize)
add_subdirectory(gen-docs)
if (NOT GGML_BACKEND_DL)
# these examples use the backends directly and cannot be built with dynamic loading
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(cvector-generator)
add_subdirectory(export-lora)
add_subdirectory(quantize-stats)
if (NOT WIN32)
# disabled on Windows because it uses internal functions not exported with LLAMA_API
add_subdirectory(quantize-stats)
endif()
add_subdirectory(llava)
if (GGML_RPC)
add_subdirectory(rpc)

View File

@ -12,7 +12,7 @@ int main(int argc, char** argv) {
}
// Get only the program name from the full path
auto pos = filename.find_last_of('/');
auto pos = filename.find_last_of("/\\");
if (pos != std::string::npos) {
filename = filename.substr(pos+1);
}
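The `find_last_of("/\\")` change makes the basename extraction handle both Unix and Windows path separators, since `find_last_of` returns the position of the last occurrence of any character in the given set. A small self-contained check:

```cpp
// find_last_of("/\\") matches the last '/' or '\', so the same basename
// extraction works for Unix-style and Windows-style paths.
#include <cassert>
#include <string>

static std::string basename_of(std::string filename) {
    auto pos = filename.find_last_of("/\\");
    if (pos != std::string::npos) {
        filename = filename.substr(pos + 1);
    }
    return filename;
}

int main() {
    assert(basename_of("/usr/bin/llama-gen-docs") == "llama-gen-docs");
    assert(basename_of("C:\\tools\\llama-gen-docs.exe") == "llama-gen-docs.exe");
    return 0;
}
```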

View File

@ -287,7 +287,7 @@ struct split_strategy {
}
void print_info() {
printf("n_split: %ld\n", ctx_outs.size());
printf("n_split: %zu\n", ctx_outs.size());
int i_split = 0;
for (auto & ctx_out : ctx_outs) {
// re-calculate the real gguf size for each split (= metadata size + total size of all tensors)
@ -297,7 +297,7 @@ struct split_strategy {
total_size += ggml_nbytes(t);
}
total_size = total_size / 1000 / 1000; // convert to megabytes
printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
printf("split %05d: n_tensors = %d, total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
i_split++;
}
}
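The `%ld` to `%zu` changes here (and in llama-bench and the retrieval example further down) are portability fixes: `size_t` must be printed with `%zu`, because on LLP64 platforms such as 64-bit Windows `long` is 32-bit and passing a `size_t` to `%ld` is undefined behaviour. For example:

```cpp
// %zu is the portable printf conversion for size_t; %ld assumes long, which
// is only 32-bit on LLP64 systems (64-bit Windows).
#include <cstdio>
#include <vector>

int main() {
    std::vector<int> ctx_outs(3);
    printf("n_split: %zu\n", ctx_outs.size());
    return 0;
}
```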

View File

@ -14,7 +14,7 @@ In this section, we cover the most commonly used options for running the `infill
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 4096, but if a LLaMA model was built with a longer context, increasing this value will provide better results for longer input/inference.
- `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
## Input Prompts

View File

@ -1521,7 +1521,7 @@ int main(int argc, char ** argv) {
for (const auto & inst : params_instances) {
params_idx++;
if (params.progress) {
fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count);
fprintf(stderr, "llama-bench: benchmark %d/%zu: starting\n", params_idx, params_count);
}
// keep the same model between tests when possible
if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
@ -1573,14 +1573,14 @@ int main(int argc, char ** argv) {
// warmup run
if (t.n_prompt > 0) {
if (params.progress) {
fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count);
fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count);
}
//test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
}
if (t.n_gen > 0) {
if (params.progress) {
fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count);
fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count);
}
test_gen(ctx, 1, t.n_threads);
}
@ -1592,14 +1592,14 @@ int main(int argc, char ** argv) {
if (t.n_prompt > 0) {
if (params.progress) {
fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count,
fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count,
i + 1, params.reps);
}
test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
}
if (t.n_gen > 0) {
if (params.progress) {
fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count,
fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count,
i + 1, params.reps);
}
test_gen(ctx, t.n_gen, t.n_threads);

View File

@ -210,20 +210,20 @@ actor LlamaContext {
llama_kv_cache_clear(context)
let t_pp_start = ggml_time_us()
let t_pp_start = DispatchTime.now().uptimeNanoseconds / 1000;
if llama_decode(context, batch) != 0 {
print("llama_decode() failed during prompt")
}
llama_synchronize(context)
let t_pp_end = ggml_time_us()
let t_pp_end = DispatchTime.now().uptimeNanoseconds / 1000;
// bench text generation
llama_kv_cache_clear(context)
let t_tg_start = ggml_time_us()
let t_tg_start = DispatchTime.now().uptimeNanoseconds / 1000;
for i in 0..<tg {
llama_batch_clear(&batch)
@ -238,7 +238,7 @@ actor LlamaContext {
llama_synchronize(context)
}
let t_tg_end = ggml_time_us()
let t_tg_end = DispatchTime.now().uptimeNanoseconds / 1000;
llama_kv_cache_clear(context)

View File

@ -7,6 +7,7 @@
objects = {
/* Begin PBXBuildFile section */
1809696D2D05A39F00400EE8 /* llama in Frameworks */ = {isa = PBXBuildFile; productRef = 1809696C2D05A39F00400EE8 /* llama */; };
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
79E1D9CD2B4CD16E005F8E46 /* InputButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */; };
7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */; };
@ -17,7 +18,6 @@
8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
DF810E132B4A5BA200301144 /* llama in Frameworks */ = {isa = PBXBuildFile; productRef = DF810E122B4A5BA200301144 /* llama */; };
F1FE20E22B465ECA00B45541 /* LoadCustomButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */; };
/* End PBXBuildFile section */
@ -42,7 +42,7 @@
isa = PBXFrameworksBuildPhase;
buildActionMask = 2147483647;
files = (
DF810E132B4A5BA200301144 /* llama in Frameworks */,
1809696D2D05A39F00400EE8 /* llama in Frameworks */,
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */,
8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */,
);
@ -151,7 +151,7 @@
);
name = llama.swiftui;
packageProductDependencies = (
DF810E122B4A5BA200301144 /* llama */,
1809696C2D05A39F00400EE8 /* llama */,
);
productName = llama.swiftui;
productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */;
@ -429,7 +429,7 @@
/* End XCConfigurationList section */
/* Begin XCSwiftPackageProductDependency section */
DF810E122B4A5BA200301144 /* llama */ = {
1809696C2D05A39F00400EE8 /* llama */ = {
isa = XCSwiftPackageProductDependency;
productName = llama;
};

View File

@ -12,6 +12,10 @@
#include "ggml-cuda.h"
#endif
#ifdef GGML_USE_SYCL
#include "ggml-sycl.h"
#endif
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
@ -1169,6 +1173,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
#endif
#ifdef GGML_USE_SYCL
new_clip->backend = ggml_backend_sycl_init(0);
LOG_INF("%s: CLIP using SYCL backend\n", __func__);
#endif
if (!new_clip->backend) {
new_clip->backend = ggml_backend_cpu_init();
LOG_INF("%s: CLIP using CPU backend\n", __func__);

View File

@ -66,7 +66,7 @@ In this section, we cover the most commonly used options for running the `llama-
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g [https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true](https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true)).
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 4096, but if a LLaMA model was built with a longer context, increasing this value will provide better results for longer input/inference.
- `-mli, --multiline-input`: Allows you to write or paste multiple lines without ending each in '\'
- `-t N, --threads N`: Set the number of threads to use during generation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has.
- `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
@ -131,7 +131,7 @@ During text generation, LLaMA models have a limited context size, which means th
### Context Size
- `-c N, --ctx-size N`: Set the size of the prompt context (default: 0, 0 = loaded from model). The LLaMA models were built with a context of 2048-8192, which will yield the best results on longer input/inference.
- `-c N, --ctx-size N`: Set the size of the prompt context (default: 4096, 0 = loaded from model). If a LLaMA model was built with a longer context, increasing this value will yield the best results on longer input/inference.
### Extended Context Size
@ -348,6 +348,7 @@ These options provide extra functionality and customization when running the LLa
- `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
- `--verbose-prompt`: Print the prompt before generating text.
- `--no-display-prompt`: Don't print prompt at generation.
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
- `-hfr URL --hf-repo URL`: The url to the Hugging Face model repository. Used in conjunction with `--hf-file` or `-hff`. The model is downloaded and stored in the file provided by `-m` or `--model`. If `-m` is not provided, the model is auto-stored in the path specified by the `LLAMA_CACHE` environment variable or in an OS-specific local cache.

View File

@ -54,8 +54,6 @@ As the models are currently fully loaded into memory, you will need adequate dis
Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
The quantization formats `Q4_0_4_4`, `Q4_0_4_8` and `Q4_0_8_8` are block interleaved variants of the `Q4_0` format, providing a data layout that is better suited for specific implementations of optimized mulmat kernels. Since these formats differ only in data layout, they have the same quantized size as the `Q4_0` format.
*(outdated)*
| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
@ -83,7 +81,7 @@ The quantization formats `Q4_0_4_4`, `Q4_0_4_8` and `Q4_0_8_8` are block interle
- [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930)
- [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957)
- [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969)
- [#4996 - k-qunats tuning](https://github.com/ggerganov/llama.cpp/pull/4996)
- [#4996 - k-quants tuning](https://github.com/ggerganov/llama.cpp/pull/4996)
- [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060)
- [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196)
- [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361)

View File

@ -48,9 +48,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
{ "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },

View File

@ -143,7 +143,7 @@ int main(int argc, char ** argv) {
std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
}
LOG_INF("Number of chunks: %ld\n", chunks.size());
LOG_INF("Number of chunks: %zu\n", chunks.size());
llama_backend_init();
llama_numa_init(params.numa);

View File

@ -1,5 +1,5 @@
set(TARGET llama-run)
add_executable(${TARGET} run.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -3,5 +3,45 @@
The purpose of this example is to demonstrate a minimal usage of llama.cpp for running models.
```bash
./llama-run Meta-Llama-3.1-8B-Instruct.gguf
llama-run granite-code
...
```bash
llama-run -h
Description:
Runs a llm
Usage:
llama-run [options] model [prompt]
Options:
-c, --context-size <value>
Context size (default: 2048)
-n, --ngl <value>
Number of GPU layers (default: 0)
-h, --help
Show help message
Commands:
model
Model is a string with an optional prefix of
huggingface:// (hf://), ollama://, https:// or file://.
If no protocol is specified and a file exists in the specified
path, file:// is assumed, otherwise if a file does not exist in
the specified path, ollama:// is assumed. Models that are being
pulled are downloaded with .partial extension while being
downloaded and then renamed as the file without the .partial
extension when complete.
Examples:
llama-run llama3
llama-run ollama://granite-code
llama-run ollama://smollm:135m
llama-run hf://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf
llama-run huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf
llama-run https://example.com/some-file1.gguf
llama-run some-file2.gguf
llama-run file://some-file3.gguf
llama-run --ngl 99 some-file4.gguf
llama-run --ngl 99 some-file5.gguf Hello World
...

View File

@ -1,128 +1,350 @@
#if defined(_WIN32)
#include <windows.h>
# include <windows.h>
#else
#include <unistd.h>
# include <unistd.h>
#endif
#include <climits>
#if defined(LLAMA_USE_CURL)
# include <curl/curl.h>
#endif
#include <cstdarg>
#include <cstdio>
#include <cstring>
#include <filesystem>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
#include "common.h"
#include "json.hpp"
#include "llama-cpp.h"
typedef std::unique_ptr<char[]> char_array_ptr;
#define printe(...) \
do { \
fprintf(stderr, __VA_ARGS__); \
} while (0)
struct Argument {
std::string flag;
std::string help_text;
};
class Opt {
public:
int init(int argc, const char ** argv) {
construct_help_str_();
// Parse arguments
if (parse(argc, argv)) {
printe("Error: Failed to parse arguments.\n");
help();
return 1;
}
struct Options {
std::string model_path, prompt_non_interactive;
int ngl = 99;
int n_ctx = 2048;
};
// If help is requested, show help and exit
if (help_) {
help();
return 2;
}
class ArgumentParser {
public:
ArgumentParser(const char * program_name) : program_name(program_name) {}
void add_argument(const std::string & flag, std::string & var, const std::string & help_text = "") {
string_args[flag] = &var;
arguments.push_back({flag, help_text});
return 0; // Success
}
void add_argument(const std::string & flag, int & var, const std::string & help_text = "") {
int_args[flag] = &var;
arguments.push_back({flag, help_text});
std::string model_;
std::string user_;
int context_size_ = 2048, ngl_ = -1;
private:
std::string help_str_;
bool help_ = false;
void construct_help_str_() {
help_str_ =
"Description:\n"
" Runs a llm\n"
"\n"
"Usage:\n"
" llama-run [options] model [prompt]\n"
"\n"
"Options:\n"
" -c, --context-size <value>\n"
" Context size (default: " +
std::to_string(context_size_);
help_str_ +=
")\n"
" -n, --ngl <value>\n"
" Number of GPU layers (default: " +
std::to_string(ngl_);
help_str_ +=
")\n"
" -h, --help\n"
" Show help message\n"
"\n"
"Commands:\n"
" model\n"
" Model is a string with an optional prefix of \n"
" huggingface:// (hf://), ollama://, https:// or file://.\n"
" If no protocol is specified and a file exists in the specified\n"
" path, file:// is assumed, otherwise if a file does not exist in\n"
" the specified path, ollama:// is assumed. Models that are being\n"
" pulled are downloaded with .partial extension while being\n"
" downloaded and then renamed as the file without the .partial\n"
" extension when complete.\n"
"\n"
"Examples:\n"
" llama-run llama3\n"
" llama-run ollama://granite-code\n"
" llama-run ollama://smollm:135m\n"
" llama-run hf://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf\n"
" llama-run huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf\n"
" llama-run https://example.com/some-file1.gguf\n"
" llama-run some-file2.gguf\n"
" llama-run file://some-file3.gguf\n"
" llama-run --ngl 99 some-file4.gguf\n"
" llama-run --ngl 99 some-file5.gguf Hello World\n";
}
int parse(int argc, const char ** argv) {
int positional_args_i = 0;
for (int i = 1; i < argc; ++i) {
std::string arg = argv[i];
if (string_args.count(arg)) {
if (i + 1 < argc) {
*string_args[arg] = argv[++i];
} else {
fprintf(stderr, "error: missing value for %s\n", arg.c_str());
print_usage();
if (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0) {
if (i + 1 >= argc) {
return 1;
}
} else if (int_args.count(arg)) {
if (i + 1 < argc) {
if (parse_int_arg(argv[++i], *int_args[arg]) != 0) {
fprintf(stderr, "error: invalid value for %s: %s\n", arg.c_str(), argv[i]);
print_usage();
return 1;
}
} else {
fprintf(stderr, "error: missing value for %s\n", arg.c_str());
print_usage();
context_size_ = std::atoi(argv[++i]);
} else if (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "--ngl") == 0) {
if (i + 1 >= argc) {
return 1;
}
ngl_ = std::atoi(argv[++i]);
} else if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) {
help_ = true;
return 0;
} else if (!positional_args_i) {
++positional_args_i;
model_ = argv[i];
} else if (positional_args_i == 1) {
++positional_args_i;
user_ = argv[i];
} else {
fprintf(stderr, "error: unrecognized argument %s\n", arg.c_str());
print_usage();
return 1;
user_ += " " + std::string(argv[i]);
}
}
if (string_args["-m"]->empty()) {
fprintf(stderr, "error: -m is required\n");
print_usage();
return model_.empty(); // model_ is the only required value
}
void help() const { printf("%s", help_str_.c_str()); }
};
struct progress_data {
size_t file_size = 0;
std::chrono::steady_clock::time_point start_time = std::chrono::steady_clock::now();
bool printed = false;
};
struct FileDeleter {
void operator()(FILE * file) const {
if (file) {
fclose(file);
}
}
};
typedef std::unique_ptr<FILE, FileDeleter> FILE_ptr;
#ifdef LLAMA_USE_CURL
class CurlWrapper {
public:
int init(const std::string & url, const std::vector<std::string> & headers, const std::string & output_file,
const bool progress, std::string * response_str = nullptr) {
std::string output_file_partial;
curl = curl_easy_init();
if (!curl) {
return 1;
}
progress_data data;
FILE_ptr out;
if (!output_file.empty()) {
output_file_partial = output_file + ".partial";
out.reset(fopen(output_file_partial.c_str(), "ab"));
}
set_write_options(response_str, out);
data.file_size = set_resume_point(output_file_partial);
set_progress_options(progress, data);
set_headers(headers);
perform(url);
if (!output_file.empty()) {
std::filesystem::rename(output_file_partial, output_file);
}
return 0;
}
private:
const char * program_name;
std::unordered_map<std::string, std::string *> string_args;
std::unordered_map<std::string, int *> int_args;
std::vector<Argument> arguments;
~CurlWrapper() {
if (chunk) {
curl_slist_free_all(chunk);
}
int parse_int_arg(const char * arg, int & value) {
char * end;
const long val = std::strtol(arg, &end, 10);
if (*end == '\0' && val >= INT_MIN && val <= INT_MAX) {
value = static_cast<int>(val);
if (curl) {
curl_easy_cleanup(curl);
}
}
private:
CURL * curl = nullptr;
struct curl_slist * chunk = nullptr;
void set_write_options(std::string * response_str, const FILE_ptr & out) {
if (response_str) {
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, capture_data);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, response_str);
} else {
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_data);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, out.get());
}
}
size_t set_resume_point(const std::string & output_file) {
size_t file_size = 0;
if (std::filesystem::exists(output_file)) {
file_size = std::filesystem::file_size(output_file);
curl_easy_setopt(curl, CURLOPT_RESUME_FROM_LARGE, static_cast<curl_off_t>(file_size));
}
return file_size;
}
void set_progress_options(bool progress, progress_data & data) {
if (progress) {
curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
curl_easy_setopt(curl, CURLOPT_XFERINFODATA, &data);
curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, progress_callback);
}
}
void set_headers(const std::vector<std::string> & headers) {
if (!headers.empty()) {
if (chunk) {
curl_slist_free_all(chunk);
chunk = 0;
}
for (const auto & header : headers) {
chunk = curl_slist_append(chunk, header.c_str());
}
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, chunk);
}
}
void perform(const std::string & url) {
CURLcode res;
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
curl_easy_setopt(curl, CURLOPT_DEFAULT_PROTOCOL, "https");
curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1L);
res = curl_easy_perform(curl);
if (res != CURLE_OK) {
printe("curl_easy_perform() failed: %s\n", curl_easy_strerror(res));
}
}
static std::string human_readable_time(double seconds) {
int hrs = static_cast<int>(seconds) / 3600;
int mins = (static_cast<int>(seconds) % 3600) / 60;
int secs = static_cast<int>(seconds) % 60;
std::ostringstream out;
if (hrs > 0) {
out << hrs << "h " << std::setw(2) << std::setfill('0') << mins << "m " << std::setw(2) << std::setfill('0')
<< secs << "s";
} else if (mins > 0) {
out << mins << "m " << std::setw(2) << std::setfill('0') << secs << "s";
} else {
out << secs << "s";
}
return out.str();
}
static std::string human_readable_size(curl_off_t size) {
static const char * suffix[] = { "B", "KB", "MB", "GB", "TB" };
char length = sizeof(suffix) / sizeof(suffix[0]);
int i = 0;
double dbl_size = size;
if (size > 1024) {
for (i = 0; (size / 1024) > 0 && i < length - 1; i++, size /= 1024) {
dbl_size = size / 1024.0;
}
}
std::ostringstream out;
out << std::fixed << std::setprecision(2) << dbl_size << " " << suffix[i];
return out.str();
}
static int progress_callback(void * ptr, curl_off_t total_to_download, curl_off_t now_downloaded, curl_off_t,
curl_off_t) {
progress_data * data = static_cast<progress_data *>(ptr);
if (total_to_download <= 0) {
return 0;
}
return 1;
}
void print_usage() const {
printf("\nUsage:\n");
printf(" %s [OPTIONS]\n\n", program_name);
printf("Options:\n");
for (const auto & arg : arguments) {
printf(" %-10s %s\n", arg.flag.c_str(), arg.help_text.c_str());
total_to_download += data->file_size;
const curl_off_t now_downloaded_plus_file_size = now_downloaded + data->file_size;
const curl_off_t percentage = (now_downloaded_plus_file_size * 100) / total_to_download;
const curl_off_t pos = (percentage / 5);
std::string progress_bar;
for (int i = 0; i < 20; ++i) {
progress_bar.append((i < pos) ? "█" : " ");
}
printf("\n");
// Calculate download speed and estimated time to completion
const auto now = std::chrono::steady_clock::now();
const std::chrono::duration<double> elapsed_seconds = now - data->start_time;
const double speed = now_downloaded / elapsed_seconds.count();
const double estimated_time = (total_to_download - now_downloaded) / speed;
printe("\r%ld%% |%s| %s/%s %.2f MB/s %s ", percentage, progress_bar.c_str(),
human_readable_size(now_downloaded).c_str(), human_readable_size(total_to_download).c_str(),
speed / (1024 * 1024), human_readable_time(estimated_time).c_str());
fflush(stderr);
data->printed = true;
return 0;
}
// Function to write data to a file
static size_t write_data(void * ptr, size_t size, size_t nmemb, void * stream) {
FILE * out = static_cast<FILE *>(stream);
return fwrite(ptr, size, nmemb, out);
}
// Function to capture data into a string
static size_t capture_data(void * ptr, size_t size, size_t nmemb, void * stream) {
std::string * str = static_cast<std::string *>(stream);
str->append(static_cast<char *>(ptr), size * nmemb);
return size * nmemb;
}
};
#endif
class LlamaData {
public:
llama_model_ptr model;
llama_sampler_ptr sampler;
llama_context_ptr context;
public:
llama_model_ptr model;
llama_sampler_ptr sampler;
llama_context_ptr context;
std::vector<llama_chat_message> messages;
std::vector<std::string> msg_strs;
std::vector<char> fmtted;
int init(const Options & opt) {
model = initialize_model(opt.model_path, opt.ngl);
int init(Opt & opt) {
model = initialize_model(opt);
if (!model) {
return 1;
}
context = initialize_context(model, opt.n_ctx);
context = initialize_context(model, opt.context_size_);
if (!context) {
return 1;
}
@ -131,15 +353,123 @@ class LlamaData {
return 0;
}
private:
// Initializes the model and returns a unique pointer to it
llama_model_ptr initialize_model(const std::string & model_path, const int ngl) {
llama_model_params model_params = llama_model_default_params();
model_params.n_gpu_layers = ngl;
private:
#ifdef LLAMA_USE_CURL
int download(const std::string & url, const std::vector<std::string> & headers, const std::string & output_file,
const bool progress, std::string * response_str = nullptr) {
CurlWrapper curl;
if (curl.init(url, headers, output_file, progress, response_str)) {
return 1;
}
llama_model_ptr model(llama_load_model_from_file(model_path.c_str(), model_params));
return 0;
}
#else
int download(const std::string &, const std::vector<std::string> &, const std::string &, const bool,
std::string * = nullptr) {
printe("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
return 1;
}
#endif
int huggingface_dl(const std::string & model, const std::vector<std::string> headers, const std::string & bn) {
// Find the second occurrence of '/' after protocol string
size_t pos = model.find('/');
pos = model.find('/', pos + 1);
if (pos == std::string::npos) {
return 1;
}
const std::string hfr = model.substr(0, pos);
const std::string hff = model.substr(pos + 1);
const std::string url = "https://huggingface.co/" + hfr + "/resolve/main/" + hff;
return download(url, headers, bn, true);
}
int ollama_dl(std::string & model, const std::vector<std::string> headers, const std::string & bn) {
if (model.find('/') == std::string::npos) {
model = "library/" + model;
}
std::string model_tag = "latest";
size_t colon_pos = model.find(':');
if (colon_pos != std::string::npos) {
model_tag = model.substr(colon_pos + 1);
model = model.substr(0, colon_pos);
}
std::string manifest_url = "https://registry.ollama.ai/v2/" + model + "/manifests/" + model_tag;
std::string manifest_str;
const int ret = download(manifest_url, headers, "", false, &manifest_str);
if (ret) {
return ret;
}
nlohmann::json manifest = nlohmann::json::parse(manifest_str);
std::string layer;
for (const auto & l : manifest["layers"]) {
if (l["mediaType"] == "application/vnd.ollama.image.model") {
layer = l["digest"];
break;
}
}
std::string blob_url = "https://registry.ollama.ai/v2/" + model + "/blobs/" + layer;
return download(blob_url, headers, bn, true);
}
std::string basename(const std::string & path) {
const size_t pos = path.find_last_of("/\\");
if (pos == std::string::npos) {
return path;
}
return path.substr(pos + 1);
}
int remove_proto(std::string & model_) {
const std::string::size_type pos = model_.find("://");
if (pos == std::string::npos) {
return 1;
}
model_ = model_.substr(pos + 3); // Skip past "://"
return 0;
}
int resolve_model(std::string & model_) {
const std::string bn = basename(model_);
const std::vector<std::string> headers = { "--header",
"Accept: application/vnd.docker.distribution.manifest.v2+json" };
int ret = 0;
if (string_starts_with(model_, "file://") || std::filesystem::exists(bn)) {
remove_proto(model_);
} else if (string_starts_with(model_, "hf://") || string_starts_with(model_, "huggingface://")) {
remove_proto(model_);
ret = huggingface_dl(model_, headers, bn);
} else if (string_starts_with(model_, "ollama://")) {
remove_proto(model_);
ret = ollama_dl(model_, headers, bn);
} else if (string_starts_with(model_, "https://")) {
download(model_, headers, bn, true);
} else {
ret = ollama_dl(model_, headers, bn);
}
model_ = bn;
return ret;
}
// Initializes the model and returns a unique pointer to it
llama_model_ptr initialize_model(Opt & opt) {
ggml_backend_load_all();
llama_model_params model_params = llama_model_default_params();
model_params.n_gpu_layers = opt.ngl_ >= 0 ? opt.ngl_ : model_params.n_gpu_layers;
resolve_model(opt.model_);
llama_model_ptr model(llama_load_model_from_file(opt.model_.c_str(), model_params));
if (!model) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str());
}
return model;
@ -148,12 +478,11 @@ class LlamaData {
// Initializes the context with the specified parameters
llama_context_ptr initialize_context(const llama_model_ptr & model, const int n_ctx) {
llama_context_params ctx_params = llama_context_default_params();
ctx_params.n_ctx = n_ctx;
ctx_params.n_batch = n_ctx;
ctx_params.n_ctx = n_ctx;
ctx_params.n_batch = n_ctx;
llama_context_ptr context(llama_new_context_with_model(model.get(), ctx_params));
if (!context) {
fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__);
printe("%s: error: failed to create the llama_context\n", __func__);
}
return context;
@ -170,23 +499,22 @@ class LlamaData {
}
};
// Add a message to `messages` and store its content in `owned_content`
static void add_message(const char * role, const std::string & text, LlamaData & llama_data,
std::vector<char_array_ptr> & owned_content) {
char_array_ptr content(new char[text.size() + 1]);
std::strcpy(content.get(), text.c_str());
llama_data.messages.push_back({role, content.get()});
owned_content.push_back(std::move(content));
// Add a message to `messages` and store its content in `msg_strs`
static void add_message(const char * role, const std::string & text, LlamaData & llama_data) {
llama_data.msg_strs.push_back(std::move(text));
llama_data.messages.push_back({ role, llama_data.msg_strs.back().c_str() });
}
// Function to apply the chat template and resize `formatted` if needed
static int apply_chat_template(const LlamaData & llama_data, std::vector<char> & formatted, const bool append) {
int result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(),
llama_data.messages.size(), append, formatted.data(), formatted.size());
if (result > static_cast<int>(formatted.size())) {
formatted.resize(result);
static int apply_chat_template(LlamaData & llama_data, const bool append) {
int result = llama_chat_apply_template(
llama_data.model.get(), nullptr, llama_data.messages.data(), llama_data.messages.size(), append,
append ? llama_data.fmtted.data() : nullptr, append ? llama_data.fmtted.size() : 0);
if (append && result > static_cast<int>(llama_data.fmtted.size())) {
llama_data.fmtted.resize(result);
result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(),
llama_data.messages.size(), append, formatted.data(), formatted.size());
llama_data.messages.size(), append, llama_data.fmtted.data(),
llama_data.fmtted.size());
}
return result;
@ -199,7 +527,8 @@ static int tokenize_prompt(const llama_model_ptr & model, const std::string & pr
prompt_tokens.resize(n_prompt_tokens);
if (llama_tokenize(model.get(), prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true,
true) < 0) {
GGML_ABORT("failed to tokenize the prompt\n");
printe("failed to tokenize the prompt\n");
return -1;
}
return n_prompt_tokens;
@ -207,11 +536,11 @@ static int tokenize_prompt(const llama_model_ptr & model, const std::string & pr
// Check if we have enough space in the context to evaluate this batch
static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) {
const int n_ctx = llama_n_ctx(ctx.get());
const int n_ctx = llama_n_ctx(ctx.get());
const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get());
if (n_ctx_used + batch.n_tokens > n_ctx) {
printf("\033[0m\n");
fprintf(stderr, "context size exceeded\n");
printe("context size exceeded\n");
return 1;
}
@ -221,9 +550,10 @@ static int check_context_size(const llama_context_ptr & ctx, const llama_batch &
// convert the token to a string
static int convert_token_to_string(const llama_model_ptr & model, const llama_token token_id, std::string & piece) {
char buf[256];
int n = llama_token_to_piece(model.get(), token_id, buf, sizeof(buf), 0, true);
int n = llama_token_to_piece(model.get(), token_id, buf, sizeof(buf), 0, true);
if (n < 0) {
GGML_ABORT("failed to convert token to piece\n");
printe("failed to convert token to piece\n");
return 1;
}
piece = std::string(buf, n);
@ -238,19 +568,19 @@ static void print_word_and_concatenate_to_response(const std::string & piece, st
// helper function to evaluate a prompt and generate a response
static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) {
std::vector<llama_token> prompt_tokens;
const int n_prompt_tokens = tokenize_prompt(llama_data.model, prompt, prompt_tokens);
if (n_prompt_tokens < 0) {
std::vector<llama_token> tokens;
if (tokenize_prompt(llama_data.model, prompt, tokens) < 0) {
return 1;
}
// prepare a batch for the prompt
llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
llama_batch batch = llama_batch_get_one(tokens.data(), tokens.size());
llama_token new_token_id;
while (true) {
check_context_size(llama_data.context, batch);
if (llama_decode(llama_data.context.get(), batch)) {
GGML_ABORT("failed to decode\n");
printe("failed to decode\n");
return 1;
}
// sample the next token, check is it an end of generation?
@ -273,22 +603,9 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
return 0;
}
static int parse_arguments(const int argc, const char ** argv, Options & opt) {
ArgumentParser parser(argv[0]);
parser.add_argument("-m", opt.model_path, "model");
parser.add_argument("-p", opt.prompt_non_interactive, "prompt");
parser.add_argument("-c", opt.n_ctx, "context_size");
parser.add_argument("-ngl", opt.ngl, "n_gpu_layers");
if (parser.parse(argc, argv)) {
return 1;
}
return 0;
}
static int read_user_input(std::string & user) {
std::getline(std::cin, user);
return user.empty(); // Indicate an error or empty input
return user.empty(); // Should have data in happy path
}
// Function to generate a response based on the prompt
@ -296,7 +613,7 @@ static int generate_response(LlamaData & llama_data, const std::string & prompt,
// Set response color
printf("\033[33m");
if (generate(llama_data, prompt, response)) {
fprintf(stderr, "failed to generate response\n");
printe("failed to generate response\n");
return 1;
}
@ -306,11 +623,10 @@ static int generate_response(LlamaData & llama_data, const std::string & prompt,
}
// Helper function to apply the chat template and handle errors
static int apply_chat_template_with_error_handling(const LlamaData & llama_data, std::vector<char> & formatted,
const bool is_user_input, int & output_length) {
const int new_len = apply_chat_template(llama_data, formatted, is_user_input);
static int apply_chat_template_with_error_handling(LlamaData & llama_data, const bool append, int & output_length) {
const int new_len = apply_chat_template(llama_data, append);
if (new_len < 0) {
fprintf(stderr, "failed to apply the chat template\n");
printe("failed to apply the chat template\n");
return -1;
}
@ -319,56 +635,63 @@ static int apply_chat_template_with_error_handling(const LlamaData & llama_data,
}
// Helper function to handle user input
static bool handle_user_input(std::string & user_input, const std::string & prompt_non_interactive) {
if (!prompt_non_interactive.empty()) {
user_input = prompt_non_interactive;
return true; // No need for interactive input
static int handle_user_input(std::string & user_input, const std::string & user_) {
if (!user_.empty()) {
user_input = user_;
return 0; // No need for interactive input
}
printf("\033[32m> \033[0m");
return !read_user_input(user_input); // Returns false if input ends the loop
printf(
"\r "
"\r\033[32m> \033[0m");
return read_user_input(user_input); // Returns true if input ends the loop
}
// Function to tokenize the prompt
static int chat_loop(LlamaData & llama_data, std::string & prompt_non_interactive) {
std::vector<char_array_ptr> owned_content;
std::vector<char> fmtted(llama_n_ctx(llama_data.context.get()));
static int chat_loop(LlamaData & llama_data, const std::string & user_) {
int prev_len = 0;
llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get()));
while (true) {
// Get user input
std::string user_input;
if (!handle_user_input(user_input, prompt_non_interactive)) {
break;
while (handle_user_input(user_input, user_)) {
}
add_message("user", prompt_non_interactive.empty() ? user_input : prompt_non_interactive, llama_data,
owned_content);
add_message("user", user_.empty() ? user_input : user_, llama_data);
int new_len;
if (apply_chat_template_with_error_handling(llama_data, fmtted, true, new_len) < 0) {
if (apply_chat_template_with_error_handling(llama_data, true, new_len) < 0) {
return 1;
}
std::string prompt(fmtted.begin() + prev_len, fmtted.begin() + new_len);
std::string prompt(llama_data.fmtted.begin() + prev_len, llama_data.fmtted.begin() + new_len);
std::string response;
if (generate_response(llama_data, prompt, response)) {
return 1;
}
if (!user_.empty()) {
break;
}
add_message("assistant", response, llama_data);
if (apply_chat_template_with_error_handling(llama_data, false, prev_len) < 0) {
return 1;
}
}
return 0;
}
static void log_callback(const enum ggml_log_level level, const char * text, void *) {
if (level == GGML_LOG_LEVEL_ERROR) {
fprintf(stderr, "%s", text);
printe("%s", text);
}
}
static bool is_stdin_a_terminal() {
#if defined(_WIN32)
HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE);
DWORD mode;
DWORD mode;
return GetConsoleMode(hStdin, &mode);
#else
return isatty(STDIN_FILENO);
@ -382,17 +705,20 @@ static std::string read_pipe_data() {
}
int main(int argc, const char ** argv) {
Options opt;
if (parse_arguments(argc, argv, opt)) {
Opt opt;
const int ret = opt.init(argc, argv);
if (ret == 2) {
return 0;
} else if (ret) {
return 1;
}
if (!is_stdin_a_terminal()) {
if (!opt.prompt_non_interactive.empty()) {
opt.prompt_non_interactive += "\n\n";
if (!opt.user_.empty()) {
opt.user_ += "\n\n";
}
opt.prompt_non_interactive += read_pipe_data();
opt.user_ += read_pipe_data();
}
llama_log_set(log_callback, nullptr);
@ -401,7 +727,7 @@ int main(int argc, const char ** argv) {
return 1;
}
if (chat_loop(llama_data, opt.prompt_non_interactive)) {
if (chat_loop(llama_data, opt.user_)) {
return 1;
}

View File

@ -16,12 +16,7 @@ set(TARGET_SRCS
)
set(PUBLIC_ASSETS
index.html
completion.js
loading.html
deps_daisyui.min.css
deps_markdown-it.js
deps_tailwindcss.js
deps_vue.esm-browser.js
)
foreach(asset ${PUBLIC_ASSETS})
@ -33,6 +28,7 @@ foreach(asset ${PUBLIC_ASSETS})
OUTPUT "${output}"
COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake"
)
set_source_files_properties(${output} PROPERTIES GENERATED TRUE)
endforeach()
add_executable(${TARGET} ${TARGET_SRCS})

View File

@ -62,8 +62,8 @@ The project is under active development, and we are [looking for feedback and co
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
| `-dkvc, --dump-kv-cache` | verbose print of the KV cache |
| `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: 0.1, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
@ -138,6 +138,7 @@ The project is under active development, and we are [looking for feedback and co
| -------- | ----------- |
| `--no-context-shift` | disables context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) |
| `-sp, --special` | special tokens output enabled (default: false) |
| `--no-warmup` | skip warming up the model with an empty run |
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
| `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
@ -146,6 +147,7 @@ The project is under active development, and we are [looking for feedback and co
| `--host HOST` | ip address to listen (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
| `--no-webui` | Disable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_NO_WEBUI) |
| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
| `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
| `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
@ -163,13 +165,13 @@ The project is under active development, and we are [looking for feedback and co
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>list of built-in templates:<br/>chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, exaone3, gemma, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, monarch, openchat, orion, phi3, rwkv-world, vicuna, vicuna-orca, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16) |
| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5) |
| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9) |
| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model) |
| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5)<br/>(env: LLAMA_ARG_DRAFT_MIN) |
| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9)<br/>(env: LLAMA_ARG_DRAFT_P_MIN) |
| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE_DRAFT) |
| `-devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model |
| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused) |
| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_MODEL_DRAFT) |
Note: If a command line argument and an environment variable are both set for the same param, the argument will take precedence over the env var.
@ -217,6 +219,37 @@ services:
cmake --build build --config Release -t llama-server
```
## Web UI
The project includes a web-based user interface that enables interaction with the model through the `/chat/completions` endpoint.
The web UI is developed using:
- `vue` framework for frontend development
- `tailwindcss` and `daisyui` for styling
- `vite` for build tooling
A pre-built version is available as a single HTML file under the `/public` directory.
To build or to run the dev server (with hot reload):
```sh
# make sure you have nodejs installed
cd examples/server/webui
npm i
# to run the dev server
npm run dev
# to build the public/index.html
npm run build
```
NOTE: if you are using the vite dev server, you can change the API base URL so that it points at the llama.cpp server. To do that, run this code snippet in the browser's console:
```js
localStorage.setItem('base', 'http://localhost:8080')
```
## Quick Start
To get started right away, run the following command, making sure to use the correct path for the model you have:
@ -271,23 +304,23 @@ mkdir llama-client
cd llama-client
```
Create a index.js file and put this inside:
Create an index.js file and put this inside:
```javascript
const prompt = `Building a website can be done in 10 simple steps:`;
const prompt = "Building a website can be done in 10 simple steps:"
async function Test() {
async function test() {
let response = await fetch("http://127.0.0.1:8080/completion", {
method: 'POST',
method: "POST",
body: JSON.stringify({
prompt,
n_predict: 512,
n_predict: 64,
})
})
console.log((await response.json()).content)
}
Test()
test()
```
And run it:
@ -311,110 +344,110 @@ node index.js
### POST `/completion`: Given a `prompt`, it returns the predicted completion.
*Options:*
*Options:*
`prompt`: Provide the prompt for this completion as a string or as an array of strings or numbers representing tokens. Internally, if `cache_prompt` is `true`, the prompt is compared to the previous completion and only the "unseen" suffix is evaluated. A `BOS` token is inserted at the start, if all of the following conditions are true:
`prompt`: Provide the prompt for this completion as a string or as an array of strings or numbers representing tokens. Internally, if `cache_prompt` is `true`, the prompt is compared to the previous completion and only the "unseen" suffix is evaluated. A `BOS` token is inserted at the start, if all of the following conditions are true:
- The prompt is a string or an array with the first element given as a string
- The model's `tokenizer.ggml.add_bos_token` metadata is `true`
- The prompt is a string or an array with the first element given as a string
- The model's `tokenizer.ggml.add_bos_token` metadata is `true`
These input shapes and data type are allowed for `prompt`:
These input shapes and data type are allowed for `prompt`:
- Single string: `"string"`
- Single sequence of tokens: `[12, 34, 56]`
- Mixed tokens and strings: `[12, 34, "string", 56, 78]`
- Single string: `"string"`
- Single sequence of tokens: `[12, 34, 56]`
- Mixed tokens and strings: `[12, 34, "string", 56, 78]`
Multiple prompts are also supported. In this case, the completion result will be an array.
Multiple prompts are also supported. In this case, the completion result will be an array.
- Only strings: `["string1", "string2"]`
- Strings and sequences of tokens: `["string1", [12, 34, 56]]`
- Mixed types: `[[12, 34, "string", 56, 78], [12, 34, 56], "string"]`
- Only strings: `["string1", "string2"]`
- Strings and sequences of tokens: `["string1", [12, 34, 56]]`
- Mixed types: `[[12, 34, "string", 56, 78], [12, 34, 56], "string"]`
`temperature`: Adjust the randomness of the generated text. Default: `0.8`
`temperature`: Adjust the randomness of the generated text. Default: `0.8`
`dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` Default: `0.0`, which is disabled.
`dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` Default: `0.0`, which is disabled.
`dynatemp_exponent`: Dynamic temperature exponent. Default: `1.0`
`dynatemp_exponent`: Dynamic temperature exponent. Default: `1.0`
`top_k`: Limit the next token selection to the K most probable tokens. Default: `40`
`top_k`: Limit the next token selection to the K most probable tokens. Default: `40`
`top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P. Default: `0.95`
`top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P. Default: `0.95`
`min_p`: The minimum probability for a token to be considered, relative to the probability of the most likely token. Default: `0.05`
`min_p`: The minimum probability for a token to be considered, relative to the probability of the most likely token. Default: `0.05`
`n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.
`n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.
`n_indent`: Specify the minimum line indentation for the generated text in number of whitespace characters. Useful for code completion tasks. Default: `0`
`n_indent`: Specify the minimum line indentation for the generated text in number of whitespace characters. Useful for code completion tasks. Default: `0`
`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token.
By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.
`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token.
By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.
`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
`stream`: Allows receiving each predicted token in real-time instead of waiting for the completion to finish (uses a different response format). To enable this, set to `true`.
`stop`: Specify a JSON array of stopping strings.
These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]`
`stop`: Specify a JSON array of stopping strings.
These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]`
`typical_p`: Enable locally typical sampling with parameter p. Default: `1.0`, which is disabled.
`typical_p`: Enable locally typical sampling with parameter p. Default: `1.0`, which is disabled.
`repeat_penalty`: Control the repetition of token sequences in the generated text. Default: `1.1`
`repeat_penalty`: Control the repetition of token sequences in the generated text. Default: `1.1`
`repeat_last_n`: Last n tokens to consider for penalizing repetition. Default: `64`, where `0` is disabled and `-1` is ctx-size.
`repeat_last_n`: Last n tokens to consider for penalizing repetition. Default: `64`, where `0` is disabled and `-1` is ctx-size.
`penalize_nl`: Penalize newline tokens when applying the repeat penalty. Default: `true`
`penalize_nl`: Penalize newline tokens when applying the repeat penalty. Default: `true`
`presence_penalty`: Repeat alpha presence penalty. Default: `0.0`, which is disabled.
`presence_penalty`: Repeat alpha presence penalty. Default: `0.0`, which is disabled.
`frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled.
`frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled.
`dry_multiplier`: Set the DRY (Don't Repeat Yourself) repetition penalty multiplier. Default: `0.0`, which is disabled.
`dry_multiplier`: Set the DRY (Don't Repeat Yourself) repetition penalty multiplier. Default: `0.0`, which is disabled.
`dry_base`: Set the DRY repetition penalty base value. Default: `1.75`
`dry_base`: Set the DRY repetition penalty base value. Default: `1.75`
`dry_allowed_length`: Tokens that extend repetition beyond this receive exponentially increasing penalty: multiplier * base ^ (length of repeating sequence before token - allowed length). Default: `2`
`dry_allowed_length`: Tokens that extend repetition beyond this receive exponentially increasing penalty: multiplier * base ^ (length of repeating sequence before token - allowed length). Default: `2`
`dry_penalty_last_n`: How many tokens to scan for repetitions. Default: `-1`, where `0` is disabled and `-1` is context size.
`dry_penalty_last_n`: How many tokens to scan for repetitions. Default: `-1`, where `0` is disabled and `-1` is context size.
`dry_sequence_breakers`: Specify an array of sequence breakers for DRY sampling. Only a JSON array of strings is accepted. Default: `['\n', ':', '"', '*']`
`dry_sequence_breakers`: Specify an array of sequence breakers for DRY sampling. Only a JSON array of strings is accepted. Default: `['\n', ':', '"', '*']`
`xtc_probability`: Set the chance for token removal via XTC sampler. Default: `0.0`, which is disabled.
`xtc_probability`: Set the chance for token removal via XTC sampler. Default: `0.0`, which is disabled.
`xtc_threshold`: Set a minimum probability threshold for tokens to be removed via XTC sampler. Default: `0.1` (> `0.5` disables XTC)
`xtc_threshold`: Set a minimum probability threshold for tokens to be removed via XTC sampler. Default: `0.1` (> `0.5` disables XTC)
`mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.
`mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.
`mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0`
`mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0`
`mirostat_eta`: Set the Mirostat learning rate, parameter eta. Default: `0.1`
`mirostat_eta`: Set the Mirostat learning rate, parameter eta. Default: `0.1`
`grammar`: Set grammar for grammar-based sampling. Default: no grammar
`grammar`: Set grammar for grammar-based sampling. Default: no grammar
`json_schema`: Set a JSON schema for grammar-based sampling (e.g. `{"items": {"type": "string"}, "minItems": 10, "maxItems": 100}` of a list of strings, or `{}` for any JSON). See [tests](../../tests/test-json-schema-to-grammar.cpp) for supported features. Default: no JSON schema.
`json_schema`: Set a JSON schema for grammar-based sampling (e.g. `{"items": {"type": "string"}, "minItems": 10, "maxItems": 100}` of a list of strings, or `{}` for any JSON). See [tests](../../tests/test-json-schema-to-grammar.cpp) for supported features. Default: no JSON schema.
`seed`: Set the random number generator (RNG) seed. Default: `-1`, which is a random seed.
`seed`: Set the random number generator (RNG) seed. Default: `-1`, which is a random seed.
`ignore_eos`: Ignore end of stream token and continue generating. Default: `false`
`ignore_eos`: Ignore end of stream token and continue generating. Default: `false`
`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. Default: `[]`
`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. Default: `[]`
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token given the sampling settings. Note that for temperature < 0 the tokens are sampled greedily but token probabilities are still being calculated via a simple softmax of the logits without considering any other sampler settings. Default: `0`
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token given the sampling settings. Note that for temperature < 0 the tokens are sampled greedily but token probabilities are still being calculated via a simple softmax of the logits without considering any other sampler settings. Default: `0`
`min_keep`: If greater than 0, force samplers to return N possible tokens at minimum. Default: `0`
`min_keep`: If greater than 0, force samplers to return N possible tokens at minimum. Default: `0`
`t_max_predict_ms`: Set a time limit in milliseconds for the prediction (a.k.a. text-generation) phase. The timeout will trigger if the generation takes more than the specified time (measured since the first token was generated) and if a new-line character has already been generated. Useful for FIM applications. Default: `0`, which is disabled.
`t_max_predict_ms`: Set a time limit in milliseconds for the prediction (a.k.a. text-generation) phase. The timeout will trigger if the generation takes more than the specified time (measured since the first token was generated) and if a new-line character has already been generated. Useful for FIM applications. Default: `0`, which is disabled.
`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be referenced in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be referenced in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
`id_slot`: Assign the completion task to a specific slot. If it is -1, the task will be assigned to an idle slot. Default: `-1`
`id_slot`: Assign the completion task to a specific slot. If it is -1, the task will be assigned to an idle slot. Default: `-1`
`cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `true`
`cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `true`
`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.
`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.
`timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false`
`timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false`
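
For orientation, here is a minimal sketch of a `/completion` request that combines a few of the options above. It assumes a server already running on the default `127.0.0.1:8080` and the third-party `requests` package; the prompt and option values are illustrative, not recommendations.

```python
import requests

# Illustrative non-streaming completion request using a few of the documented options.
payload = {
    "prompt": "Building a website can be done in 10 simple steps:",
    "n_predict": 64,       # maximum number of tokens to generate
    "temperature": 0.8,
    "top_k": 40,
    "top_p": 0.95,
    "cache_prompt": True,  # reuse the common KV-cache prefix when possible
}

response = requests.post("http://127.0.0.1:8080/completion", json=payload)
response.raise_for_status()
print(response.json()["content"])
```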
**Response format**
- Note: When using streaming mode (`stream`), only `content` and `stop` will be returned until end of completion.
- Note: In streaming mode (`stream`), only `content` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support.
- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure:
@ -442,9 +475,11 @@ Notice that each `probs` is an array of length `n_probs`.
- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`. These options may differ from the original ones in some way (e.g. bad values filtered out, strings converted to tokens, etc.).
- `model`: The path to the model loaded with `-m`
- `prompt`: The provided `prompt`
- `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token
- `stopped_limit`: Indicating whether the completion stopped because `n_predict` tokens were generated before stop words or EOS was encountered
- `stopped_word`: Indicating whether the completion stopped due to encountering a stopping word from `stop` JSON array provided
- `stop_type`: Indicating whether the completion has stopped. Possible values are:
- `none`: Generating (not stopped)
- `eos`: Stopped because it encountered the EOS token
- `limit`: Stopped because `n_predict` tokens were generated before stop words or EOS was encountered
- `word`: Stopped due to encountering a stopping word from `stop` JSON array provided
- `stopping_word`: The stopping word encountered which stopped the generation (or "" if not stopped due to a stopping word)
- `timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second`
- `tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`)
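
As noted above, when `stream` is set to `true` the tokens arrive as server-sent events. A rough sketch of consuming that stream from Python follows (again assuming the `requests` package and a local default server; only the documented `content` and `stop` fields are relied on):

```python
import json
import requests

payload = {"prompt": "Hello", "n_predict": 32, "stream": True}

with requests.post("http://127.0.0.1:8080/completion", json=payload, stream=True) as r:
    r.raise_for_status()
    for line in r.iter_lines(decode_unicode=True):
        # Server-sent events: payload lines are prefixed with "data: "
        if not line or not line.startswith("data: "):
            continue
        chunk = json.loads(line[len("data: "):])
        print(chunk.get("content", ""), end="", flush=True)
        if chunk.get("stop"):
            break
```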
@ -453,13 +488,13 @@ Notice that each `probs` is an array of length `n_probs`.
### POST `/tokenize`: Tokenize a given text
*Options:*
*Options:*
`content`: (Required) The text to tokenize.
`content`: (Required) The text to tokenize.
`add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
`add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
`with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs. Default: `false`
`with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs. Default: `false`
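
A short sketch of a `/tokenize` call with these options (same assumptions as the completion example above: local default server, `requests` package):

```python
import requests

payload = {
    "content": "Hello, world!",
    "add_special": True,   # let the server insert BOS if the model expects it
    "with_pieces": False,  # set to True to also receive the text piece per token
}

r = requests.post("http://127.0.0.1:8080/tokenize", json=payload)
r.raise_for_status()
print(r.json()["tokens"])  # list of token ids (or id/piece objects with with_pieces)
```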
**Response:**
@ -496,52 +531,52 @@ With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k
### POST `/detokenize`: Convert tokens to text
*Options:*
*Options:*
`tokens`: Set the tokens to detokenize.
`tokens`: Set the tokens to detokenize.
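
And the reverse direction, a sketch of `/detokenize` (the token ids below are placeholders):

```python
import requests

r = requests.post(
    "http://127.0.0.1:8080/detokenize",
    json={"tokens": [12, 34, 56]},  # placeholder token ids
)
r.raise_for_status()
print(r.json()["content"])          # the reconstructed text
```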
### POST `/embedding`: Generate embedding of a given text
This works the same as [the embedding example](../embedding).
*Options:*
*Options:*
`content`: Set the text to process.
`content`: Set the text to process.
`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be referenced in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be referenced in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
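
A hedged sketch of a text-only `/embedding` request (the exact shape of the returned JSON can vary between server versions, so it is printed as-is):

```python
import requests

r = requests.post(
    "http://127.0.0.1:8080/embedding",
    json={"content": "Some text to embed"},
)
r.raise_for_status()
print(r.json())  # embedding data for the given content
```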
### POST `/reranking`: Rerank documents according to a given query
Similar to https://jina.ai/reranker/ but might change in the future.
Requires a reranker model (such as [bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3)) and the `--embedding --pooling rank` options.
*Options:*
*Options:*
`query`: The query against which the documents will be ranked.
`query`: The query against which the documents will be ranked.
`documents`: An array of strings representing the documents to be ranked.
`documents`: An array of strings representing the documents to be ranked.
*Aliases:*
- `/rerank`
- `/v1/rerank`
- `/v1/reranking`
*Aliases:*
- `/rerank`
- `/v1/rerank`
- `/v1/reranking`
*Examples:*
*Examples:*
```shell
curl http://127.0.0.1:8012/v1/rerank \
-H "Content-Type: application/json" \
-d '{
"model": "some-model",
"query": "What is panda?",
"top_n": 3,
"documents": [
"hi",
"it is a bear",
"The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China."
]
}' | jq
```
```shell
curl http://127.0.0.1:8012/v1/rerank \
-H "Content-Type: application/json" \
-d '{
"model": "some-model",
"query": "What is panda?",
"top_n": 3,
"documents": [
"hi",
"it is a bear",
"The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China."
]
}' | jq
```
### POST `/infill`: For code infilling.
@ -585,14 +620,83 @@ This endpoint is public (no API key check). By default, it is read-only. To make
```json
{
"default_generation_settings": { ... },
"default_generation_settings": {
"id": 0,
"id_task": -1,
"n_ctx": 1024,
"speculative": false,
"is_processing": false,
"params": {
"n_predict": -1,
"seed": 4294967295,
"temperature": 0.800000011920929,
"dynatemp_range": 0.0,
"dynatemp_exponent": 1.0,
"top_k": 40,
"top_p": 0.949999988079071,
"min_p": 0.05000000074505806,
"xtc_probability": 0.0,
"xtc_threshold": 0.10000000149011612,
"typical_p": 1.0,
"repeat_last_n": 64,
"repeat_penalty": 1.0,
"presence_penalty": 0.0,
"frequency_penalty": 0.0,
"dry_multiplier": 0.0,
"dry_base": 1.75,
"dry_allowed_length": 2,
"dry_penalty_last_n": -1,
"dry_sequence_breakers": [
"\n",
":",
"\"",
"*"
],
"mirostat": 0,
"mirostat_tau": 5.0,
"mirostat_eta": 0.10000000149011612,
"penalize_nl": false,
"stop": [],
"max_tokens": -1,
"n_keep": 0,
"n_discard": 0,
"ignore_eos": false,
"stream": true,
"n_probs": 0,
"min_keep": 0,
"grammar": "",
"samplers": [
"dry",
"top_k",
"typ_p",
"top_p",
"min_p",
"xtc",
"temperature"
],
"speculative.n_max": 16,
"speculative.n_min": 5,
"speculative.p_min": 0.8999999761581421,
"timings_per_token": false
},
"prompt": "",
"next_token": {
"has_next_token": true,
"has_new_line": false,
"n_remain": -1,
"n_decoded": 0,
"stopping_word": ""
}
},
"total_slots": 1,
"chat_template": ""
"model_path": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",
"chat_template": "..."
}
```
- `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint.
- `total_slots` - the total number of slots for processing requests (defined by the `--parallel` option)
- `model_path` - the path to the model file (same as the `-m` argument)
- `chat_template` - the model's original Jinja2 prompt template
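
A small sketch of reading these fields from the endpoint (assuming the default host/port and the `requests` package):

```python
import requests

props = requests.get("http://127.0.0.1:8080/props").json()
print(props["total_slots"])  # number of slots, see the --parallel option
print(props["model_path"])   # same as the -m argument
print(props["default_generation_settings"]["params"]["temperature"])
```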
### POST `/props`: Change server global properties.
@ -607,89 +711,89 @@ To use this endpoint with POST method, you need to start server with `--props`
Given a ChatML-formatted JSON description in `messages`, it returns the predicted completion. Both synchronous and streaming modes are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with the OpenAI API spec are being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
*Options:*
*Options:*
See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.
The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}` or `{"type": "json_schema", "schema": {"properties": { "name": { "title": "Name", "type": "string" }, "date": { "title": "Date", "type": "string" }, "participants": { "items": { "type": "string" }, "title": "Participants", "type": "array" } } } }`), similar to other OpenAI-inspired API providers.
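For example, a schema-constrained chat request might look like this (a sketch; the schema and message are illustrative):

```shell
curl http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "messages": [
            {"role": "user", "content": "Name a city and its country as JSON."}
        ],
        "response_format": {
            "type": "json_object",
            "schema": {
                "type": "object",
                "properties": {
                    "city": { "type": "string" },
                    "country": { "type": "string" }
                },
                "required": ["city", "country"]
            }
        }
    }'
```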
*Examples:*
You can use either the Python `openai` library with appropriate checkpoints:
```python
import openai

client = openai.OpenAI(
    base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
    api_key = "sk-no-key-required"
)

completion = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."},
        {"role": "user", "content": "Write a limerick about python exceptions"}
    ]
)

print(completion.choices[0].message)
```
... or raw HTTP requests:
```shell
curl http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer no-key" \
    -d '{
        "model": "gpt-3.5-turbo",
        "messages": [
            {
                "role": "system",
                "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."
            },
            {
                "role": "user",
                "content": "Write a limerick about python exceptions"
            }
        ]
    }'
```
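Streaming works the same way: set `"stream": true` and read the response as server-sent events (a sketch reusing the request above):

```shell
curl -N http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer no-key" \
    -d '{
        "model": "gpt-3.5-turbo",
        "stream": true,
        "messages": [
            {"role": "user", "content": "Write a limerick about python exceptions"}
        ]
    }'
```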
### POST `/v1/embeddings`: OpenAI-compatible embeddings API
*Options:*
See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-reference/embeddings).
*Examples:*
- `input` as string
```shell
curl http://localhost:8080/v1/embeddings \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer no-key" \
    -d '{
        "input": "hello",
        "model":"GPT-4",
        "encoding_format": "float"
    }'
```
- `input` as string array
```shell
curl http://localhost:8080/v1/embeddings \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer no-key" \
    -d '{
        "input": ["hello", "world"],
        "model":"GPT-4",
        "encoding_format": "float"
    }'
```
### GET `/slots`: Returns the current slots processing state
@ -706,56 +810,74 @@ Example:
```json
[
{
"id": 0,
"id_task": -1,
"n_ctx": 1024,
"speculative": false,
"is_processing": false,
"params": {
"n_predict": -1,
"seed": 4294967295,
"temperature": 0.800000011920929,
"dynatemp_range": 0.0,
"dynatemp_exponent": 1.0,
"top_k": 40,
"top_p": 0.949999988079071,
"min_p": 0.05000000074505806,
"xtc_probability": 0.0,
"xtc_threshold": 0.10000000149011612,
"typical_p": 1.0,
"repeat_last_n": 64,
"repeat_penalty": 1.0,
"presence_penalty": 0.0,
"frequency_penalty": 0.0,
"dry_multiplier": 0.0,
"dry_base": 1.75,
"dry_allowed_length": 2,
"dry_penalty_last_n": -1,
"dry_sequence_breakers": [
"\n",
":",
"\"",
"*"
],
"mirostat": 0,
"mirostat_tau": 5.0,
"mirostat_eta": 0.10000000149011612,
"penalize_nl": false,
"stop": [],
"max_tokens": -1,
"n_keep": 0,
"n_discard": 0,
"ignore_eos": false,
"stream": true,
"n_probs": 0,
"min_keep": 0,
"grammar": "",
"samplers": [
"dry",
"top_k",
"typ_p",
"top_p",
"min_p",
"xtc",
"temperature"
],
"speculative.n_max": 16,
"speculative.n_min": 5,
"speculative.p_min": 0.8999999761581421,
"timings_per_token": false
},
"prompt": "",
"next_token": {
"has_next_token": true,
"has_new_line": false,
"n_remain": -1,
"n_decoded": 0,
"stopping_word": ""
}
}
]
```
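For reference, this state can be fetched with a plain GET request; note that the endpoint has to be enabled with the `--slots` server flag, otherwise it responds with a 501 error:

```shell
curl http://localhost:8080/slots | jq
```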
@ -775,9 +897,9 @@ Available metrics:
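The metrics are exposed in Prometheus text format and can be scraped directly (assuming the server was started with `--metrics`):

```shell
curl http://localhost:8080/metrics
```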
### POST `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file.
*Options:*
`filename`: Name of the file to save the slot's prompt cache. The file will be saved in the directory specified by the `--slot-save-path` server parameter.
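For example (a sketch; the slot id and filename are placeholders):

```shell
curl -X POST "http://localhost:8080/slots/0?action=save" \
    -H "Content-Type: application/json" \
    -d '{"filename": "slot0.bin"}'
```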
**Response format**
@ -795,9 +917,9 @@ Available metrics:
### POST `/slots/{id_slot}?action=restore`: Restore the prompt cache of the specified slot from a file.
*Options:*
`filename`: Name of the file to restore the slot's prompt cache from. The file should be located in the directory specified by the `--slot-save-path` server parameter.
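And to restore it later (again a sketch with placeholder values):

```shell
curl -X POST "http://localhost:8080/slots/0?action=restore" \
    -H "Content-Type: application/json" \
    -d '{"filename": "slot0.bin"}'
```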
**Response format**

View File

@ -1,25 +0,0 @@
#!/bin/bash
# Download and update deps for binary
# get the directory of this script file
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
PUBLIC=$DIR/public
echo "download js bundle files"
# Note for contributors: Always pin to a specific version "maj.min.patch" to avoid breaking the CI
curl -L https://cdn.tailwindcss.com/3.4.14 > $PUBLIC/deps_tailwindcss.js
echo >> $PUBLIC/deps_tailwindcss.js # add newline
curl -L https://cdnjs.cloudflare.com/ajax/libs/daisyui/4.12.14/styled.min.css > $PUBLIC/deps_daisyui.min.css
curl -L https://cdnjs.cloudflare.com/ajax/libs/daisyui/4.12.14/themes.min.css >> $PUBLIC/deps_daisyui.min.css
echo >> $PUBLIC/deps_daisyui.min.css # add newline
curl -L https://unpkg.com/vue@3.5.12/dist/vue.esm-browser.js > $PUBLIC/deps_vue.esm-browser.js
echo >> $PUBLIC/deps_vue.esm-browser.js # add newline
curl -L https://cdnjs.cloudflare.com/ajax/libs/markdown-it/13.0.2/markdown-it.js > $PUBLIC/deps_markdown-it.js
echo >> $PUBLIC/deps_markdown-it.js # add newline
ls -lah $PUBLIC

View File

@ -1,225 +0,0 @@
const paramDefaults = {
stream: true,
temperature: 0.2,
};
let generation_settings = null;
export class CompletionError extends Error {
constructor(message, name, data) {
super(message);
this.name = name;
}
};
// Completes the prompt as a generator. Recommended for most use cases.
//
// Example:
//
// import { llama } from '/completion.js'
//
// const request = llama("Tell me a joke", {n_predict: 800})
// for await (const chunk of request) {
// document.write(chunk.data.content)
// }
//
export async function* llama(prompt, params = {}, config = {}) {
let controller = config.controller;
const api_url = config.api_url?.replace(/\/+$/, '') || "";
if (!controller) {
controller = new AbortController();
}
const completionParams = { ...paramDefaults, ...params, prompt };
const response = await fetch(`${api_url}${config.endpoint || '/completion'}`, {
method: 'POST',
body: JSON.stringify(completionParams),
headers: {
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Accept': 'text/event-stream',
...(params.api_key ? {'Authorization': `Bearer ${params.api_key}`} : {})
},
signal: controller.signal,
});
const status = response.status;
if (status !== 200) {
try {
const body = await response.json();
if (body && body.error && body.error.message) {
throw new CompletionError(body.error.message, 'ServerError');
}
} catch (err) {
throw new CompletionError(err.message, 'ServerError');
}
}
const reader = response.body.getReader();
const decoder = new TextDecoder();
let content = "";
let leftover = ""; // Buffer for partially read lines
try {
let cont = true;
while (cont) {
const result = await reader.read();
if (result.done) {
break;
}
// Add any leftover data to the current chunk of data
const text = leftover + decoder.decode(result.value);
// Check if the last character is a line break
const endsWithLineBreak = text.endsWith('\n');
// Split the text into lines
let lines = text.split('\n');
// If the text doesn't end with a line break, then the last line is incomplete
// Store it in leftover to be added to the next chunk of data
if (!endsWithLineBreak) {
leftover = lines.pop();
} else {
leftover = ""; // Reset leftover if we have a line break at the end
}
// Parse all sse events and add them to result
const regex = /^(\S+):\s(.*)$/gm;
for (const line of lines) {
const match = regex.exec(line);
if (match) {
result[match[1]] = match[2];
if (result.data === '[DONE]') {
cont = false;
break;
}
// since we know this is llama.cpp, let's just decode the json in data
if (result.data) {
result.data = JSON.parse(result.data);
content += result.data.content;
// yield
yield result;
// if we got a stop token from server, we will break here
if (result.data.stop) {
if (result.data.generation_settings) {
generation_settings = result.data.generation_settings;
}
cont = false;
break;
}
}
if (result.error) {
try {
result.error = JSON.parse(result.error);
if (result.error.message.includes('slot unavailable')) {
// Throw an error to be caught by upstream callers
throw new Error('slot unavailable');
} else {
console.error(`llama.cpp error [${result.error.code} - ${result.error.type}]: ${result.error.message}`);
}
} catch(e) {
console.error(`llama.cpp error ${result.error}`)
}
}
}
}
}
} catch (e) {
if (e.name !== 'AbortError') {
console.error("llama error: ", e);
}
throw e;
}
finally {
controller.abort();
}
return content;
}
// Call llama, return an event target that you can subscribe to
//
// Example:
//
// import { llamaEventTarget } from '/completion.js'
//
// const conn = llamaEventTarget(prompt)
// conn.addEventListener("message", (chunk) => {
// document.write(chunk.detail.content)
// })
//
export const llamaEventTarget = (prompt, params = {}, config = {}) => {
const eventTarget = new EventTarget();
(async () => {
let content = "";
for await (const chunk of llama(prompt, params, config)) {
if (chunk.data) {
content += chunk.data.content;
eventTarget.dispatchEvent(new CustomEvent("message", { detail: chunk.data }));
}
if (chunk.data.generation_settings) {
eventTarget.dispatchEvent(new CustomEvent("generation_settings", { detail: chunk.data.generation_settings }));
}
if (chunk.data.timings) {
eventTarget.dispatchEvent(new CustomEvent("timings", { detail: chunk.data.timings }));
}
}
eventTarget.dispatchEvent(new CustomEvent("done", { detail: { content } }));
})();
return eventTarget;
}
// Call llama, return a promise that resolves to the completed text. This does not support streaming
//
// Example:
//
// llamaPromise(prompt).then((content) => {
// document.write(content)
// })
//
// or
//
// const content = await llamaPromise(prompt)
// document.write(content)
//
export const llamaPromise = (prompt, params = {}, config = {}) => {
return new Promise(async (resolve, reject) => {
let content = "";
try {
for await (const chunk of llama(prompt, params, config)) {
content += chunk.data.content;
}
resolve(content);
} catch (error) {
reject(error);
}
});
};
/**
* (deprecated)
*/
export const llamaComplete = async (params, controller, callback) => {
for await (const chunk of llama(params.prompt, params, { controller })) {
callback(chunk);
}
}
// Get the model info from the server. This is useful for getting the context window and so on.
export const llamaModelInfo = async (config = {}) => {
if (!generation_settings) {
const api_url = config.api_url?.replace(/\/+$/, '') || "";
const props = await fetch(`${api_url}/props`).then(r => r.json());
generation_settings = props.default_generation_settings;
}
return generation_settings;
}

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

View File

@ -407,6 +407,9 @@ class SimpleChat {
if (curLine.startsWith("data:")) {
curLine = curLine.substring(5);
}
if (curLine.trim() === "[DONE]") {
break;
}
let curJson = JSON.parse(curLine);
console.debug("DBUG:SC:PART:Json:", curJson);
this.append_response(this.response_extract_stream(curJson, apiEP));

File diff suppressed because it is too large

View File

@ -44,4 +44,10 @@ To run with stdout/stderr display in real time (verbose output, but useful for d
DEBUG=1 ./tests.sh -s -v -x
```
Hint: You can compile and run the tests in a single command, which is useful for local development:
```shell
cmake --build build -j --target llama-server && ./examples/server/tests/tests.sh
```
To see all available arguments, please refer to [pytest documentation](https://docs.pytest.org/en/stable/how-to/usage.html)
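To run only a subset of tests, pass a pytest node id through the wrapper script (the file and test names below are illustrative):

```shell
./tests.sh unit/test_completion.py::test_n_probs -v
```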

View File

@ -1,5 +1,9 @@
#!/bin/bash
# make sure we are in the right directory
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd $SCRIPT_DIR
set -eu
if [ $# -lt 1 ]

View File

@ -1,4 +1,5 @@
import pytest
import requests
from utils import *
server = ServerPreset.tinyllama2()
@ -22,7 +23,12 @@ def test_server_props():
server.start()
res = server.make_request("GET", "/props")
assert res.status_code == 200
assert ".gguf" in res.body["model_path"]
assert res.body["total_slots"] == server.n_slots
default_val = res.body["default_generation_settings"]
assert server.n_ctx is not None and server.n_slots is not None
assert default_val["n_ctx"] == server.n_ctx / server.n_slots
assert default_val["params"]["seed"] == server.seed
def test_server_models():
@ -33,6 +39,31 @@ def test_server_models():
assert len(res.body["data"]) == 1
assert res.body["data"][0]["id"] == server.model_alias
def test_server_slots():
global server
# without slots endpoint enabled, this should return error
server.server_slots = False
server.start()
res = server.make_request("GET", "/slots")
assert res.status_code == 501 # ERROR_TYPE_NOT_SUPPORTED
assert "error" in res.body
server.stop()
# with slots endpoint enabled, this should return slots info
server.server_slots = True
server.n_slots = 2
server.start()
res = server.make_request("GET", "/slots")
assert res.status_code == 200
assert len(res.body) == server.n_slots
assert server.n_ctx is not None and server.n_slots is not None
assert res.body[0]["n_ctx"] == server.n_ctx / server.n_slots
assert "params" in res.body[0]
assert res.body[0]["params"]["seed"] == server.seed
def test_load_split_model():
global server
server.model_hf_repo = "ggml-org/models"
@ -46,3 +77,20 @@ def test_load_split_model():
})
assert res.status_code == 200
assert match_regex("(little|girl)+", res.body["content"])
def test_no_webui():
global server
# default: webui enabled
server.start()
url = f"http://{server.server_host}:{server.server_port}"
res = requests.get(url)
assert res.status_code == 200
assert "<html>" in res.text
server.stop()
# with --no-webui
server.no_webui = True
server.start()
res = requests.get(url)
assert res.status_code == 404

View File

@ -12,13 +12,13 @@ def create_server():
@pytest.mark.parametrize(
"model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,truncated",
"model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason",
[
("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, False),
("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, False),
(None, "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"),
("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length"),
]
)
def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, truncated):
def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason):
global server
server.start()
res = server.make_request("POST", "/chat/completions", data={
@ -30,29 +30,28 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
],
})
assert res.status_code == 200
assert "cmpl" in res.body["id"] # make sure the completion id has the expected format
assert res.body["model"] == model if model is not None else server.model_alias
assert res.body["usage"]["prompt_tokens"] == n_prompt
assert res.body["usage"]["completion_tokens"] == n_predicted
choice = res.body["choices"][0]
assert "assistant" == choice["message"]["role"]
assert match_regex(re_content, choice["message"]["content"])
if truncated:
assert choice["finish_reason"] == "length"
else:
assert choice["finish_reason"] == "stop"
assert choice["finish_reason"] == finish_reason
@pytest.mark.parametrize(
"model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,truncated",
"system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason",
[
("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, False),
("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, False),
("Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"),
("You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length"),
]
)
def test_chat_completion_stream(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, truncated):
def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason):
global server
server.model_alias = None # try using DEFAULT_OAICOMPAT_MODEL
server.start()
res = server.make_stream_request("POST", "/chat/completions", data={
"model": model,
"max_tokens": max_tokens,
"messages": [
{"role": "system", "content": system_prompt},
@ -61,18 +60,19 @@ def test_chat_completion_stream(model, system_prompt, user_prompt, max_tokens, r
"stream": True,
})
content = ""
last_cmpl_id = None
for data in res:
choice = data["choices"][0]
assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future
if last_cmpl_id is None:
last_cmpl_id = data["id"]
assert last_cmpl_id == data["id"] # make sure the completion id is the same for all events in the stream
if choice["finish_reason"] in ["stop", "length"]:
assert data["usage"]["prompt_tokens"] == n_prompt
assert data["usage"]["completion_tokens"] == n_predicted
assert "content" not in choice["delta"]
assert match_regex(re_content, content)
# FIXME: not sure why this is incorrect in stream mode
# if truncated:
# assert choice["finish_reason"] == "length"
# else:
# assert choice["finish_reason"] == "stop"
assert choice["finish_reason"] == finish_reason
else:
assert choice["finish_reason"] is None
content += choice["delta"]["content"]
@ -93,7 +93,7 @@ def test_chat_completion_with_openai_library():
temperature=0.8,
)
print(res)
assert res.choices[0].finish_reason == "stop"
assert res.choices[0].finish_reason == "length"
assert res.choices[0].message.content is not None
assert match_regex("(Suddenly)+", res.choices[0].message.content)

View File

@ -42,15 +42,39 @@ def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_promp
})
content = ""
for data in res:
assert "stop" in data and type(data["stop"]) == bool
if data["stop"]:
assert data["timings"]["prompt_n"] == n_prompt
assert data["timings"]["predicted_n"] == n_predicted
assert data["truncated"] == truncated
assert data["stop_type"] == "limit"
assert "generation_settings" in data
assert server.n_predict is not None
assert data["generation_settings"]["n_predict"] == min(n_predict, server.n_predict)
assert data["generation_settings"]["seed"] == server.seed
assert match_regex(re_content, content)
else:
content += data["content"]
def test_completion_stream_vs_non_stream():
global server
server.start()
res_stream = server.make_stream_request("POST", "/completion", data={
"n_predict": 8,
"prompt": "I believe the meaning of life is",
"stream": True,
})
res_non_stream = server.make_request("POST", "/completion", data={
"n_predict": 8,
"prompt": "I believe the meaning of life is",
})
content_stream = ""
for data in res_stream:
content_stream += data["content"]
assert content_stream == res_non_stream.body["content"]
@pytest.mark.parametrize("n_slots", [1, 2])
def test_consistent_result_same_seed(n_slots: int):
global server
@ -221,3 +245,24 @@ def test_completion_parallel_slots(n_slots: int, n_requests: int):
assert len(res.body["content"]) > 10
# FIXME: the result is not deterministic when using other slot than slot 0
# assert match_regex(re_content, res.body["content"])
def test_n_probs():
global server
server.start()
res = server.make_request("POST", "/completion", data={
"prompt": "I believe the meaning of life is",
"n_probs": 10,
"temperature": 0.0,
"n_predict": 5,
})
assert res.status_code == 200
assert "completion_probabilities" in res.body
assert len(res.body["completion_probabilities"]) == 5
for tok in res.body["completion_probabilities"]:
assert "probs" in tok
assert len(tok["probs"]) == 10
for prob in tok["probs"]:
assert "prob" in prob
assert "tok_str" in prob
assert 0.0 <= prob["prob"] <= 1.0

View File

@ -13,28 +13,28 @@ def test_infill_without_input_extra():
global server
server.start()
res = server.make_request("POST", "/infill", data={
"prompt": "Complete this",
"input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_",
"input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
"prompt": " int n_threads = llama_",
"input_suffix": "}\n",
})
assert res.status_code == 200
assert match_regex("(One|day|she|saw|big|scary|bird)+", res.body["content"])
assert match_regex("(Ann|small|shiny)+", res.body["content"])
def test_infill_with_input_extra():
global server
server.start()
res = server.make_request("POST", "/infill", data={
"prompt": "Complete this",
"input_extra": [{
"filename": "llama.h",
"text": "LLAMA_API int32_t llama_n_threads();\n"
}],
"input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_",
"input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
"prompt": " int n_threads = llama_",
"input_suffix": "}\n",
})
assert res.status_code == 200
assert match_regex("(cuts|Jimmy|mom|came|into|the|room)+", res.body["content"])
assert match_regex("(Dad|excited|park)+", res.body["content"])
@pytest.mark.parametrize("input_extra", [
@ -48,10 +48,30 @@ def test_invalid_input_extra_req(input_extra):
global server
server.start()
res = server.make_request("POST", "/infill", data={
"prompt": "Complete this",
"input_extra": [input_extra],
"input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_",
"input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
"prompt": " int n_threads = llama_",
"input_suffix": "}\n",
})
assert res.status_code == 400
assert "error" in res.body
@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test")
def test_with_qwen_model():
global server
server.model_file = None
server.model_hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-IQ3_XXS-GGUF"
server.model_hf_file = "qwen2.5-coder-1.5b-iq3_xxs-imat.gguf"
server.start(timeout_seconds=600)
res = server.make_request("POST", "/infill", data={
"input_extra": [{
"filename": "llama.h",
"text": "LLAMA_API int32_t llama_n_threads();\n"
}],
"input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
"prompt": " int n_threads = llama_",
"input_suffix": "}\n",
})
assert res.status_code == 200
assert res.body["content"] == "n_threads();\n printf(\"Number of threads: %d\\n\", n_threads);\n return 0;\n"

View File

@ -82,6 +82,37 @@ def test_different_draft_min_draft_max():
last_content = res.body["content"]
def test_slot_ctx_not_exceeded():
global server
server.n_ctx = 64
server.start()
res = server.make_request("POST", "/completion", data={
"prompt": "Hello " * 56,
"temperature": 0.0,
"top_k": 1,
"speculative.p_min": 0.0,
})
assert res.status_code == 200
assert len(res.body["content"]) > 0
def test_with_ctx_shift():
global server
server.n_ctx = 64
server.start()
res = server.make_request("POST", "/completion", data={
"prompt": "Hello " * 56,
"temperature": 0.0,
"top_k": 1,
"n_predict": 64,
"speculative.p_min": 0.0,
})
assert res.status_code == 200
assert len(res.body["content"]) > 0
assert res.body["tokens_predicted"] == 64
assert res.body["truncated"] == True
@pytest.mark.parametrize("n_slots,n_requests", [
(1, 2),
(2, 2),

View File

@ -64,6 +64,7 @@ class ServerProcess:
server_embeddings: bool | None = False
server_reranking: bool | None = False
server_metrics: bool | None = False
server_slots: bool | None = False
draft: int | None = None
api_key: str | None = None
response_format: str | None = None
@ -71,6 +72,7 @@ class ServerProcess:
disable_ctx_shift: int | None = False
draft_min: int | None = None
draft_max: int | None = None
no_webui: bool | None = None
# session variables
process: subprocess.Popen | None = None
@ -91,7 +93,6 @@ class ServerProcess:
else:
server_path = "../../../build/bin/llama-server"
server_args = [
"--slots", # requires to get slot status via /slots endpoint
"--host",
self.server_host,
"--port",
@ -129,6 +130,8 @@ class ServerProcess:
server_args.append("--reranking")
if self.server_metrics:
server_args.append("--metrics")
if self.server_slots:
server_args.append("--slots")
if self.model_alias:
server_args.extend(["--alias", self.model_alias])
if self.n_ctx:
@ -156,6 +159,8 @@ class ServerProcess:
server_args.extend(["--draft-max", self.draft_max])
if self.draft_min:
server_args.extend(["--draft-min", self.draft_min])
if self.no_webui:
server_args.append("--no-webui")
args = [str(arg) for arg in [server_path, *server_args]]
print(f"bench: starting server with: {' '.join(args)}")
@ -181,7 +186,7 @@ class ServerProcess:
start_time = time.time()
while time.time() - start_time < timeout_seconds:
try:
response = self.make_request("GET", "/slots", headers={
response = self.make_request("GET", "/health", headers={
"Authorization": f"Bearer {self.api_key}" if self.api_key else None
})
if response.status_code == 200:
@ -224,7 +229,7 @@ class ServerProcess:
result.headers = dict(response.headers)
result.status_code = response.status_code
result.body = response.json() if parse_body else None
print("Response from server", result.body)
print("Response from server", json.dumps(result.body, indent=2))
return result
def make_stream_request(
@ -245,7 +250,7 @@ class ServerProcess:
break
elif line.startswith('data: '):
data = json.loads(line[6:])
print("Partial response from server", data)
print("Partial response from server", json.dumps(data, indent=2))
yield data
@ -369,3 +374,6 @@ def match_regex(regex: str, text: str) -> bool:
).search(text)
is not None
)
def is_slow_test_allowed():
return os.environ.get("SLOW_TESTS") == "1" or os.environ.get("SLOW_TESTS") == "ON"

View File

@ -20,6 +20,7 @@
#include <sstream>
#include <string>
#include <vector>
#include <memory>
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
@ -40,17 +41,6 @@ using json = nlohmann::ordered_json;
#define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
#define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
enum error_type {
ERROR_TYPE_INVALID_REQUEST,
ERROR_TYPE_AUTHENTICATION,
ERROR_TYPE_SERVER,
ERROR_TYPE_NOT_FOUND,
ERROR_TYPE_PERMISSION,
ERROR_TYPE_UNAVAILABLE, // custom error
ERROR_TYPE_NOT_SUPPORTED, // custom error
};
template <typename T>
static T json_value(const json & body, const std::string & key, const T & default_value) {
// Fallback null to default value
@ -174,6 +164,9 @@ static std::vector<llama_tokens> tokenize_input_prompts(llama_context * ctx, con
} else {
throw std::runtime_error("\"prompt\" must be a string, an list of tokens, a list of mixed strings & tokens, or a list of prompts");
}
if (result.empty()) {
throw std::runtime_error("\"prompt\" must not be empty");
}
return result;
}
@ -337,12 +330,12 @@ static std::string llama_get_chat_template(const struct llama_model * model) {
std::string template_key = "tokenizer.chat_template";
// call with NULL buffer to get the total size of the string
int32_t res = llama_model_meta_val_str(model, template_key.c_str(), NULL, 0);
if (res < 0) {
if (res < 2) {
return "";
} else {
std::vector<char> model_template(res, 0);
std::vector<char> model_template(res + 1, 0);
llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
return std::string(model_template.data(), model_template.size());
return std::string(model_template.data(), model_template.size() - 1);
}
}
@ -485,48 +478,11 @@ static std::string tokens_to_output_formatted_string(const llama_context * ctx,
return out;
}
struct completion_token_output {
llama_token tok;
std::string text_to_send;
struct token_prob {
llama_token tok;
float prob;
};
std::vector<token_prob> probs;
};
// convert a vector of completion_token_output to json
static json probs_vector_to_json(const llama_context * ctx, const std::vector<completion_token_output> & probs) {
json out = json::array();
for (const auto & prob : probs) {
json probs_for_token = json::array();
for (const auto & p : prob.probs) {
const std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
probs_for_token.push_back(json {
{"tok_str", tok_str},
{"prob", p.prob},
});
}
const std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
out.push_back(json {
{"content", tok_str},
{"probs", probs_for_token},
});
}
return out;
}
static bool server_sent_event(httplib::DataSink & sink, const char * event, const json & data) {
const std::string str =
std::string(event) + ": " +
data.dump(-1, ' ', false, json::error_handler_t::replace) +
"\n\n"; // note: these newlines are important (not sure why though, if you know, add a comment to explain)
"\n\n"; // required by RFC 8895 - A message is terminated by a blank line (two line terminators in a row).
LOG_DBG("data stream, to_send: %s", str.c_str());
@ -543,8 +499,6 @@ static json oaicompat_completion_params_parse(
const std::string & chat_template) {
json llama_params;
llama_params["__oaicompat"] = true;
// Apply chat template to the list of messages
llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));
@ -604,164 +558,6 @@ static json oaicompat_completion_params_parse(
return llama_params;
}
static json format_final_response_oaicompat(const json & request, const json & result, const std::string & completion_id, bool streaming = false, bool verbose = false) {
bool stopped_word = result.count("stopped_word") != 0;
bool stopped_eos = json_value(result, "stopped_eos", false);
int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
std::string content = json_value(result, "content", std::string(""));
std::string finish_reason = "length";
if (stopped_word || stopped_eos) {
finish_reason = "stop";
}
json choices =
streaming ? json::array({json{{"finish_reason", finish_reason},
{"index", 0},
{"delta", json::object()}}})
: json::array({json{{"finish_reason", finish_reason},
{"index", 0},
{"message", json{{"content", content},
{"role", "assistant"}}}}});
std::time_t t = std::time(0);
json res = json {
{"choices", choices},
{"created", t},
{"model",
json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
{"object", streaming ? "chat.completion.chunk" : "chat.completion"},
{"usage", json {
{"completion_tokens", num_tokens_predicted},
{"prompt_tokens", num_prompt_tokens},
{"total_tokens", num_tokens_predicted + num_prompt_tokens}
}},
{"id", completion_id}
};
// extra fields for debugging purposes
if (verbose) {
res["__verbose"] = result;
}
if (result.contains("completion_probabilities")) {
res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
}
if (result.contains("timings")) {
res.push_back({"timings", json_value(result, "timings", json::object())});
}
return res;
}
// return value is vector as there is one case where we might need to generate two responses
static std::vector<json> format_partial_response_oaicompat(const json & result, const std::string & completion_id) {
if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
return std::vector<json>({result});
}
bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
bool stopped_word = json_value(result, "stopped_word", false);
bool stopped_eos = json_value(result, "stopped_eos", false);
bool stopped_limit = json_value(result, "stopped_limit", false);
std::string content = json_value(result, "content", std::string(""));
std::string finish_reason;
if (stopped_word || stopped_eos) {
finish_reason = "stop";
}
if (stopped_limit) {
finish_reason = "length";
}
std::time_t t = std::time(0);
json choices;
if (!finish_reason.empty()) {
choices = json::array({json{{"finish_reason", finish_reason},
{"index", 0},
{"delta", json::object()}}});
} else {
if (first) {
if (content.empty()) {
choices = json::array({json{{"finish_reason", nullptr},
{"index", 0},
{"delta", json{{"role", "assistant"}}}}});
} else {
// We have to send this as two updates to conform to openai behavior
json initial_ret = json{{"choices", json::array({json{
{"finish_reason", nullptr},
{"index", 0},
{"delta", json{
{"role", "assistant"}
}}}})},
{"created", t},
{"id", completion_id},
{"model", modelname},
{"object", "chat.completion.chunk"}};
json second_ret = json{
{"choices", json::array({json{{"finish_reason", nullptr},
{"index", 0},
{"delta", json{
{"content", content}}}
}})},
{"created", t},
{"id", completion_id},
{"model", modelname},
{"object", "chat.completion.chunk"}};
return std::vector<json>({initial_ret, second_ret});
}
} else {
// Some idiosyncrasy in task processing logic makes several trailing calls
// with empty content, we ignore these at the callee site.
if (content.empty()) {
return std::vector<json>({json::object()});
}
choices = json::array({json{
{"finish_reason", nullptr},
{"index", 0},
{"delta",
json{
{"content", content},
}},
}});
}
}
json ret = json {
{"choices", choices},
{"created", t},
{"id", completion_id},
{"model", modelname},
{"object", "chat.completion.chunk"}
};
if (result.contains("timings")) {
ret.push_back({"timings", json_value(result, "timings", json::object())});
}
if (!finish_reason.empty()) {
int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
ret.push_back({"usage", json {
{"completion_tokens", num_tokens_predicted},
{"prompt_tokens", num_prompt_tokens},
{"total_tokens", num_tokens_predicted + num_prompt_tokens}
}});
}
return std::vector<json>({ret});
}
static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) {
json data = json::array();
int i = 0;
@ -854,42 +650,17 @@ static json format_detokenized_response(const std::string & content) {
};
}
static json format_error_response(const std::string & message, const enum error_type type) {
std::string type_str;
int code = 500;
switch (type) {
case ERROR_TYPE_INVALID_REQUEST:
type_str = "invalid_request_error";
code = 400;
break;
case ERROR_TYPE_AUTHENTICATION:
type_str = "authentication_error";
code = 401;
break;
case ERROR_TYPE_NOT_FOUND:
type_str = "not_found_error";
code = 404;
break;
case ERROR_TYPE_SERVER:
type_str = "server_error";
code = 500;
break;
case ERROR_TYPE_PERMISSION:
type_str = "permission_error";
code = 403;
break;
case ERROR_TYPE_NOT_SUPPORTED:
type_str = "not_supported_error";
code = 501;
break;
case ERROR_TYPE_UNAVAILABLE:
type_str = "unavailable_error";
code = 503;
break;
static json format_logit_bias(const std::vector<llama_logit_bias> & logit_bias) {
json data = json::array();
for (const auto & lb : logit_bias) {
data.push_back(json{
{"bias", lb.bias},
{"token", lb.token},
});
}
return json {
{"code", code},
{"message", message},
{"type", type_str},
};
return data;
}
static std::string safe_json_to_str(json data) {
return data.dump(-1, ' ', false, json::error_handler_t::replace);
}

View File

@ -0,0 +1,306 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
<meta name="color-scheme" content="light dark">
<title>🦙 llama.cpp - chat</title>
</head>
<body>
<div id="app" class="opacity-0"> <!-- opacity-0 will be removed on app mounted -->
<div class="flex flex-row drawer lg:drawer-open">
<input id="toggle-drawer" type="checkbox" class="drawer-toggle" checked />
<!-- sidebar -->
<div class="drawer-side h-screen lg:h-screen z-50 lg:max-w-64">
<label for="toggle-drawer" aria-label="close sidebar" class="drawer-overlay"></label>
<div class="flex flex-col bg-base-200 min-h-full max-w-64 py-4 px-4">
<div class="flex flex-row items-center justify-between mb-4 mt-4">
<h2 class="font-bold ml-4">Conversations</h2>
<!-- close sidebar button -->
<label for="toggle-drawer" class="btn btn-ghost lg:hidden">
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-arrow-bar-left" viewBox="0 0 16 16">
<path fill-rule="evenodd" d="M12.5 15a.5.5 0 0 1-.5-.5v-13a.5.5 0 0 1 1 0v13a.5.5 0 0 1-.5.5M10 8a.5.5 0 0 1-.5.5H3.707l2.147 2.146a.5.5 0 0 1-.708.708l-3-3a.5.5 0 0 1 0-.708l3-3a.5.5 0 1 1 .708.708L3.707 7.5H9.5a.5.5 0 0 1 .5.5"/>
</svg>
</label>
</div>
<!-- list of conversations -->
<div :class="{
'btn btn-ghost justify-start': true,
'btn-active': messages.length === 0,
}" @click="newConversation">
+ New conversation
</div>
<div v-for="conv in conversations" :class="{
'btn btn-ghost justify-start font-normal': true,
'btn-active': conv.id === viewingConvId,
}" @click="setViewingConv(conv.id)">
<span class="truncate">{{ conv.messages[0].content }}</span>
</div>
<div class="text-center text-xs opacity-40 mt-auto mx-4">
Conversations are saved to browser's localStorage
</div>
</div>
</div>
<!-- main view -->
<div class="chat-screen drawer-content grow flex flex-col h-screen w-screen mx-auto px-4">
<!-- header -->
<div class="flex flex-row items-center mt-6 mb-6">
<!-- open sidebar button -->
<label for="toggle-drawer" class="btn btn-ghost lg:hidden">
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-list" viewBox="0 0 16 16">
<path fill-rule="evenodd" d="M2.5 12a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5m0-4a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5m0-4a.5.5 0 0 1 .5-.5h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5"/>
</svg>
</label>
<div class="grow text-2xl font-bold ml-2">llama.cpp</div>
<!-- action buttons (top right) -->
<div class="flex items-center">
<div v-if="messages.length > 0" class="dropdown dropdown-end">
<!-- "more" button -->
<button tabindex="0" role="button" class="btn m-1" :disabled="isGenerating">
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-three-dots-vertical" viewBox="0 0 16 16">
<path d="M9.5 13a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0m0-5a1.5 1.5 0 1 1-3 0 1.5 1.5 0 0 1 3 0"/>
</svg>
</button>
<!-- "more" dropdown menu -->
<ul tabindex="0" class="dropdown-content menu bg-base-100 rounded-box z-[1] w-52 p-2 shadow">
<li @click="downloadConv(viewingConvId)"><a>Download</a></li>
<li class="text-error" @click="deleteConv(viewingConvId)"><a>Delete</a></li>
</ul>
</div>
<button class="btn" @click="showConfigDialog = true" :disabled="isGenerating">
<!-- settings button -->
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-gear" viewBox="0 0 16 16">
<path d="M8 4.754a3.246 3.246 0 1 0 0 6.492 3.246 3.246 0 0 0 0-6.492M5.754 8a2.246 2.246 0 1 1 4.492 0 2.246 2.246 0 0 1-4.492 0"/>
<path d="M9.796 1.343c-.527-1.79-3.065-1.79-3.592 0l-.094.319a.873.873 0 0 1-1.255.52l-.292-.16c-1.64-.892-3.433.902-2.54 2.541l.159.292a.873.873 0 0 1-.52 1.255l-.319.094c-1.79.527-1.79 3.065 0 3.592l.319.094a.873.873 0 0 1 .52 1.255l-.16.292c-.892 1.64.901 3.434 2.541 2.54l.292-.159a.873.873 0 0 1 1.255.52l.094.319c.527 1.79 3.065 1.79 3.592 0l.094-.319a.873.873 0 0 1 1.255-.52l.292.16c1.64.893 3.434-.902 2.54-2.541l-.159-.292a.873.873 0 0 1 .52-1.255l.319-.094c1.79-.527 1.79-3.065 0-3.592l-.319-.094a.873.873 0 0 1-.52-1.255l.16-.292c.893-1.64-.902-3.433-2.541-2.54l-.292.159a.873.873 0 0 1-1.255-.52zm-2.633.283c.246-.835 1.428-.835 1.674 0l.094.319a1.873 1.873 0 0 0 2.693 1.115l.291-.16c.764-.415 1.6.42 1.184 1.185l-.159.292a1.873 1.873 0 0 0 1.116 2.692l.318.094c.835.246.835 1.428 0 1.674l-.319.094a1.873 1.873 0 0 0-1.115 2.693l.16.291c.415.764-.42 1.6-1.185 1.184l-.291-.159a1.873 1.873 0 0 0-2.693 1.116l-.094.318c-.246.835-1.428.835-1.674 0l-.094-.319a1.873 1.873 0 0 0-2.692-1.115l-.292.16c-.764.415-1.6-.42-1.184-1.185l.159-.291A1.873 1.873 0 0 0 1.945 8.93l-.319-.094c-.835-.246-.835-1.428 0-1.674l.319-.094A1.873 1.873 0 0 0 3.06 4.377l-.16-.292c-.415-.764.42-1.6 1.185-1.184l.292.159a1.873 1.873 0 0 0 2.692-1.115z"/>
</svg>
</button>
<!-- theme controller is copied from https://daisyui.com/components/theme-controller/ -->
<div class="dropdown dropdown-end dropdown-bottom">
<div tabindex="0" role="button" class="btn m-1">
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-palette2" viewBox="0 0 16 16">
<path d="M0 .5A.5.5 0 0 1 .5 0h5a.5.5 0 0 1 .5.5v5.277l4.147-4.131a.5.5 0 0 1 .707 0l3.535 3.536a.5.5 0 0 1 0 .708L10.261 10H15.5a.5.5 0 0 1 .5.5v5a.5.5 0 0 1-.5.5H3a3 3 0 0 1-2.121-.879A3 3 0 0 1 0 13.044m6-.21 7.328-7.3-2.829-2.828L6 7.188zM4.5 13a1.5 1.5 0 1 0-3 0 1.5 1.5 0 0 0 3 0M15 15v-4H9.258l-4.015 4zM0 .5v12.495zm0 12.495V13z"/>
</svg>
</div>
<ul tabindex="0" class="dropdown-content bg-base-300 rounded-box z-[1] w-52 p-2 shadow-2xl h-80 overflow-y-auto">
<li>
<button
class="btn btn-sm btn-block btn-ghost justify-start"
:class="{ 'btn-active': selectedTheme === 'auto' }"
@click="setSelectedTheme('auto')">
auto
</button>
</li>
<li v-for="theme in themes">
<input
type="radio"
name="theme-dropdown"
class="theme-controller btn btn-sm btn-block btn-ghost justify-start"
:aria-label="theme"
:value="theme"
:checked="selectedTheme === theme"
@click="setSelectedTheme(theme)" />
</li>
</ul>
</div>
</div>
</div>
<!-- chat messages -->
<div id="messages-list" class="flex flex-col grow overflow-y-auto">
<div class="mt-auto flex justify-center">
<!-- placeholder to shift the message to the bottom -->
{{ messages.length === 0 ? 'Send a message to start' : '' }}
</div>
<div v-for="msg in messages" class="group">
<message-bubble
:config="config"
:msg="msg"
:key="msg.id"
:is-generating="isGenerating"
:edit-user-msg-and-regenerate="editUserMsgAndRegenerate"
:regenerate-msg="regenerateMsg"></message-bubble>
</div>
<!-- pending (ongoing) assistant message -->
<div id="pending-msg" class="group">
<message-bubble
v-if="pendingMsg"
:config="config"
:msg="pendingMsg"
:key="pendingMsg.id"
:is-generating="isGenerating"
:edit-user-msg-and-regenerate="() => {}"
:regenerate-msg="() => {}"></message-bubble>
</div>
</div>
<!-- chat input -->
<div class="flex flex-row items-center mt-8 mb-6">
<textarea
class="textarea textarea-bordered w-full"
placeholder="Type a message (Shift+Enter to add a new line)"
v-model="inputMsg"
@keydown.enter.exact.prevent="sendMessage"
@keydown.enter.shift.exact.prevent="inputMsg += '\n'"
:disabled="isGenerating"
id="msg-input"
></textarea>
<button v-if="!isGenerating" class="btn btn-primary ml-2" @click="sendMessage" :disabled="inputMsg.length === 0">Send</button>
<button v-else class="btn btn-neutral ml-2" @click="stopGeneration">Stop</button>
</div>
</div>
</div>
<!-- modal for editing config -->
<dialog class="modal" :class="{'modal-open': showConfigDialog}">
<div class="modal-box">
<h3 class="text-lg font-bold mb-6">Settings</h3>
<div class="h-[calc(90vh-12rem)] overflow-y-auto">
<p class="opacity-40 mb-6">Settings below are saved in browser's localStorage</p>
<settings-modal-short-input :config-key="'apiKey'" :config-default="configDefault" :config-info="configInfo" v-model="config.apiKey"></settings-modal-short-input>
<label class="form-control mb-2">
<div class="label">System Message</div>
<textarea class="textarea textarea-bordered h-24" :placeholder="'Default: ' + configDefault.systemMessage" v-model="config.systemMessage"></textarea>
</label>
<template v-for="configKey in ['temperature', 'top_k', 'top_p', 'min_p', 'max_tokens']">
<settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]"></settings-modal-short-input>
</template>
<!-- TODO: add more sampling-related configs, please regroup them into different "collapse" sections -->
<!-- Section: Other sampler settings -->
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
<summary class="collapse-title font-bold">Other sampler settings</summary>
<div class="collapse-content">
<!-- Samplers queue -->
<settings-modal-short-input label="Samplers queue" :config-key="'samplers'" :config-default="configDefault" :config-info="configInfo" v-model="config.samplers"></settings-modal-short-input>
<!-- Samplers -->
<template v-for="configKey in ['dynatemp_range', 'dynatemp_exponent', 'typical_p', 'xtc_probability', 'xtc_threshold']">
<settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]"></settings-modal-short-input>
</template>
</div>
</details>
<!-- Section: Penalties settings -->
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
<summary class="collapse-title font-bold">Penalties settings</summary>
<div class="collapse-content">
<template v-for="configKey in ['repeat_last_n', 'repeat_penalty', 'presence_penalty', 'frequency_penalty', 'dry_multiplier', 'dry_base', 'dry_allowed_length', 'dry_penalty_last_n']">
<settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]"></settings-modal-short-input>
</template>
</div>
</details>
<!-- Section: Advanced config -->
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
<summary class="collapse-title font-bold">Advanced config</summary>
<div class="collapse-content">
<div class="flex flex-row items-center mb-2">
<input type="checkbox" class="checkbox" v-model="config.showTokensPerSecond" />
<span class="ml-4">Show tokens per second</span>
</div>
<label class="form-control mb-2">
<!-- Custom parameters input -->
<div class="label inline">Custom JSON config (For more info, refer to <a class="underline" href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md" target="_blank" rel="noopener noreferrer">server documentation</a>)</div>
<textarea class="textarea textarea-bordered h-24" placeholder="Example: { &quot;mirostat&quot;: 1, &quot;min_p&quot;: 0.1 }" v-model="config.custom"></textarea>
</label>
</div>
</details>
</div>
<!-- action buttons -->
<div class="modal-action">
<button class="btn" @click="resetConfigDialog">Reset to default</button>
<button class="btn" @click="closeAndDiscardConfigDialog">Close</button>
<button class="btn btn-primary" @click="closeAndSaveConfigDialog">Save</button>
</div>
</div>
</dialog>
</div>
<!-- Template to be used as message bubble -->
<template id="message-bubble">
<div :class="{
'chat': true,
'chat-start': msg.role !== 'user',
'chat-end': msg.role === 'user',
}">
<div :class="{
'chat-bubble markdown': true,
'chat-bubble-base-300': msg.role !== 'user',
}">
<!-- textarea for editing message -->
<template v-if="editingContent !== null">
<textarea
class="textarea textarea-bordered bg-base-100 text-base-content w-[calc(90vw-8em)] lg:w-96"
v-model="editingContent"></textarea>
<br/>
<button class="btn btn-ghost mt-2 mr-2" @click="editingContent = null">Cancel</button>
<button class="btn mt-2" @click="editMsg()">Submit</button>
</template>
<template v-else>
<!-- show loading dots for pending message -->
<span v-if="msg.content === null" class="loading loading-dots loading-md"></span>
<!-- render message as markdown -->
<vue-markdown v-else :source="msg.content"></vue-markdown>
<!-- render timings if enabled -->
<div class="dropdown dropdown-hover dropdown-top mt-2" v-if="timings && config.showTokensPerSecond">
<div tabindex="0" role="button" class="cursor-pointer font-semibold text-sm opacity-60">Speed: {{ timings.predicted_per_second.toFixed(1) }} t/s</div>
<div class="dropdown-content bg-base-100 z-10 w-64 p-2 shadow mt-4">
<b>Prompt</b><br/>
- Tokens: {{ timings.prompt_n }}<br/>
- Time: {{ timings.prompt_ms }} ms<br/>
- Speed: {{ timings.prompt_per_second.toFixed(1) }} t/s<br/>
<b>Generation</b><br/>
- Tokens: {{ timings.predicted_n }}<br/>
- Time: {{ timings.predicted_ms }} ms<br/>
- Speed: {{ timings.predicted_per_second.toFixed(1) }} t/s<br/>
</div>
</div>
</template>
</div>
</div>
<!-- actions for each message -->
<div :class="{'text-right': msg.role === 'user', 'opacity-0': isGenerating}" class="mx-4 mt-2 mb-2">
<!-- user message -->
<button v-if="msg.role === 'user'" class="badge btn-mini show-on-hover" @click="editingContent = msg.content" :disabled="isGenerating">
✍️ Edit
</button>
<!-- assistant message -->
<button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="regenerateMsg(msg)" :disabled="isGenerating">
🔄 Regenerate
</button>
<button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="copyMsg()" :disabled="isGenerating">
📋 Copy
</button>
</div>
</template>
<!-- Template to be used by settings modal -->
<template id="settings-modal-short-input">
<label class="input input-bordered join-item grow flex items-center gap-2 mb-2">
<!-- Show help message on hovering on the input label -->
<div class="dropdown dropdown-hover">
<div tabindex="0" role="button" class="font-bold">{{ label || configKey }}</div>
<div class="dropdown-content menu bg-base-100 rounded-box z-10 w-64 p-2 shadow mt-4">
{{ configInfo[configKey] || '(no help message available)' }}
</div>
</div>
<!-- Here we forward v-model from parent to child component, see: https://stackoverflow.com/questions/47311936/v-model-and-child-components -->
<input type="text" class="grow" :placeholder="'Default: ' + (configDefault[configKey] || 'none')" :value="modelValue" @input="$emit('update:modelValue', $event.target.value)" />
</label>
</template>
<script type="module" src="/src/main.js"></script>
</body>
</html>

examples/server/webui/package-lock.json generated Normal file

File diff suppressed because it is too large

View File

@ -0,0 +1,24 @@
{
"name": "webui",
"private": true,
"version": "0.0.0",
"type": "module",
"scripts": {
"dev": "vite",
"build": "vite build",
"preview": "vite preview"
},
"devDependencies": {
"vite": "^5.4.10"
},
"dependencies": {
"autoprefixer": "^10.4.20",
"daisyui": "^4.12.14",
"markdown-it": "^14.1.0",
"postcss": "^8.4.49",
"tailwindcss": "^3.4.15",
"textlinestream": "^1.1.1",
"vite-plugin-singlefile": "^2.0.3",
"vue": "^3.5.13"
}
}

View File

@ -0,0 +1,6 @@
export default {
plugins: {
tailwindcss: {},
autoprefixer: {},
},
}

View File

@ -0,0 +1,528 @@
import './styles.css';
import { createApp, defineComponent, shallowRef, computed, h } from 'vue/dist/vue.esm-bundler.js';
import MarkdownIt from 'markdown-it';
import TextLineStream from 'textlinestream';
const isDev = import.meta.env.MODE === 'development';
// utility functions
const isString = (x) => !!x.toLowerCase;
const isBoolean = (x) => x === true || x === false;
const isNumeric = (n) => !isString(n) && !isNaN(n) && !isBoolean(n);
const escapeAttr = (str) => str.replace(/>/g, '&gt;').replace(/"/g, '&quot;');
const copyStr = (str) => navigator.clipboard.writeText(str);
// constants
const BASE_URL = localStorage.getItem('base') // for debugging
|| (new URL('.', document.baseURI).href).toString().replace(/\/$/, ''); // for production
const CONFIG_DEFAULT = {
// Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value.
apiKey: '',
systemMessage: 'You are a helpful assistant.',
showTokensPerSecond: false,
// make sure these default values are in sync with `common.h`
samplers: 'dkypmxt',
temperature: 0.8,
dynatemp_range: 0.0,
dynatemp_exponent: 1.0,
top_k: 40,
top_p: 0.95,
min_p: 0.05,
xtc_probability: 0.0,
xtc_threshold: 0.1,
typical_p: 1.0,
repeat_last_n: 64,
repeat_penalty: 1.0,
presence_penalty: 0.0,
frequency_penalty: 0.0,
dry_multiplier: 0.0,
dry_base: 1.75,
dry_allowed_length: 2,
dry_penalty_last_n: -1,
max_tokens: -1,
custom: '', // custom json-stringified object
};
const CONFIG_INFO = {
apiKey: 'Set the API Key if you are using --api-key option for the server.',
systemMessage: 'The starting message that defines how model should behave.',
samplers: 'The order at which samplers are applied, in simplified way. Default is "dkypmxt": dry->top_k->typ_p->top_p->min_p->xtc->temperature',
temperature: 'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.',
dynatemp_range: 'Addon for the temperature sampler. The added value to the range of dynamic temperature, which adjusts probabilities by entropy of tokens.',
dynatemp_exponent: 'Addon for the temperature sampler. Smoothes out the probability redistribution based on the most probable token.',
top_k: 'Keeps only k top tokens.',
top_p: 'Limits tokens to those that together have a cumulative probability of at least p',
min_p: 'Limits tokens based on the minimum probability for a token to be considered, relative to the probability of the most likely token.',
xtc_probability: 'XTC sampler cuts out top tokens; this parameter controls the chance of cutting tokens at all. 0 disables XTC.',
xtc_threshold: 'XTC sampler cuts out top tokens; this parameter controls the token probability that is required to cut that token.',
typical_p: 'Sorts and limits tokens based on the difference between log-probability and entropy.',
repeat_last_n: 'Last n tokens to consider for penalizing repetition',
repeat_penalty: 'Controls the repetition of token sequences in the generated text',
presence_penalty: 'Penalizes tokens that have already appeared in the output, regardless of how often.',
frequency_penalty: 'Penalizes tokens in proportion to how often they have appeared in the output.',
dry_multiplier: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling multiplier.',
dry_base: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling base value.',
dry_allowed_length: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the allowed length for DRY sampling.',
dry_penalty_last_n: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets DRY penalty for the last n tokens.',
max_tokens: 'The maximum number of tokens per output.',
custom: '', // custom json-stringified object
};
// config keys having numeric value (i.e. temperature, top_k, top_p, etc)
const CONFIG_NUMERIC_KEYS = Object.entries(CONFIG_DEFAULT).filter(e => isNumeric(e[1])).map(e => e[0]);
// list of themes supported by daisyui
const THEMES = ['light', 'dark', 'cupcake', 'bumblebee', 'emerald', 'corporate', 'synthwave', 'retro', 'cyberpunk', 'valentine', 'halloween', 'garden', 'forest', 'aqua', 'lofi', 'pastel', 'fantasy', 'wireframe', 'black', 'luxury', 'dracula', 'cmyk', 'autumn', 'business', 'acid', 'lemonade', 'night', 'coffee', 'winter', 'dim', 'nord', 'sunset'];
// markdown support
const VueMarkdown = defineComponent(
(props) => {
const md = shallowRef(new MarkdownIt({ breaks: true }));
const origFenceRenderer = md.value.renderer.rules.fence;
md.value.renderer.rules.fence = (tokens, idx, ...args) => {
const content = tokens[idx].content;
const origRendered = origFenceRenderer(tokens, idx, ...args);
return `<div class="relative my-4">
<div class="text-right sticky top-4 mb-2 mr-2 h-0">
<button class="badge btn-mini" onclick="copyStr(${escapeAttr(JSON.stringify(content))})">📋 Copy</button>
</div>
${origRendered}
</div>`;
};
window.copyStr = copyStr;
const content = computed(() => md.value.render(props.source));
return () => h("div", { innerHTML: content.value });
},
{ props: ["source"] }
);
// input field to be used by settings modal
const SettingsModalShortInput = defineComponent({
template: document.getElementById('settings-modal-short-input').innerHTML,
props: {
label: { type: String, required: false },
configKey: String,
configDefault: Object,
configInfo: Object,
modelValue: [Object, String, Number],
},
});
// message bubble component
const MessageBubble = defineComponent({
components: {
VueMarkdown
},
template: document.getElementById('message-bubble').innerHTML,
props: {
config: Object,
msg: Object,
isGenerating: Boolean,
editUserMsgAndRegenerate: Function,
regenerateMsg: Function,
},
data() {
return {
editingContent: null,
};
},
computed: {
timings() {
if (!this.msg.timings) return null;
return {
...this.msg.timings,
prompt_per_second: this.msg.timings.prompt_n / (this.msg.timings.prompt_ms / 1000),
predicted_per_second: this.msg.timings.predicted_n / (this.msg.timings.predicted_ms / 1000),
};
}
},
methods: {
copyMsg() {
copyStr(this.msg.content);
},
editMsg() {
this.editUserMsgAndRegenerate({
...this.msg,
content: this.editingContent,
});
this.editingContent = null;
},
},
});
// conversations are stored in localStorage
// format: { [convId]: { id: string, lastModified: number, messages: [...] } }
// convId is a string prefixed with 'conv-'
const StorageUtils = {
// manage conversations
getAllConversations() {
const res = [];
for (const key in localStorage) {
if (key.startsWith('conv-')) {
res.push(JSON.parse(localStorage.getItem(key)));
}
}
res.sort((a, b) => b.lastModified - a.lastModified);
return res;
},
// can return null if convId does not exist
getOneConversation(convId) {
return JSON.parse(localStorage.getItem(convId) || 'null');
},
// if convId does not exist, create one
appendMsg(convId, msg) {
if (msg.content === null) return;
const conv = StorageUtils.getOneConversation(convId) || {
id: convId,
lastModified: Date.now(),
messages: [],
};
conv.messages.push(msg);
conv.lastModified = Date.now();
localStorage.setItem(convId, JSON.stringify(conv));
},
getNewConvId() {
return `conv-${Date.now()}`;
},
remove(convId) {
localStorage.removeItem(convId);
},
filterAndKeepMsgs(convId, predicate) {
const conv = StorageUtils.getOneConversation(convId);
if (!conv) return;
conv.messages = conv.messages.filter(predicate);
conv.lastModified = Date.now();
localStorage.setItem(convId, JSON.stringify(conv));
},
popMsg(convId) {
const conv = StorageUtils.getOneConversation(convId);
if (!conv) return;
const msg = conv.messages.pop();
conv.lastModified = Date.now();
if (conv.messages.length === 0) {
StorageUtils.remove(convId);
} else {
localStorage.setItem(convId, JSON.stringify(conv));
}
return msg;
},
// manage config
getConfig() {
const savedVal = JSON.parse(localStorage.getItem('config') || '{}');
// to prevent breaking changes in the future, we always provide default value for missing keys
return {
...CONFIG_DEFAULT,
...savedVal,
};
},
setConfig(config) {
localStorage.setItem('config', JSON.stringify(config));
},
getTheme() {
return localStorage.getItem('theme') || 'auto';
},
setTheme(theme) {
if (theme === 'auto') {
localStorage.removeItem('theme');
} else {
localStorage.setItem('theme', theme);
}
},
};
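// Illustrative sketch (not part of the app): how a conversation round-trips
// through StorageUtils. The conversation contents below are made up; only the
// helper names above are real.
//
//   const convId = StorageUtils.getNewConvId(); // e.g. 'conv-1700000000000'
//   StorageUtils.appendMsg(convId, { id: Date.now(), role: 'user', content: 'Hello' });
//   const conv = StorageUtils.getOneConversation(convId);
//   // conv => { id: 'conv-...', lastModified: <timestamp>, messages: [{ role: 'user', ... }] }
//   StorageUtils.remove(convId);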
// scroll to bottom of chat messages
// if requiresNearBottom is true, only auto-scroll if user is near bottom
const chatScrollToBottom = (requiresNearBottom) => {
const msgListElem = document.getElementById('messages-list');
const spaceToBottom = msgListElem.scrollHeight - msgListElem.scrollTop - msgListElem.clientHeight;
if (!requiresNearBottom || (spaceToBottom < 100)) {
setTimeout(() => msgListElem.scrollTo({ top: msgListElem.scrollHeight }), 1);
}
};
// wrapper for SSE
async function* sendSSEPostRequest(url, fetchOptions) {
const res = await fetch(url, fetchOptions);
const lines = res.body
.pipeThrough(new TextDecoderStream())
.pipeThrough(new TextLineStream());
for await (const line of lines) {
if (isDev) console.log({line});
if (line.startsWith('data:') && !line.endsWith('[DONE]')) {
const data = JSON.parse(line.slice(5));
yield data;
} else if (line.startsWith('error:')) {
const data = JSON.parse(line.slice(6));
throw new Error(data.message || 'Unknown error');
}
}
};
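// Illustrative sketch (not part of the app): consuming the SSE generator above
// against the llama-server /v1/chat/completions endpoint. The payload here is a
// minimal assumption; the real request is built in generateMessage() below.
//
//   for await (const chunk of sendSSEPostRequest(`${BASE_URL}/v1/chat/completions`, {
//     method: 'POST',
//     headers: { 'Content-Type': 'application/json' },
//     body: JSON.stringify({ stream: true, messages: [{ role: 'user', content: 'Hi' }] }),
//   })) {
//     console.log(chunk.choices?.[0]?.delta?.content ?? '');
//   }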
const mainApp = createApp({
components: {
VueMarkdown,
SettingsModalShortInput,
MessageBubble,
},
data() {
return {
conversations: StorageUtils.getAllConversations(),
messages: [], // { id: number, role: 'user' | 'assistant', content: string }
viewingConvId: StorageUtils.getNewConvId(),
inputMsg: '',
isGenerating: false,
pendingMsg: null, // the on-going message from assistant
stopGeneration: () => {},
selectedTheme: StorageUtils.getTheme(),
config: StorageUtils.getConfig(),
showConfigDialog: false,
// const
themes: THEMES,
configDefault: {...CONFIG_DEFAULT},
configInfo: {...CONFIG_INFO},
}
},
computed: {},
mounted() {
document.getElementById('app').classList.remove('opacity-0'); // show app
// scroll to the bottom when the pending message height is updated
const pendingMsgElem = document.getElementById('pending-msg');
const resizeObserver = new ResizeObserver(() => {
if (this.isGenerating) chatScrollToBottom(true);
});
resizeObserver.observe(pendingMsgElem);
},
watch: {
viewingConvId: function(val, oldVal) {
if (val != oldVal) {
this.fetchMessages();
chatScrollToBottom();
this.hideSidebar();
}
}
},
methods: {
hideSidebar() {
document.getElementById('toggle-drawer').checked = false;
},
setSelectedTheme(theme) {
this.selectedTheme = theme;
StorageUtils.setTheme(theme);
},
newConversation() {
if (this.isGenerating) return;
this.viewingConvId = StorageUtils.getNewConvId();
},
setViewingConv(convId) {
if (this.isGenerating) return;
this.viewingConvId = convId;
},
deleteConv(convId) {
if (this.isGenerating) return;
if (window.confirm('Are you sure you want to delete this conversation?')) {
StorageUtils.remove(convId);
if (this.viewingConvId === convId) {
this.viewingConvId = StorageUtils.getNewConvId();
}
this.fetchConversation();
this.fetchMessages();
}
},
downloadConv(convId) {
const conversation = StorageUtils.getOneConversation(convId);
if (!conversation) {
alert('Conversation not found.');
return;
}
const conversationJson = JSON.stringify(conversation, null, 2);
const blob = new Blob([conversationJson], { type: 'application/json' });
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = `conversation_${convId}.json`;
document.body.appendChild(a);
a.click();
document.body.removeChild(a);
URL.revokeObjectURL(url);
},
async sendMessage() {
if (!this.inputMsg) return;
const currConvId = this.viewingConvId;
StorageUtils.appendMsg(currConvId, {
id: Date.now(),
role: 'user',
content: this.inputMsg,
});
this.fetchConversation();
this.fetchMessages();
this.inputMsg = '';
this.generateMessage(currConvId);
chatScrollToBottom();
},
async generateMessage(currConvId) {
if (this.isGenerating) return;
this.pendingMsg = { id: Date.now()+1, role: 'assistant', content: null };
this.isGenerating = true;
try {
const abortController = new AbortController();
this.stopGeneration = () => abortController.abort();
const params = {
messages: [
{ role: 'system', content: this.config.systemMessage },
...this.messages,
],
stream: true,
cache_prompt: true,
samplers: this.config.samplers,
temperature: this.config.temperature,
dynatemp_range: this.config.dynatemp_range,
dynatemp_exponent: this.config.dynatemp_exponent,
top_k: this.config.top_k,
top_p: this.config.top_p,
min_p: this.config.min_p,
typical_p: this.config.typical_p,
xtc_probability: this.config.xtc_probability,
xtc_threshold: this.config.xtc_threshold,
repeat_last_n: this.config.repeat_last_n,
repeat_penalty: this.config.repeat_penalty,
presence_penalty: this.config.presence_penalty,
frequency_penalty: this.config.frequency_penalty,
dry_multiplier: this.config.dry_multiplier,
dry_base: this.config.dry_base,
dry_allowed_length: this.config.dry_allowed_length,
dry_penalty_last_n: this.config.dry_penalty_last_n,
max_tokens: this.config.max_tokens,
timings_per_token: !!this.config.showTokensPerSecond,
...(this.config.custom.length ? JSON.parse(this.config.custom) : {}),
};
const chunks = sendSSEPostRequest(`${BASE_URL}/v1/chat/completions`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'Authorization': this.config.apiKey ? `Bearer ${this.config.apiKey}` : undefined,
},
body: JSON.stringify(params),
signal: abortController.signal,
});
for await (const chunk of chunks) {
const stop = chunk.stop;
const addedContent = chunk.choices[0].delta.content;
const lastContent = this.pendingMsg.content || '';
if (addedContent) {
this.pendingMsg = {
id: this.pendingMsg.id,
role: 'assistant',
content: lastContent + addedContent,
};
}
const timings = chunk.timings;
if (timings && this.config.showTokensPerSecond) {
// only extract what's really needed, to save some space
this.pendingMsg.timings = {
prompt_n: timings.prompt_n,
prompt_ms: timings.prompt_ms,
predicted_n: timings.predicted_n,
predicted_ms: timings.predicted_ms,
};
}
}
StorageUtils.appendMsg(currConvId, this.pendingMsg);
this.fetchConversation();
this.fetchMessages();
setTimeout(() => document.getElementById('msg-input').focus(), 1);
} catch (error) {
if (error.name === 'AbortError') {
// user stopped the generation via stopGeneration() function
StorageUtils.appendMsg(currConvId, this.pendingMsg);
this.fetchConversation();
this.fetchMessages();
} else {
console.error(error);
alert(error);
// pop last user message
const lastUserMsg = StorageUtils.popMsg(currConvId);
this.inputMsg = lastUserMsg ? lastUserMsg.content : '';
}
}
this.pendingMsg = null;
this.isGenerating = false;
this.stopGeneration = () => {};
this.fetchMessages();
chatScrollToBottom();
},
// message actions
regenerateMsg(msg) {
if (this.isGenerating) return;
// TODO: somehow keep old history (like how ChatGPT has different "tree"). This can be done by adding "sub-conversations" with "subconv-" prefix, and new message will have a list of subconvIds
const currConvId = this.viewingConvId;
StorageUtils.filterAndKeepMsgs(currConvId, (m) => m.id < msg.id);
this.fetchConversation();
this.fetchMessages();
this.generateMessage(currConvId);
},
editUserMsgAndRegenerate(msg) {
if (this.isGenerating) return;
const currConvId = this.viewingConvId;
const newContent = msg.content;
StorageUtils.filterAndKeepMsgs(currConvId, (m) => m.id < msg.id);
StorageUtils.appendMsg(currConvId, {
id: Date.now(),
role: 'user',
content: newContent,
});
this.fetchConversation();
this.fetchMessages();
this.generateMessage(currConvId);
},
// settings dialog methods
closeAndSaveConfigDialog() {
try {
if (this.config.custom.length) JSON.parse(this.config.custom);
} catch (error) {
alert('Invalid JSON for custom config. Please either fix it or leave it empty.');
return;
}
for (const key of CONFIG_NUMERIC_KEYS) {
if (isNaN(this.config[key]) || this.config[key].toString().trim().length === 0) {
alert(`Invalid number for ${key} (expected an integer or a float)`);
return;
}
this.config[key] = parseFloat(this.config[key]);
}
this.showConfigDialog = false;
StorageUtils.setConfig(this.config);
},
closeAndDiscardConfigDialog() {
this.showConfigDialog = false;
this.config = StorageUtils.getConfig();
},
resetConfigDialog() {
if (window.confirm('Are you sure you want to reset all settings?')) {
this.config = {...CONFIG_DEFAULT};
}
},
// sync state functions
fetchConversation() {
this.conversations = StorageUtils.getAllConversations();
},
fetchMessages() {
this.messages = StorageUtils.getOneConversation(this.viewingConvId)?.messages ?? [];
},
},
});
mainApp.config.errorHandler = alert;
try {
mainApp.mount('#app');
} catch (err) {
console.error(err);
document.getElementById('app').innerHTML = `<div style="margin:2em auto">
Failed to start app. Please try clearing localStorage and try again.<br/>
<br/>
<button class="btn" onClick="localStorage.clear(); window.location.reload();">Clear localStorage</button>
</div>`;
}

View File

@ -0,0 +1,26 @@
@tailwind base;
@tailwind components;
@tailwind utilities;
.markdown {
h1, h2, h3, h4, h5, h6, ul, ol, li { all: revert; }
pre {
@apply whitespace-pre-wrap rounded-lg p-2;
border: 1px solid currentColor;
}
/* TODO: fix markdown table */
}
.show-on-hover {
@apply md:opacity-0 md:group-hover:opacity-100;
}
.btn-mini {
@apply cursor-pointer hover:shadow-md;
}
.chat-screen { max-width: 900px; }
.chat-bubble-base-300 {
--tw-bg-opacity: 1;
--tw-text-opacity: 1;
@apply bg-base-300 text-base-content;
}

View File

@ -0,0 +1,16 @@
/** @type {import('tailwindcss').Config} */
export default {
content: [
"./index.html",
"./src/**/*.{js,ts,jsx,tsx}",
],
theme: {
extend: {},
},
plugins: [
require('daisyui'),
],
daisyui: {
themes: ['light', 'dark', 'cupcake', 'bumblebee', 'emerald', 'corporate', 'synthwave', 'retro', 'cyberpunk', 'valentine', 'halloween', 'garden', 'forest', 'aqua', 'lofi', 'pastel', 'fantasy', 'wireframe', 'black', 'luxury', 'dracula', 'cmyk', 'autumn', 'business', 'acid', 'lemonade', 'night', 'coffee', 'winter', 'dim', 'nord', 'sunset'],
}
}

View File

@ -0,0 +1,36 @@
import { viteSingleFile } from 'vite-plugin-singlefile';
import path from 'path';
import fs from 'fs';
const GUIDE_FOR_FRONTEND = `
<!--
This is a single file build of the frontend.
It is automatically generated by the build process.
Do not edit this file directly.
To make changes, refer to the "Web UI" section in the README.
-->
`.trim();
export default {
plugins: [
viteSingleFile(),
(function llamaCppPlugin() {
let config;
return {
name: 'llamacpp:build',
apply: 'build',
async configResolved(_config) {
config = _config;
},
writeBundle() {
const outputIndexHtml = path.join(config.build.outDir, 'index.html');
const content = fs.readFileSync(outputIndexHtml, 'utf-8');
const targetOutputFile = path.join(config.build.outDir, '../../public/index.html');
fs.writeFileSync(targetOutputFile, GUIDE_FOR_FRONTEND + '\n' + content);
}
}
})(),
],
};

View File

@ -394,7 +394,7 @@ int main(int raw_argc, char ** raw_argv) {
}
if (show_token_count) {
printf("Total number of tokens: %ld\n", tokens.size());
printf("Total number of tokens: %zu\n", tokens.size());
}
// silence valgrind
llama_free(ctx);

View File

@ -32,6 +32,13 @@ else()
endif()
endif()
# remove the lib prefix on win32 mingw
if (WIN32)
set(CMAKE_STATIC_LIBRARY_PREFIX "")
set(CMAKE_SHARED_LIBRARY_PREFIX "")
set(CMAKE_SHARED_MODULE_PREFIX "")
endif()
option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
@ -92,30 +99,33 @@ else()
set(INS_ENB ON)
endif()
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF)
option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
option(GGML_AVX512 "ggml: enable AVX512" OFF)
option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF)
option(GGML_AMX_TILE "ggml: enable AMX-TILE" OFF)
option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF)
option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF)
option(GGML_FMA "ggml: enable FMA" ${INS_ENB})
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF)
option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
option(GGML_AVX512 "ggml: enable AVX512F" OFF)
option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF)
if (NOT MSVC)
option(GGML_F16C "ggml: enable F16C" ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512
# in MSVC F16C and FMA is implied with AVX2/AVX512
option(GGML_FMA "ggml: enable FMA" ${INS_ENB})
option(GGML_F16C "ggml: enable F16C" ${INS_ENB})
# MSVC does not seem to support AMX
option(GGML_AMX_TILE "ggml: enable AMX-TILE" OFF)
option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF)
option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF)
endif()
option(GGML_LASX "ggml: enable lasx" ON)
option(GGML_LSX "ggml: enable lsx" ON)
option(GGML_RVV "ggml: enable rvv" ON)
option(GGML_SVE "ggml: enable SVE" OFF)
option(GGML_LASX "ggml: enable lasx" ON)
option(GGML_LSX "ggml: enable lsx" ON)
option(GGML_RVV "ggml: enable rvv" ON)
option(GGML_SVE "ggml: enable SVE" OFF)
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
if (WIN32)
set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows Version")
set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows version")
endif()
# ggml core
@ -170,6 +180,11 @@ set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
"ggml: sycl device architecture")
option(GGML_QNN "ggml: use QNN" OFF)
option(GGML_OPENCL "ggml: use OpenCL" OFF)
option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF)
option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON)
option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON)
# extra artifacts
option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
@ -181,11 +196,7 @@ option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED true)
if (GGML_SYCL OR GGML_QNN)
set(CMAKE_CXX_STANDARD 17)
else()
set(CMAKE_CXX_STANDARD 11)
endif()
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED true)
set(THREADS_PREFER_PTHREAD_FLAG ON)

View File

@ -228,6 +228,7 @@ extern "C" {
GGML_API void ggml_backend_unload(ggml_backend_reg_t reg);
// Load all known backends from dynamic libraries
GGML_API void ggml_backend_load_all(void);
GGML_API void ggml_backend_load_all_from_path(const char * dir_path);
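// Example (illustrative, not part of this header): loading backends from an
// application-provided directory instead of the default search paths; the
// "/opt/myapp/ggml-backends" path is a made-up placeholder.
//
//   ggml_backend_load_all_from_path("/opt/myapp/ggml-backends");
//   // or, to use the default paths (./ and the executable's directory):
//   ggml_backend_load_all();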
//
// Backend scheduler

View File

@ -103,24 +103,14 @@ extern "C" {
// Internal types and functions exposed for tests and benchmarks
typedef void (*ggml_from_float_to_mat_t)
(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
const void * GGML_RESTRICT y, size_t by, int nrc);
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
struct ggml_type_traits_cpu {
ggml_from_float_t from_float;
ggml_from_float_to_mat_t from_float_to_mat;
ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously
int64_t ncols; // number of columns to process simultaneously
ggml_gemv_t gemv;
ggml_gemm_t gemm;
};
GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
@ -140,13 +130,6 @@ extern "C" {
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
#ifdef GGML_USE_CPU_HBM
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
#endif
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
GGML_BACKEND_API bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft);
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,26 @@
#ifndef GGML_OPENCL_H
#define GGML_OPENCL_H
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
//
// backend API
//
GGML_BACKEND_API ggml_backend_t ggml_backend_opencl_init(void);
GGML_BACKEND_API bool ggml_backend_is_opencl(ggml_backend_t backend);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_opencl_reg(void);
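// Example (illustrative): creating an OpenCL backend directly when the library
// is built with GGML_OPENCL; with GGML_BACKEND_DL the backend is instead picked
// up through the registry via ggml_backend_load_all().
//
//   ggml_backend_t backend = ggml_backend_opencl_init();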
#ifdef __cplusplus
}
#endif
#endif // GGML_OPENCL_H

View File

@ -384,15 +384,15 @@ extern "C" {
GGML_TYPE_F64 = 28,
GGML_TYPE_IQ1_M = 29,
GGML_TYPE_BF16 = 30,
GGML_TYPE_Q4_0_4_4 = 31,
GGML_TYPE_Q4_0_4_8 = 32,
GGML_TYPE_Q4_0_8_8 = 33,
// GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
// GGML_TYPE_Q4_0_4_8 = 32,
// GGML_TYPE_Q4_0_8_8 = 33,
GGML_TYPE_TQ1_0 = 34,
GGML_TYPE_TQ2_0 = 35,
GGML_TYPE_IQ4_NL_4_4 = 36,
// GGML_TYPE_IQ4_NL_4_4 = 36,
// GGML_TYPE_IQ4_NL_4_8 = 37,
// GGML_TYPE_IQ4_NL_8_8 = 38,
GGML_TYPE_COUNT,
GGML_TYPE_COUNT = 39,
};
// precision
@ -433,9 +433,6 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
};
// available tensor operations:
@ -499,6 +496,7 @@ extern "C" {
GGML_OP_POOL_2D_BACK,
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D,
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
@ -1695,6 +1693,13 @@ extern "C" {
int p2,
int p3);
// pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int p0,
int p1);
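// Example (illustrative): with p0 = 1 and p1 = 2, a row [a, b, c, d]
// becomes [b, a, b, c, d, c, b], i.e. p0 elements reflected on the left
// and p1 elements reflected on the right.
//
//   struct ggml_tensor * padded = ggml_pad_reflect_1d(ctx, a, 1, 2);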
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
// timesteps: [N,]
// return: [N, dim]
@ -2197,11 +2202,19 @@ extern "C" {
GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
#ifdef __cplusplus
// restrict not standard in C++
#define GGML_RESTRICT
#ifdef __cplusplus
// restrict not standard in C++
# if defined(__GNUC__)
# define GGML_RESTRICT __restrict__
# elif defined(__clang__)
# define GGML_RESTRICT __restrict
# elif defined(_MSC_VER)
# define GGML_RESTRICT __restrict
# else
# define GGML_RESTRICT
# endif
#else
#define GGML_RESTRICT restrict
# define GGML_RESTRICT restrict
#endif
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

View File

@ -194,11 +194,6 @@ endif()
if (WIN32)
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
if (BUILD_SHARED_LIBS)
# TODO: should not use this
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
endif()
endif()
# ggml
@ -220,9 +215,7 @@ add_library(ggml-base
ggml-threading.cpp
ggml-threading.h
ggml-quants.c
ggml-quants.h
ggml-aarch64.c
ggml-aarch64.h)
ggml-quants.h)
target_include_directories(ggml-base PRIVATE .)
@ -269,7 +262,42 @@ function(ggml_add_backend backend)
endif()
endfunction()
function(ggml_add_cpu_backend_variant tag_name)
set(GGML_CPU_TAG_NAME ${tag_name})
# other: OPENMP LLAMAFILE CPU_HBM
foreach (feat NATIVE
AVX AVX2 AVX_VNNI FMA F16C
AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
AMX_TILE AMX_INT8 AMX_BF16)
set(GGML_${feat} OFF)
endforeach()
foreach (feat ${ARGN})
set(GGML_${feat} ON)
endforeach()
ggml_add_cpu_backend_variant_impl(${tag_name})
endfunction()
ggml_add_backend(CPU)
if (GGML_CPU_ALL_VARIANTS)
if (NOT GGML_BACKEND_DL)
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
endif()
ggml_add_cpu_backend_variant(sandybridge AVX)
ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 FMA)
ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512)
ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
if (NOT MSVC)
# MSVC doesn't support AVX-VNNI or AMX
ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 FMA AVX_VNNI)
ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
endif()
else ()
ggml_add_cpu_backend_variant_impl("")
endif()
ggml_add_backend(BLAS)
ggml_add_backend(CANN)
ggml_add_backend(CUDA)
@ -280,6 +308,7 @@ ggml_add_backend(MUSA)
ggml_add_backend(RPC)
ggml_add_backend(SYCL)
ggml_add_backend(Vulkan)
ggml_add_backend(OpenCL)
ggml_add_backend(QNN)
foreach (target ggml-base ggml)

View File

@ -1,129 +0,0 @@
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "ggml-aarch64.h"
#include "ggml-impl.h"
#include "ggml-quants.h"
#include <assert.h>
#define UNUSED GGML_UNUSED
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
block_q4_0x4 out;
for (int i = 0; i < 4; i++) {
out.d[i] = in[i].d;
}
const int end = QK4_0 * 2 / blck_size_interleave;
if (blck_size_interleave == 8) {
const uint64_t xor_mask = 0x8888888888888888ULL;
for (int i = 0; i < end; ++i) {
int src_id = i % 4;
int src_offset = (i / 4) * blck_size_interleave;
int dst_offset = i * blck_size_interleave;
uint64_t elems;
// Using memcpy to avoid unaligned memory accesses
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
elems ^= xor_mask;
memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
}
} else if (blck_size_interleave == 4) {
const uint32_t xor_mask = 0x88888888;
for (int i = 0; i < end; ++i) {
int src_id = i % 4;
int src_offset = (i / 4) * blck_size_interleave;
int dst_offset = i * blck_size_interleave;
uint32_t elems;
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
elems ^= xor_mask;
memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
}
} else {
GGML_ASSERT(false);
}
return out;
}
// interleave 8 block_q4_0s in blocks of blck_size_interleave
// returns an interleaved block_q4_0x8
// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
block_q4_0x8 out;
for (int i = 0; i < 8; i++) {
out.d[i] = in[i].d;
}
const int end = QK4_0 * 4 / blck_size_interleave;
const uint64_t xor_mask = 0x8888888888888888ULL;
for (int i = 0; i < end; ++i) {
int src_id = i % 8;
int src_offset = (i / 8) * blck_size_interleave;
int dst_offset = i * blck_size_interleave;
uint64_t elems;
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
elems ^= xor_mask;
memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
}
return out;
}
static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, int nrows_interleaved, int blck_size_interleave) {
assert(n_per_row % QK4_0 == 0);
const int nb = n_per_row / QK4_0;
void * out_ptr = NULL;
if (nrows_interleaved == 8) {
out_ptr = (block_q4_0x8 *) dst;
}
else if (nrows_interleaved == 4) {
out_ptr = (block_q4_0x4 *) dst;
}
assert(nrows_interleaved <= 8);
block_q4_0 dst_tmp[8];
for (int b = 0; b < (nrow * n_per_row); b += nrows_interleaved * n_per_row) {
for (int64_t x = 0; x < nb; x++) {
for (int i = 0; i < nrows_interleaved; i++ ) {
quantize_row_q4_0_ref(src + b + i * n_per_row + x * QK4_0, (block_q4_0 *) dst_tmp + i, QK4_0);
}
if (nrows_interleaved == 8) {
*(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blck_size_interleave);
out_ptr = (block_q4_0x8 *) out_ptr + 1;
}
else if (nrows_interleaved == 4) {
*(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blck_size_interleave);
out_ptr = (block_q4_0x4 *) out_ptr + 1;
}
}
}
return ((nrow * n_per_row) / QK4_0 * sizeof(block_q4_0));
}
size_t quantize_q4_0_4x4(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
UNUSED(quant_weights);
return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 4);
}
size_t quantize_q4_0_4x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
UNUSED(quant_weights);
return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 8);
}
size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
UNUSED(quant_weights);
return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
}

View File

@ -1,19 +0,0 @@
#pragma once
#include "ggml.h"
// GGML internal header
#ifdef __cplusplus
extern "C" {
#endif
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
#ifdef __cplusplus
}
#endif

View File

@ -46,6 +46,10 @@
#include "ggml-vulkan.h"
#endif
#ifdef GGML_USE_OPENCL
#include "ggml-opencl.h"
#endif
#ifdef GGML_USE_BLAS
#include "ggml-blas.h"
#endif
@ -150,6 +154,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_VULKAN
register_backend(ggml_backend_vk_reg());
#endif
#ifdef GGML_USE_OPENCL
register_backend(ggml_backend_opencl_reg());
#endif
#ifdef GGML_USE_CANN
register_backend(ggml_backend_cann_reg());
#endif
@ -456,11 +463,21 @@ static std::string backend_filename_suffix() {
#endif
}
static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) {
static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
// enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
// TODO: search system paths
std::vector<std::string> search_paths = { "./", get_executable_path() };
std::string file_prefix = backend_filename_prefix() + name + "-";
std::vector<std::string> search_paths;
if (user_search_path == nullptr) {
search_paths.push_back("./");
search_paths.push_back(get_executable_path());
} else {
#if defined(_WIN32)
search_paths.push_back(std::string(user_search_path) + "\\");
#else
search_paths.push_back(std::string(user_search_path) + "/");
#endif
}
int best_score = 0;
std::string best_path;
@ -470,7 +487,8 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent)
if (!fs::exists(search_path)) {
continue;
}
for (const auto & entry : fs::directory_iterator(search_path)) {
fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
for (const auto & entry : dir_it) {
if (entry.is_regular_file()) {
std::string filename = entry.path().filename().string();
std::string ext = entry.path().extension().string();
@ -490,6 +508,10 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent)
best_score = s;
best_path = entry.path().string();
}
} else {
if (!silent) {
GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, entry.path().string().c_str());
}
}
}
}
@ -512,15 +534,26 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent)
}
void ggml_backend_load_all() {
ggml_backend_load_best("blas", true);
ggml_backend_load_best("cann", true);
ggml_backend_load_best("cuda", true);
ggml_backend_load_best("hip", true);
ggml_backend_load_best("kompute", true);
ggml_backend_load_best("metal", true);
ggml_backend_load_best("rpc", true);
ggml_backend_load_best("sycl", true);
ggml_backend_load_best("vulkan", true);
ggml_backend_load_best("musa", true);
ggml_backend_load_best("cpu", true);
ggml_backend_load_all_from_path(nullptr);
}
void ggml_backend_load_all_from_path(const char * dir_path) {
#ifdef NDEBUG
bool silent = true;
#else
bool silent = false;
#endif
ggml_backend_load_best("blas", silent, dir_path);
ggml_backend_load_best("cann", silent, dir_path);
ggml_backend_load_best("cuda", silent, dir_path);
ggml_backend_load_best("hip", silent, dir_path);
ggml_backend_load_best("kompute", silent, dir_path);
ggml_backend_load_best("metal", silent, dir_path);
ggml_backend_load_best("rpc", silent, dir_path);
ggml_backend_load_best("sycl", silent, dir_path);
ggml_backend_load_best("vulkan", silent, dir_path);
ggml_backend_load_best("opencl", silent, dir_path);
ggml_backend_load_best("musa", silent, dir_path);
ggml_backend_load_best("cpu", silent, dir_path);
}

View File

@ -2089,7 +2089,7 @@ static void * ggml_backend_cann_reg_get_proc_address(ggml_backend_reg_t reg, con
static const ggml_backend_reg_i ggml_backend_cann_reg_interface = {
/* .get_name = */ ggml_backend_cann_reg_get_name,
/* .get_device_count = */ ggml_backend_cann_reg_get_device_count,
/* .get_device_get = */ ggml_backend_cann_reg_get_device,
/* .get_device = */ ggml_backend_cann_reg_get_device,
/* .get_proc_address = */ ggml_backend_cann_reg_get_proc_address,
};

View File

@ -6,7 +6,20 @@
typedef uint16_t ggml_half;
typedef uint32_t ggml_half2;
#define GGML_COMMON_AGGR
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S
#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_CPP)
#include <cstdint>
typedef uint16_t ggml_half;
typedef uint32_t ggml_half2;
// standard C++ allows anonymous unions, but some compilers warn about them
#define GGML_COMMON_AGGR_U data
// standard C++ does not allow anonymous structs
#define GGML_COMMON_AGGR_S data
#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_METAL)
@ -15,7 +28,8 @@ typedef uint32_t ggml_half2;
typedef half ggml_half;
typedef half2 ggml_half2;
#define GGML_COMMON_AGGR
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S
#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_CUDA)
@ -29,7 +43,8 @@ typedef half2 ggml_half2;
typedef half ggml_half;
typedef half2 ggml_half2;
#define GGML_COMMON_AGGR data
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S data
#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_HIP)
@ -39,7 +54,8 @@ typedef half2 ggml_half2;
typedef half ggml_half;
typedef half2 ggml_half2;
#define GGML_COMMON_AGGR data
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S data
#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_SYCL)
@ -49,7 +65,8 @@ typedef half2 ggml_half2;
typedef sycl::half ggml_half;
typedef sycl::half2 ggml_half2;
#define GGML_COMMON_AGGR data
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S data
#define GGML_COMMON_DECL
#endif
@ -154,9 +171,9 @@ typedef struct {
struct {
ggml_half d; // delta
ggml_half m; // min
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 dm;
};
} GGML_COMMON_AGGR_U;
uint8_t qs[QK4_1 / 2]; // nibbles / quants
} block_q4_1;
static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");
@ -175,9 +192,9 @@ typedef struct {
struct {
ggml_half d; // delta
ggml_half m; // min
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 dm;
};
} GGML_COMMON_AGGR_U;
uint8_t qh[4]; // 5-th bit of quants
uint8_t qs[QK5_1 / 2]; // nibbles / quants
} block_q5_1;
@ -196,37 +213,13 @@ typedef struct {
struct {
ggml_half d; // delta
ggml_half s; // d * sum(qs[i])
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 ds;
};
} GGML_COMMON_AGGR_U;
int8_t qs[QK8_1]; // quants
} block_q8_1;
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
typedef struct {
ggml_half d[4]; // deltas for 4 q4_0 blocks
uint8_t qs[QK4_0 * 2]; // nibbles / quants for 4 q4_0 blocks
} block_q4_0x4;
static_assert(sizeof(block_q4_0x4) == 4 * sizeof(ggml_half) + QK4_0 * 2, "wrong q4_0x4 block size/padding");
typedef struct {
ggml_half d[8]; // deltas for 8 q4_0 blocks
uint8_t qs[QK4_0 * 4]; // nibbles / quants for 8 q4_0 blocks
} block_q4_0x8;
static_assert(sizeof(block_q4_0x8) == 8 * sizeof(ggml_half) + QK4_0 * 4, "wrong q4_0x8 block size/padding");
typedef struct {
ggml_half d[4]; // deltas for 4 q8_0 blocks
int8_t qs[QK8_0 * 4]; // quants for 4 q8_0 blocks
} block_q8_0x4;
static_assert(sizeof(block_q8_0x4) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong q8_0x4 block size/padding");
typedef struct {
ggml_half d[8]; // deltas for 8 q8_0 blocks
int8_t qs[QK8_0 * 8]; // quants for 8 q8_0 blocks
} block_q8_0x8;
static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding");
//
// Ternary quantization
//
@ -261,9 +254,9 @@ typedef struct {
struct {
ggml_half d; // super-block scale for quantized scales
ggml_half dmin; // super-block scale for quantized mins
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 dm;
};
} GGML_COMMON_AGGR_U;
} block_q2_K;
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
@ -288,9 +281,9 @@ typedef struct {
struct {
ggml_half d; // super-block scale for quantized scales
ggml_half dmin; // super-block scale for quantized mins
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 dm;
};
} GGML_COMMON_AGGR_U;
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qs[QK_K/2]; // 4-bit quants
} block_q4_K;
@ -305,9 +298,9 @@ typedef struct {
struct {
ggml_half d; // super-block scale for quantized scales
ggml_half dmin; // super-block scale for quantized mins
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 dm;
};
} GGML_COMMON_AGGR_U;
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
@ -418,12 +411,6 @@ typedef struct {
} block_iq4_xs;
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
typedef struct {
ggml_half d[4]; // deltas for 4 iq4_nl blocks
uint8_t qs[QK4_NL * 2];// nibbles / quants for 4 iq4_nl blocks
} block_iq4_nlx4;
static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
#endif // GGML_COMMON_DECL
#endif // GGML_COMMON_DECL
@ -437,6 +424,13 @@ static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wro
#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
#define GGML_TABLE_END() };
#define GGML_COMMON_IMPL
#elif defined(GGML_COMMON_IMPL_CPP)
#include <cstdint>
#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
#define GGML_TABLE_END() };
#define GGML_COMMON_IMPL
#elif defined(GGML_COMMON_IMPL_METAL)
#include <metal_stdlib>
@ -479,7 +473,7 @@ GGML_TABLE_BEGIN(uint8_t, ksigns_iq2xs, 128)
240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
GGML_TABLE_END()
//#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
//#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A // lowest compute capability for integer intrinsics
GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,

View File

@ -1,319 +1,358 @@
ggml_add_backend_library(ggml-cpu)
list (APPEND GGML_CPU_SOURCES
ggml-cpu.c
ggml-cpu.cpp
ggml-cpu-aarch64.c
ggml-cpu-aarch64.h
ggml-cpu-quants.c
ggml-cpu-quants.h
amx/amx.cpp
amx/amx.h
amx/mmq.cpp
amx/mmq.h
ggml-cpu-impl.h
)
target_compile_features(ggml-cpu PRIVATE c_std_11 cxx_std_17)
target_include_directories(ggml-cpu PRIVATE .)
if (APPLE AND GGML_ACCELERATE)
find_library(ACCELERATE_FRAMEWORK Accelerate)
if (ACCELERATE_FRAMEWORK)
message(STATUS "Accelerate framework found")
target_compile_definitions(ggml-cpu PRIVATE GGML_USE_ACCELERATE)
target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_NEW_LAPACK)
target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_LAPACK_ILP64)
target_link_libraries(ggml-cpu PRIVATE ${ACCELERATE_FRAMEWORK})
function(ggml_add_cpu_backend_variant_impl tag_name)
if (tag_name)
set(GGML_CPU_NAME ggml-cpu-${tag_name})
else()
message(WARNING "Accelerate framework not found")
set(GGML_CPU_NAME ggml-cpu)
endif()
endif()
if (GGML_OPENMP)
find_package(OpenMP)
if (OpenMP_FOUND)
message(STATUS "OpenMP found")
ggml_add_backend_library(${GGML_CPU_NAME})
target_compile_definitions(ggml-cpu PRIVATE GGML_USE_OPENMP)
list (APPEND GGML_CPU_SOURCES
ggml-cpu/ggml-cpu.c
ggml-cpu/ggml-cpu.cpp
ggml-cpu/ggml-cpu-aarch64.cpp
ggml-cpu/ggml-cpu-aarch64.h
ggml-cpu/ggml-cpu-hbm.cpp
ggml-cpu/ggml-cpu-hbm.h
ggml-cpu/ggml-cpu-quants.c
ggml-cpu/ggml-cpu-quants.h
ggml-cpu/ggml-cpu-traits.cpp
ggml-cpu/ggml-cpu-traits.h
ggml-cpu/amx/amx.cpp
ggml-cpu/amx/amx.h
ggml-cpu/amx/mmq.cpp
ggml-cpu/amx/mmq.h
ggml-cpu/ggml-cpu-impl.h
)
target_link_libraries(ggml-cpu PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
else()
message(WARNING "OpenMP not found")
target_compile_features(${GGML_CPU_NAME} PRIVATE c_std_11 cxx_std_17)
target_include_directories(${GGML_CPU_NAME} PRIVATE . ggml-cpu)
if (APPLE AND GGML_ACCELERATE)
find_library(ACCELERATE_FRAMEWORK Accelerate)
if (ACCELERATE_FRAMEWORK)
message(STATUS "Accelerate framework found")
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_ACCELERATE)
target_compile_definitions(${GGML_CPU_NAME} PRIVATE ACCELERATE_NEW_LAPACK)
target_compile_definitions(${GGML_CPU_NAME} PRIVATE ACCELERATE_LAPACK_ILP64)
target_link_libraries(${GGML_CPU_NAME} PRIVATE ${ACCELERATE_FRAMEWORK})
else()
message(WARNING "Accelerate framework not found")
endif()
endif()
endif()
if (GGML_LLAMAFILE)
message(STATUS "Using llamafile")
if (GGML_OPENMP)
find_package(OpenMP)
if (OpenMP_FOUND)
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP)
target_compile_definitions(ggml-cpu PRIVATE GGML_USE_LLAMAFILE)
target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
else()
message(WARNING "OpenMP not found")
endif()
endif()
list(APPEND GGML_CPU_SOURCES
llamafile/sgemm.cpp
llamafile/sgemm.h)
endif()
if (GGML_LLAMAFILE)
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_LLAMAFILE)
if (GGML_CPU_HBM)
find_library(memkind memkind REQUIRED)
list(APPEND GGML_CPU_SOURCES
ggml-cpu/llamafile/sgemm.cpp
ggml-cpu/llamafile/sgemm.h)
endif()
message(STATUS "Using memkind for CPU HBM")
if (GGML_CPU_HBM)
find_library(memkind memkind REQUIRED)
target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_HBM)
message(STATUS "Using memkind for CPU HBM")
target_link_libraries(ggml-cpu PUBLIC memkind)
endif()
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_HBM)
if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
(NOT CMAKE_OSX_ARCHITECTURES AND
NOT CMAKE_GENERATOR_PLATFORM_LWR AND
CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
target_link_libraries(${GGML_CPU_NAME} PUBLIC memkind)
endif()
message(STATUS "ARM detected")
if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
(NOT CMAKE_OSX_ARCHITECTURES AND
NOT CMAKE_GENERATOR_PLATFORM_LWR AND
CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
if (MSVC)
list(APPEND ARCH_DEFINITIONS __aarch64__) # MSVC defines _M_ARM64 instead
list(APPEND ARCH_DEFINITIONS __ARM_NEON)
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FMA)
message(STATUS "ARM detected")
set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
if (MSVC)
list(APPEND ARCH_DEFINITIONS __aarch64__) # MSVC defines _M_ARM64 instead
list(APPEND ARCH_DEFINITIONS __ARM_NEON)
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FMA)
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
if (GGML_COMPILER_SUPPORT_DOTPROD)
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
message(STATUS "ARM feature DOTPROD enabled")
endif ()
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
if (GGML_COMPILER_SUPPORT_DOTPROD)
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
message(STATUS "ARM feature MATMUL_INT8 enabled")
endif ()
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled")
endif ()
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
elseif (APPLE)
if (GGML_NATIVE)
set(USER_PROVIDED_MARCH FALSE)
foreach(flag_var IN ITEMS CMAKE_C_FLAGS CMAKE_CXX_FLAGS CMAKE_REQUIRED_FLAGS)
if ("${${flag_var}}" MATCHES "-march=[a-zA-Z0-9+._-]+")
set(USER_PROVIDED_MARCH TRUE)
break()
endif()
endforeach()
if (NOT USER_PROVIDED_MARCH)
set(MARCH_FLAGS "-march=armv8.2a")
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
if (GGML_COMPILER_SUPPORT_DOTPROD)
set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod")
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
message(STATUS "ARM feature DOTPROD enabled")
endif ()
set(TEST_I8MM_FLAGS "-march=armv8.2a+i8mm")
set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}")
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm")
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
message(STATUS "ARM feature MATMUL_INT8 enabled")
endif ()
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
list(APPEND ARCH_FLAGS "${MARCH_FLAGS}")
message(STATUS "ARM feature DOTPROD enabled")
endif ()
endif ()
else()
check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
list(APPEND ARCH_FLAGS -mfp16-format=ieee)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
# Raspberry Pi 1, Zero
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
# Android armeabi-v7a
list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
else()
# Raspberry Pi 2
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
message(STATUS "ARM feature MATMUL_INT8 enabled")
endif ()
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled")
endif ()
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
elseif (APPLE)
if (GGML_NATIVE)
set(USER_PROVIDED_MARCH FALSE)
foreach(flag_var IN ITEMS CMAKE_C_FLAGS CMAKE_CXX_FLAGS CMAKE_REQUIRED_FLAGS)
if ("${${flag_var}}" MATCHES "-march=[a-zA-Z0-9+._-]+")
set(USER_PROVIDED_MARCH TRUE)
break()
endif()
endforeach()
if (NOT USER_PROVIDED_MARCH)
set(MARCH_FLAGS "-march=armv8.2a")
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
if (GGML_COMPILER_SUPPORT_DOTPROD)
set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod")
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
message(STATUS "ARM feature DOTPROD enabled")
endif ()
set(TEST_I8MM_FLAGS "-march=armv8.2a+i8mm")
set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}")
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm")
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
message(STATUS "ARM feature MATMUL_INT8 enabled")
endif ()
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
list(APPEND ARCH_FLAGS "${MARCH_FLAGS}")
endif ()
endif ()
else()
check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
list(APPEND ARCH_FLAGS -mfp16-format=ieee)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
# Raspberry Pi 1, Zero
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
# Android armeabi-v7a
list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
else()
# Raspberry Pi 2
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
endif()
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
# Android arm64-v8a
# Raspberry Pi 3, 4, Zero 2 (32-bit)
list(APPEND ARCH_FLAGS -mno-unaligned-access)
endif()
if (GGML_SVE)
list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
endif()
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
# Android arm64-v8a
# Raspberry Pi 3, 4, Zero 2 (32-bit)
list(APPEND ARCH_FLAGS -mno-unaligned-access)
endif()
if (GGML_SVE)
list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
endif()
endif()
elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
message(STATUS "x86 detected")
if (MSVC)
# instruction set detection for MSVC only
if (GGML_NATIVE)
include(cmake/FindSIMD.cmake)
endif ()
if (GGML_AVX512)
list(APPEND ARCH_FLAGS /arch:AVX512)
# MSVC has no compile-time flags enabling specific
# AVX512 extensions, neither it defines the
# macros corresponding to the extensions.
# Do it manually.
if (GGML_AVX512_VBMI)
list(APPEND ARCH_DEFINITIONS __AVX512VBMI__)
if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
if (MSVC)
# instruction set detection for MSVC only
if (GGML_NATIVE)
include(ggml-cpu/cmake/FindSIMD.cmake)
endif ()
if (GGML_AVX512)
list(APPEND ARCH_FLAGS /arch:AVX512)
# /arch:AVX512 includes: __AVX512F__, __AVX512CD__, __AVX512BW__, __AVX512DQ__, and __AVX512VL__
# MSVC has no compile-time flags enabling specific
# AVX512 extensions, neither it defines the
# macros corresponding to the extensions.
# Do it manually.
list(APPEND ARCH_DEFINITIONS GGML_AVX512)
if (GGML_AVX512_VBMI)
list(APPEND ARCH_DEFINITIONS __AVX512VBMI__)
if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
list(APPEND ARCH_FLAGS -mavx512vbmi)
endif()
endif()
if (GGML_AVX512_VNNI)
list(APPEND ARCH_DEFINITIONS __AVX512VNNI__ GGML_AVX512_VNNI)
if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
list(APPEND ARCH_FLAGS -mavx512vnni)
endif()
endif()
if (GGML_AVX512_BF16)
list(APPEND ARCH_DEFINITIONS __AVX512BF16__ GGML_AVX512_BF16)
if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
list(APPEND ARCH_FLAGS -mavx512bf16)
endif()
endif()
if (GGML_AMX_TILE)
list(APPEND ARCH_DEFINITIONS __AMX_TILE__ GGML_AMX_TILE)
endif()
if (GGML_AMX_INT8)
list(APPEND ARCH_DEFINITIONS __AMX_INT8__ GGML_AMX_INT8)
endif()
if (GGML_AMX_BF16)
list(APPEND ARCH_DEFINITIONS __AMX_BF16__ GGML_AMX_BF16)
endif()
elseif (GGML_AVX2)
list(APPEND ARCH_FLAGS /arch:AVX2)
list(APPEND ARCH_DEFINITIONS GGML_AVX2 GGML_FMA GGML_F16C)
elseif (GGML_AVX)
list(APPEND ARCH_FLAGS /arch:AVX)
list(APPEND ARCH_DEFINITIONS GGML_AVX)
else ()
list(APPEND ARCH_FLAGS /arch:SSE4.2)
list(APPEND ARCH_DEFINITIONS GGML_SSE42)
endif()
if (GGML_AVX_VNNI)
# MSVC generates AVX512 with AVX-VNNI intrinsics even with /arch:AVX2
#list(APPEND ARCH_DEFINITIONS __AVXVNNI__ GGML_AVX_VNNI)
endif()
else ()
if (GGML_NATIVE)
list(APPEND ARCH_FLAGS -march=native)
else ()
list(APPEND ARCH_FLAGS -msse4.2)
list(APPEND ARCH_DEFINITIONS GGML_SSE42)
if (GGML_F16C)
list(APPEND ARCH_FLAGS -mf16c)
list(APPEND ARCH_DEFINITIONS GGML_F16C)
endif()
if (GGML_FMA)
list(APPEND ARCH_FLAGS -mfma)
list(APPEND ARCH_DEFINITIONS GGML_FMA)
endif()
if (GGML_AVX)
list(APPEND ARCH_FLAGS -mavx)
list(APPEND ARCH_DEFINITIONS GGML_AVX)
endif()
if (GGML_AVX2)
list(APPEND ARCH_FLAGS -mavx2)
list(APPEND ARCH_DEFINITIONS GGML_AVX2)
endif()
if (GGML_AVX_VNNI)
list(APPEND ARCH_FLAGS -mavxvnni)
list(APPEND ARCH_DEFINITIONS GGML_AVX_VNNI)
endif()
if (GGML_AVX512)
list(APPEND ARCH_FLAGS -mavx512f)
list(APPEND ARCH_FLAGS -mavx512cd)
list(APPEND ARCH_FLAGS -mavx512vl)
list(APPEND ARCH_FLAGS -mavx512dq)
list(APPEND ARCH_FLAGS -mavx512bw)
list(APPEND ARCH_DEFINITIONS GGML_AVX512)
endif()
if (GGML_AVX512_VBMI)
list(APPEND ARCH_FLAGS -mavx512vbmi)
list(APPEND ARCH_DEFINITIONS GGML_AVX512_VBMI)
endif()
endif()
if (GGML_AVX512_VNNI)
list(APPEND ARCH_DEFINITIONS __AVX512VNNI__)
if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
if (GGML_AVX512_VNNI)
list(APPEND ARCH_FLAGS -mavx512vnni)
list(APPEND ARCH_DEFINITIONS GGML_AVX512_VNNI)
endif()
endif()
if (GGML_AVX512_BF16)
list(APPEND ARCH_DEFINITIONS __AVX512BF16__)
if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
if (GGML_AVX512_BF16)
list(APPEND ARCH_FLAGS -mavx512bf16)
list(APPEND ARCH_DEFINITIONS GGML_AVX512_BF16)
endif()
if (GGML_AMX_TILE)
list(APPEND ARCH_FLAGS -mamx-tile)
list(APPEND ARCH_DEFINITIONS GGML_AMX_TILE)
endif()
if (GGML_AMX_INT8)
list(APPEND ARCH_FLAGS -mamx-int8)
list(APPEND ARCH_DEFINITIONS GGML_AMX_INT8)
endif()
if (GGML_AMX_BF16)
list(APPEND ARCH_FLAGS -mamx-bf16)
list(APPEND ARCH_DEFINITIONS GGML_AMX_BF16)
endif()
endif()
if (GGML_AMX_TILE)
list(APPEND ARCH_DEFINITIONS __AMX_TILE__)
endif()
if (GGML_AMX_INT8)
list(APPEND ARCH_DEFINITIONS __AMX_INT8__)
endif()
if (GGML_AMX_BF16)
list(APPEND ARCH_DEFINITIONS __AMX_BF16__)
endif()
elseif (GGML_AVX2)
list(APPEND ARCH_FLAGS /arch:AVX2)
elseif (GGML_AVX)
list(APPEND ARCH_FLAGS /arch:AVX)
endif()
if (GGML_AVX_VNNI)
list(APPEND ARCH_DEFINITIONS __AVXVNNI__)
if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
list(APPEND ARCH_FLAGS -mavxvnni)
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
message(STATUS "PowerPC detected")
execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M)
string(FIND "${POWER10_M}" "POWER10" substring_index)
if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "")
set(substring_index -1)
endif()
if (${substring_index} GREATER_EQUAL 0)
list(APPEND ARCH_FLAGS -mcpu=power10)
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
else()
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
# TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
message(STATUS "loongarch64 detected")
list(APPEND ARCH_FLAGS -march=loongarch64)
if (GGML_LASX)
list(APPEND ARCH_FLAGS -mlasx)
endif()
if (GGML_LSX)
list(APPEND ARCH_FLAGS -mlsx)
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
message(STATUS "RISC-V detected")
if (GGML_RVV)
list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
endif()
else()
if (GGML_NATIVE)
list(APPEND ARCH_FLAGS -march=native)
endif()
if (GGML_F16C)
list(APPEND ARCH_FLAGS -mf16c)
endif()
if (GGML_FMA)
list(APPEND ARCH_FLAGS -mfma)
endif()
if (GGML_AVX)
list(APPEND ARCH_FLAGS -mavx)
endif()
if (GGML_AVX2)
list(APPEND ARCH_FLAGS -mavx2)
endif()
if (GGML_AVX_VNNI)
list(APPEND ARCH_FLAGS -mavxvnni)
endif()
if (GGML_AVX512)
list(APPEND ARCH_FLAGS -mavx512f)
list(APPEND ARCH_FLAGS -mavx512dq)
list(APPEND ARCH_FLAGS -mavx512bw)
endif()
if (GGML_AVX512_VBMI)
list(APPEND ARCH_FLAGS -mavx512vbmi)
endif()
if (GGML_AVX512_VNNI)
list(APPEND ARCH_FLAGS -mavx512vnni)
endif()
if (GGML_AVX512_BF16)
list(APPEND ARCH_FLAGS -mavx512bf16)
endif()
if (GGML_AMX_TILE)
list(APPEND ARCH_FLAGS -mamx-tile)
endif()
if (GGML_AMX_INT8)
list(APPEND ARCH_FLAGS -mamx-int8)
endif()
if (GGML_AMX_BF16)
list(APPEND ARCH_FLAGS -mamx-bf16)
endif()
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
message(STATUS "PowerPC detected")
execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M)
string(FIND "${POWER10_M}" "POWER10" substring_index)
if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "")
set(substring_index -1)
message(STATUS "Unknown architecture")
endif()
if (${substring_index} GREATER_EQUAL 0)
list(APPEND ARCH_FLAGS -mcpu=power10)
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
else()
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
# TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
if (GGML_CPU_AARCH64)
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_AARCH64)
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
message(STATUS "loongarch64 detected")
list(APPEND ARCH_FLAGS -march=loongarch64)
if (GGML_LASX)
list(APPEND ARCH_FLAGS -mlasx)
message(STATUS "Adding CPU backend variant ${GGML_CPU_NAME}: ${ARCH_FLAGS} ${ARCH_DEFINITIONS}")
target_sources(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_SOURCES})
target_compile_options(${GGML_CPU_NAME} PRIVATE ${ARCH_FLAGS})
target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS})
if (GGML_BACKEND_DL)
# The feature detection code is compiled as a separate target so that
# it can be built without the architecture flags
# Since multiple variants of the CPU backend may be included in the same
# build, using set_source_files_properties() to set the arch flags is not possible
set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats)
add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp)
target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS})
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
endif()
if (GGML_LSX)
list(APPEND ARCH_FLAGS -mlsx)
if (EMSCRIPTEN)
set_target_properties(${GGML_CPU_NAME} PROPERTIES COMPILE_FLAGS "-msimd128")
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
message(STATUS "RISC-V detected")
if (GGML_RVV)
list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
endif()
else()
message(STATUS "Unknown architecture")
endif()
if (GGML_CPU_AARCH64)
message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels")
target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_AARCH64)
endif()
target_sources(ggml-cpu PRIVATE ${GGML_CPU_SOURCES})
set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_OPTIONS "${ARCH_FLAGS}")
set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_DEFINITIONS "${ARCH_DEFINITIONS}")
# the feature detection code must be compiled without any architecture flags
target_sources(ggml-cpu PRIVATE cpu-feats-x86.cpp)
# target_sources(ggml-cpu PRIVATE cpu-feats-arm.cpp) # TODO: ARM feature detection
if (EMSCRIPTEN)
set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128")
endif()
endfunction()

View File

@ -5,6 +5,7 @@
#include "ggml-backend.h"
#include "ggml-impl.h"
#include "ggml-cpu.h"
#include "ggml-cpu-traits.h"
#if defined(__gnu_linux__)
#include <sys/syscall.h>
@ -17,31 +18,65 @@
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
// AMX tensor type traits
namespace ggml::cpu::amx {
class tensor_traits : public ggml::cpu::tensor_traits {
bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
size = ggml_backend_amx_desired_wsize(op);
return true;
}
bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
if (op->op == GGML_OP_MUL_MAT) {
ggml_backend_amx_mul_mat(params, op);
return true;
}
return false;
}
};
static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struct ggml_tensor *) {
static tensor_traits traits;
return &traits;
}
} // namespace ggml::cpu::amx
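As an illustration of the traits contract defined above, here is a minimal sketch (not the actual dispatch path) of how a caller could use a tensor's attached traits, assuming the work_size/compute_forward virtuals are publicly accessible on the ggml::cpu::tensor_traits interface:
// Minimal sketch, for illustration only: query the scratch size required by the
// traits object attached to a tensor, then let it execute the op.
static bool run_op_via_traits(ggml::cpu::tensor_traits * traits,
                              struct ggml_compute_params * params,
                              struct ggml_tensor * op) {
    size_t wsize = 0;
    if (traits->work_size(/*n_threads=*/1, op, wsize)) {
        // the caller is expected to provide at least `wsize` bytes of scratch space
    }
    return traits->compute_forward(params, op);  // true if the traits handled the op
}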
// AMX buffer interface
static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
free(buffer->context);
}
static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
return (void *)(buffer->context);
return (void *) (buffer->context);
}
static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
memset((char *)tensor->data + offset, value, size);
static void ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor);
GGML_UNUSED(buffer);
}
static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
uint8_t value, size_t offset, size_t size) {
memset((char *) tensor->data + offset, value, size);
GGML_UNUSED(buffer);
}
static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
const void * data, size_t offset, size_t size) {
if (qtype_has_amx_kernels(tensor->type)) {
GGML_LOG_DEBUG("%s: amx repack tensor %s of type %s\n", __func__, tensor->name, ggml_type_name(tensor->type));
ggml_backend_amx_convert_weight(tensor, data, offset, size);
} else {
memcpy((char *)tensor->data + offset, data, size);
memcpy((char *) tensor->data + offset, data, size);
}
GGML_UNUSED(buffer);
}
/*
// need to figure out what we need to do with buffer->extra.
static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
memcpy(data, (const char *)tensor->data + offset, size);
@ -62,6 +97,7 @@ static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
GGML_UNUSED(buffer);
}
*/
static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
memset(buffer->context, value, buffer->size);
@ -70,13 +106,13 @@ static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
/* .free_buffer = */ ggml_backend_amx_buffer_free_buffer,
/* .get_base = */ ggml_backend_amx_buffer_get_base,
/* .init_tensor = */ NULL, // no initialization required
/* .init_tensor = */ ggml_backend_amx_buffer_init_tensor,
/* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_amx_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_amx_buffer_get_tensor,
/* .cpy_tensor = */ ggml_backend_amx_buffer_cpy_tensor,
/* .get_tensor = */ nullptr,
/* .cpy_tensor = */ nullptr,
/* .clear = */ ggml_backend_amx_buffer_clear,
/* .reset = */ NULL,
/* .reset = */ nullptr,
};
static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
@ -86,7 +122,7 @@ static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_ty
}
static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
void * data = aligned_alloc(TENSOR_ALIGNMENT, size);
void * data = ggml_aligned_malloc(size);
if (data == NULL) {
fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
return NULL;
@ -101,18 +137,48 @@ static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_typ
GGML_UNUSED(buft);
}
static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
namespace ggml::cpu::amx {
class extra_buffer_type : ggml::cpu::extra_buffer_type {
bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
// handle only 2d gemm for now
auto is_contiguous_2d = [](const struct ggml_tensor * t) {
return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
};
if (op->op == GGML_OP_MUL_MAT && is_contiguous_2d(op->src[0]) && // src0 must be contiguous
is_contiguous_2d(op->src[1]) && // src1 must be contiguous
op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_amx_buffer_type() &&
op->ne[0] % (TILE_N * 2) == 0 && // out_features is 32x
(qtype_has_amx_kernels(op->src[0]->type) || (op->src[0]->type == GGML_TYPE_F16))) {
// src1 must be host buffer
if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
return false;
}
// src1 must be float32
if (op->src[1]->type == GGML_TYPE_F32) {
return true;
}
}
return false;
}
ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
if (op->op == GGML_OP_MUL_MAT && op->src[0]->buffer &&
op->src[0]->buffer->buft == ggml_backend_amx_buffer_type()) {
return (ggml::cpu::tensor_traits *) op->src[0]->extra;
}
return nullptr;
}
};
} // namespace ggml::cpu::amx
static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
return ggml_backend_amx_get_alloc_size(tensor);
GGML_UNUSED(buft);
}
static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
return false;
GGML_UNUSED(buft);
}
#define ARCH_GET_XCOMP_PERM 0x1022
#define ARCH_REQ_XCOMP_PERM 0x1023
#define XFEATURE_XTILECFG 17
@ -129,68 +195,26 @@ static bool ggml_amx_init() {
return true;
#endif
}
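The ARCH_*_XCOMP_PERM constants above belong to the Linux arch_prctl interface used to request AMX tile state from the kernel. A hedged sketch of the usual enablement sequence follows; XFEATURE_XTILEDATA (18) is an assumption here, since only XFEATURE_XTILECFG (17) appears in this excerpt:
#include <sys/syscall.h>
#include <unistd.h>

// Sketch of the typical Linux AMX permission request (illustrative, not the
// exact ggml_amx_init body): ask the kernel for XTILEDATA permission, then
// verify that the bit is set in the returned permission mask.
static bool request_amx_permission(void) {
#if defined(__gnu_linux__)
    const unsigned long xfeature_xtiledata = 18;  // assumed value, not shown above
    if (syscall(SYS_arch_prctl, 0x1023 /* ARCH_REQ_XCOMP_PERM */, xfeature_xtiledata) != 0) {
        return false;  // old kernel or permission denied
    }
    unsigned long bitmask = 0;
    if (syscall(SYS_arch_prctl, 0x1022 /* ARCH_GET_XCOMP_PERM */, &bitmask) != 0) {
        return false;
    }
    return (bitmask & (1UL << xfeature_xtiledata)) != 0;
#else
    return true;  // assume no explicit permission step is required elsewhere
#endif
}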
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
/* .iface = */ {
/* .get_name = */ ggml_backend_amx_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
/* .is_host = */ ggml_backend_amx_buffer_type_is_host,
},
/* .get_name = */ ggml_backend_amx_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment,
/* .get_max_size = */ nullptr, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
/* .is_host = */ nullptr,
},
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
/* .context = */ NULL,
/* .context = */ new ggml::cpu::amx::extra_buffer_type(),
};
if (!ggml_amx_init()) {
return NULL;
return nullptr;
}
return &ggml_backend_buffer_type_amx;
}
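A hedged usage sketch of the buffer type returned above, using the generic ggml-backend allocation API (the 16 MiB size is arbitrary):
// Illustrative only: allocate a weight buffer from the AMX buffer type.
// set_tensor on tensors placed in this buffer repacks AMX-capable quant types.
ggml_backend_buffer_type_t buft = ggml_backend_amx_buffer_type();
if (buft != nullptr) {
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 16u * 1024 * 1024);
    // ... place weight tensors in `buf` and upload their data ...
    ggml_backend_buffer_free(buf);
}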
bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft) {
return buft->iface.get_name == ggml_backend_amx_buffer_type_get_name;
}
bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op) {
// handle only 2d gemm for now
auto is_contiguous_2d = [](const struct ggml_tensor * t) {
return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
};
switch (op->op) {
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
return true;
case GGML_OP_MUL_MAT: {
const struct ggml_tensor * src0 = op->src[0];
const struct ggml_tensor * src1 = op->src[1];
const enum ggml_type type = src0->type;
const int64_t ne0 = op->ne[0];
// AMX kernels are enabled for Q4_0, Q4_1, Q8_0, F16
// Q4_K, Q5_K, Q6_K, IQ4_XS are enabled for QK_K = 256
bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16);
bool can_use_amx =
is_contiguous_2d(src0) && // src0 must be contiguous
is_contiguous_2d(src1) && // src1 must be contiguous
src1->type == GGML_TYPE_F32 && // src1 must be float32
has_amx_kernels && // with amx kernel impls
ne0 % (TILE_N * 2) == 0; // out_features is 32x
return can_use_amx;
}
default:
return false;
}
}
#endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__)
#endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__)

View File

@ -1,20 +1,8 @@
#include "ggml-backend.h"
#include "ggml-cpu-impl.h"
#ifdef __cplusplus
extern "C" {
#endif
// GGML internal header
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft);
bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op);
void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst);
#endif
#ifdef __cplusplus
}
#endif

View File

@ -7,7 +7,7 @@
#include <memory>
#include <type_traits>
#if defined(_OPENMP)
#if defined(GGML_USE_OPENMP)
#include <omp.h>
#endif
@ -56,11 +56,11 @@ inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) {
}
template <typename func_t>
inline void parallel_for(int nth, int n, const func_t& f) {
#if defined(_OPENMP)
#pragma omp parallel num_threads(nth)
inline void parallel_for(int n, const func_t& f) {
#if defined(GGML_USE_OPENMP)
#pragma omp parallel
{
//int nth = omp_get_num_threads();
int nth = omp_get_num_threads();
int ith = omp_get_thread_num();
int tbegin, tend;
balance211(n, nth, ith, tbegin, tend);
@ -68,8 +68,6 @@ inline void parallel_for(int nth, int n, const func_t& f) {
}
#else
f(0, n);
GGML_UNUSED(nth);
#endif
}
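parallel_for above relies on balance211 to split n work items across nth threads; a minimal sketch of such an even partitioning (not necessarily the exact balance211 used by this file):
// Each thread `ith` of `nth` receives the half-open range [n_start, n_end).
template <typename T>
inline void split_evenly(T n, T nth, T ith, T & n_start, T & n_end) {
    T chunk = (n + nth - 1) / nth;     // ceil(n / nth) items per thread
    n_start = ith * chunk;
    if (n_start > n) n_start = n;      // trailing threads may get an empty range
    n_end = n_start + chunk;
    if (n_end > n) n_end = n;
}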
@ -91,10 +89,3 @@ inline bool qtype_has_amx_kernels(const enum ggml_type type) {
(type == GGML_TYPE_Q6_K) ||
(type == GGML_TYPE_IQ4_XS);
}
// ggml backend context
struct ggml_backend_amx_context {
int n_threads = GGML_DEFAULT_N_THREADS;
std::unique_ptr<char[]> work_data;
size_t work_size = 0;
};

View File

@ -18,10 +18,6 @@
#include <unistd.h>
#endif
#if defined(_OPENMP)
#include <omp.h>
#endif
#if (defined(_WIN32) || defined(_WIN64))
#define RESTRICT __restrict
#else
@ -1382,13 +1378,13 @@ struct tinygemm_kernel_avx<float, ggml_fp16_t, float, BLOCK_M, BLOCK_N, BLOCK_K>
#define PACKED_INDEX(n, k, KB, tile_size) (n * KB + k) * tile_size
template<typename TB, int BLOCK_K>
void convert_B_packed_format(void * RESTRICT packed_B, const TB * RESTRICT B, int N, int K, int n_threads) {
void convert_B_packed_format(void * RESTRICT packed_B, const TB * RESTRICT B, int N, int K) {
const int NB = N / TILE_N;
const int KB = K / BLOCK_K;
const int TILE_SIZE = get_tile_size<TB>();
// parallel on NB should be enough
parallel_for(n_threads, NB, [&](int begin, int end) {
parallel_for(NB, [&](int begin, int end) {
for (int n = begin; n < end; ++n) {
for (int k = 0; k < KB; ++k) {
int n0 = n * TILE_N;
@ -2334,15 +2330,8 @@ void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * d
const int K = tensor->ne[0]; // ne0: in_features
const int N = tensor->ne[1]; // ne1: out_features
#if defined(_OPENMP)
// the buffer ctx is not initialized when .set_tensor is called
int n_threads = omp_get_num_threads();
#else
int n_threads = 1;
#endif
GGML_DISPATCH_QTYPES(TYPE, [&] {
convert_B_packed_format<type, blck_size>((void *)((char *)tensor->data + offset), (const type *)data, N, K, n_threads);
convert_B_packed_format<type, blck_size>((void *)((char *)tensor->data + offset), (const type *)data, N, K);
});
}
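A small worked example of the PACKED_INDEX layout used by convert_B_packed_format above: tiles are stored row-major over the (NB, KB) tile grid, each occupying tile_size bytes (the concrete numbers below are illustrative):
// With KB = 8 tiles per row and tile_size = 1024 bytes, tile (n = 2, k = 3)
// starts at (2 * 8 + 3) * 1024 = 19456 bytes into packed_B.
static size_t packed_tile_offset(int n, int k, int KB, size_t tile_size) {
    return (size_t) (n * KB + k) * tile_size;   // same arithmetic as PACKED_INDEX
}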

View File

@ -1,16 +1,10 @@
#pragma once
#include "common.h"
#ifdef __cplusplus
extern "C" {
#endif
size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst);
size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor);
void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
#ifdef __cplusplus
}
#endif

View File

@ -1,4 +1,3 @@
#include "ggml-cpu.h"
#include "ggml-backend-impl.h"
#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
@ -13,6 +12,7 @@
#include <array>
#include <string>
// ref: https://cdrdv2-public.intel.com/782156/325383-sdm-vol-2abcd.pdf
struct cpuid_x86 {
bool SSE3(void) { return f_1_ecx[0]; }
bool PCLMULQDQ(void) { return f_1_ecx[1]; }
@ -50,11 +50,15 @@ struct cpuid_x86 {
bool INVPCID(void) { return f_7_ebx[10]; }
bool RTM(void) { return is_intel && f_7_ebx[11]; }
bool AVX512F(void) { return f_7_ebx[16]; }
bool AVX512DQ(void) { return f_7_ebx[17]; }
bool RDSEED(void) { return f_7_ebx[18]; }
bool ADX(void) { return f_7_ebx[19]; }
bool AVX512PF(void) { return f_7_ebx[26]; }
bool AVX512ER(void) { return f_7_ebx[27]; }
bool AVX512CD(void) { return f_7_ebx[28]; }
bool AVX512BW(void) { return f_7_ebx[30]; }
bool AVX512VL(void) { return f_7_ebx[31]; }
bool SHA(void) { return f_7_ebx[29]; }
bool PREFETCHWT1(void) { return f_7_ecx[0]; }
@ -259,36 +263,57 @@ void test_x86_is() {
static int ggml_backend_cpu_x86_score() {
// FIXME: this does not check for OS support
cpuid_x86 is;
// if the CPU backend was built with any features not supported by the current CPU, it cannot be used
if (ggml_cpu_has_fma() && !is.FMA()) { return 0; }
if (ggml_cpu_has_f16c() && !is.F16C()) { return 0; }
if (ggml_cpu_has_ssse3() && !is.SSSE3()) { return 0; }
if (ggml_cpu_has_sse3() && !is.SSE3()) { return 0; }
if (ggml_cpu_has_avx() && !is.AVX()) { return 0; }
if (ggml_cpu_has_avx_vnni() && !is.AVX_VNNI()) { return 0; }
if (ggml_cpu_has_avx2() && !is.AVX2()) { return 0; }
if (ggml_cpu_has_avx512() && !is.AVX512F()) { return 0; }
if (ggml_cpu_has_avx512_vbmi() && !is.AVX512_VBMI()) { return 0; }
if (ggml_cpu_has_avx512_bf16() && !is.AVX512_BF16()) { return 0; }
if (ggml_cpu_has_avx512_vnni() && !is.AVX512_VNNI()) { return 0; }
if (ggml_cpu_has_amx_int8() && !is.AMX_INT8()) { return 0; }
// calculate a backend score based on the supported features
// more important features have a higher weight
int score = 0;
score += ggml_cpu_has_fma () * 1;
score += ggml_cpu_has_f16c () * 1<<1;
score += ggml_cpu_has_ssse3 () * 1<<2;
score += ggml_cpu_has_sse3 () * 1<<3;
score += ggml_cpu_has_avx_vnni () * 1<<4;
score += ggml_cpu_has_avx () * 1<<5;
score += ggml_cpu_has_avx2 () * 1<<6;
score += ggml_cpu_has_avx512 () * 1<<7;
// score += ggml_cpu_has_avx512_vbmi() * 1<<8; // not used
score += ggml_cpu_has_avx512_bf16() * 1<<9;
score += ggml_cpu_has_avx512_vnni() * 1<<10;
score += ggml_cpu_has_amx_int8 () * 1<<11;
cpuid_x86 is;
#ifdef GGML_FMA
if (!is.FMA()) { return 0; }
score += 1;
#endif
#ifdef GGML_F16C
if (!is.F16C()) { return 0; }
score += 1<<1;
#endif
#ifdef GGML_SSE42
if (!is.SSE42()) { return 0; }
score += 1<<2;
#endif
#ifdef GGML_AVX
if (!is.AVX()) { return 0; }
score += 1<<4;
#endif
#ifdef GGML_AVX2
if (!is.AVX2()) { return 0; }
score += 1<<5;
#endif
#ifdef GGML_AVX_VNNI
if (!is.AVX_VNNI()) { return 0; }
score += 1<<6;
#endif
#ifdef GGML_AVX512
if (!is.AVX512F()) { return 0; }
if (!is.AVX512CD()) { return 0; }
if (!is.AVX512VL()) { return 0; }
if (!is.AVX512DQ()) { return 0; }
if (!is.AVX512BW()) { return 0; }
score += 1<<7;
#endif
#ifdef GGML_AVX512_VBMI
if (!is.AVX512_VBMI()) { return 0; }
score += 1<<8;
#endif
#ifdef GGML_AVX512_BF16
if (!is.AVX512_BF16()) { return 0; }
score += 1<<9;
#endif
#ifdef GGML_AVX512_VNNI
if (!is.AVX512_VNNI()) { return 0; }
score += 1<<10;
#endif
#ifdef GGML_AMX_INT8
if (!is.AMX_INT8()) { return 0; }
score += 1<<11;
#endif
return score;
}
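A worked comparison of the scoring above (weights taken directly from the code; which GGML_* macros are defined depends on how each backend variant was built):
// avx512 variant (GGML_AVX, GGML_AVX2, GGML_AVX512): (1<<4) + (1<<5) + (1<<7) = 176
// avx2-only variant (GGML_AVX, GGML_AVX2):           (1<<4) + (1<<5)          =  48
// Any feature the CPU lacks forces the score to 0, so the loader picks the
// highest-scoring variant whose requirements are all satisfied.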

View File

@ -1,20 +1,57 @@
#define GGML_COMMON_IMPL_C
#define GGML_COMMON_IMPL_CPP
#define GGML_COMMON_DECL_CPP
#include "ggml-common.h"
#include "ggml-backend-impl.h"
#include "ggml-quants.h"
#include "ggml-impl.h"
#include "ggml-cpu.h"
#include "ggml-cpu/ggml-cpu-impl.h"
#include "ggml-cpu-impl.h"
#include "ggml-cpu-traits.h"
#include <math.h>
#include <string.h>
#include <assert.h>
#include <float.h>
#include <stdlib.h> // for qsort
#include <stdio.h> // for GGML_ASSERT
#include <cmath>
#include <cstring>
#include <cassert>
#include <cfloat>
#include <cstdlib> // for qsort
#include <cstdio> // for GGML_ASSERT
#include "ggml-cpu-aarch64.h"
// TODO: move to include file?
template <int K> constexpr int QK_0() {
if constexpr (K == 4) {
return QK4_0;
}
if constexpr (K == 8) {
return QK8_0;
}
return -1;
}
template <int K, int N> struct block {
ggml_half d[N]; // deltas for N qK_0 blocks
int8_t qs[(QK_0<K>() * N * K) / 8]; // quants for N qK_0 blocks
};
// compile-time size/padding checks
static_assert(sizeof(block<4, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 2, "wrong block<4,4> size/padding");
static_assert(sizeof(block<4, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<4,8> size/padding");
static_assert(sizeof(block<8, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<8,4> size/padding");
static_assert(sizeof(block<8, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong block<8,8> size/padding");
using block_q4_0x4 = block<4, 4>;
using block_q4_0x8 = block<4, 8>;
using block_q8_0x4 = block<8, 4>;
using block_q8_0x8 = block<8, 8>;
struct block_iq4_nlx4 {
ggml_half d[4]; // deltas for 4 iq4_nl blocks
uint8_t qs[QK4_NL * 2]; // nibbles / quants for 4 iq4_nl blocks
};
static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
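A worked size check for the interleaved blocks defined above, assuming QK4_0 == QK8_0 == 32 and a 2-byte ggml_half:
// block<4, 4> packs four Q4_0 blocks:
//   d : 4 * sizeof(ggml_half)       =  8 bytes
//   qs: (QK_0<4>() * 4 * 4) / 8     = 64 bytes of packed nibbles
//   total                           = 72 bytes == 4 * sizeof(ggml_half) + QK8_0 * 2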
#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Woverlength-strings"
#elif defined(_MSC_VER)
@ -185,12 +222,12 @@ static inline __m256i mul_sum_i8_pairs_int32x8(const __m256i x, const __m256i y)
static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
static void quantize_q8_0_4x4(const float * restrict x, void * restrict vy, int64_t k) {
static void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
assert(QK8_0 == 32);
assert(k % QK8_0 == 0);
const int nb = k / QK8_0;
block_q8_0x4 * restrict y = (block_q8_0x4 *) vy;
block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
#if defined(__ARM_NEON)
float32x4_t srcv[4][8];
@ -279,12 +316,12 @@ static void quantize_q8_0_4x4(const float * restrict x, void * restrict vy, int6
#endif
}
static void quantize_q8_0_4x8(const float * restrict x, void * restrict vy, int64_t k) {
static void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
assert(QK8_0 == 32);
assert(k % QK8_0 == 0);
const int nb = k / QK8_0;
block_q8_0x4 * restrict y = (block_q8_0x4 *) vy;
block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
#if defined(__ARM_NEON)
float32x4_t srcv[4][8];
@ -494,7 +531,7 @@ static void quantize_q8_0_4x8(const float * restrict x, void * restrict vy, int6
#endif
}
void quantize_mat_q8_0(const float * restrict x, void * restrict vy, int64_t nrow, int64_t n_per_row, int64_t blck_size_interleave) {
static void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row, int64_t blck_size_interleave) {
assert(nrow == 4);
UNUSED(nrow);
if (blck_size_interleave == 4) {
@ -506,7 +543,7 @@ void quantize_mat_q8_0(const float * restrict x, void * restrict vy, int64_t nro
}
}
void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
static void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 4;
@ -591,7 +628,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
}
}
void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
static void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 4;
@ -701,7 +738,7 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
}
}
void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
static void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 8;
@ -974,7 +1011,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
}
}
void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
static void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 4;
@ -1070,7 +1107,7 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void
}
}
void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
static void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 4;
@ -1586,7 +1623,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
}
}
void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
static void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 4;
@ -2040,7 +2077,7 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
}
}
void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
static void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 8;
@ -2560,31 +2597,31 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
const __m512i rhs_mat_2367ABEF_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4bexpanded)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31) BA(24-31) BB(24-31) BE(24-31) BF(24-31)
// Shuffle pattern one - right side input
const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3)
const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3)
const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, (_MM_PERM_ENUM)136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3)
const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, (_MM_PERM_ENUM)136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3)
const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11)
const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11)
const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, (_MM_PERM_ENUM)136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11)
const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, (_MM_PERM_ENUM)136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11)
const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19)
const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19)
const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, (_MM_PERM_ENUM)136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19)
const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, (_MM_PERM_ENUM)136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19)
const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27)
const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27)
const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, (_MM_PERM_ENUM)136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27)
const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, (_MM_PERM_ENUM)136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27)
// Shuffle pattern two - right side input
const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7)
const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7)
const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, (_MM_PERM_ENUM)221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7)
const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, (_MM_PERM_ENUM)221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7)
const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15)
const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15)
const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, (_MM_PERM_ENUM)221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15)
const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, (_MM_PERM_ENUM)221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15)
const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23)
const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23)
const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, (_MM_PERM_ENUM)221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23)
const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, (_MM_PERM_ENUM)221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23)
const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31)
const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31)
const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, (_MM_PERM_ENUM)221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31)
const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, (_MM_PERM_ENUM)221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31)
// Scale values - Load the weight scale values of two block_q4_0x8
const __m512 col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d);
@ -2618,31 +2655,31 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
// Shuffle pattern one - left side input
const __m512i lhs_mat_01_0_sp1 = _mm512_shuffle_epi32(lhs_mat_01_0, 160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
const __m512i lhs_mat_23_0_sp1 = _mm512_shuffle_epi32(lhs_mat_23_0, 160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
const __m512i lhs_mat_01_0_sp1 = _mm512_shuffle_epi32(lhs_mat_01_0, (_MM_PERM_ENUM)160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
const __m512i lhs_mat_23_0_sp1 = _mm512_shuffle_epi32(lhs_mat_23_0, (_MM_PERM_ENUM)160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
const __m512i lhs_mat_01_1_sp1 = _mm512_shuffle_epi32(lhs_mat_01_1, 160); //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
const __m512i lhs_mat_23_1_sp1 = _mm512_shuffle_epi32(lhs_mat_23_1, 160); //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
const __m512i lhs_mat_01_1_sp1 = _mm512_shuffle_epi32(lhs_mat_01_1, (_MM_PERM_ENUM)160); //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
const __m512i lhs_mat_23_1_sp1 = _mm512_shuffle_epi32(lhs_mat_23_1, (_MM_PERM_ENUM)160); //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
const __m512i lhs_mat_01_2_sp1 = _mm512_shuffle_epi32(lhs_mat_01_2, 160); //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
const __m512i lhs_mat_23_2_sp1 = _mm512_shuffle_epi32(lhs_mat_23_2, 160); //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
const __m512i lhs_mat_01_2_sp1 = _mm512_shuffle_epi32(lhs_mat_01_2, (_MM_PERM_ENUM)160); //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
const __m512i lhs_mat_23_2_sp1 = _mm512_shuffle_epi32(lhs_mat_23_2, (_MM_PERM_ENUM)160); //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
const __m512i lhs_mat_01_3_sp1 = _mm512_shuffle_epi32(lhs_mat_01_3, 160); //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
const __m512i lhs_mat_23_3_sp1 = _mm512_shuffle_epi32(lhs_mat_23_3, 160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
const __m512i lhs_mat_01_3_sp1 = _mm512_shuffle_epi32(lhs_mat_01_3, (_MM_PERM_ENUM)160); //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
const __m512i lhs_mat_23_3_sp1 = _mm512_shuffle_epi32(lhs_mat_23_3, (_MM_PERM_ENUM)160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
// Shuffle pattern two - left side input
const __m512i lhs_mat_01_0_sp2 = _mm512_shuffle_epi32(lhs_mat_01_0, 245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
const __m512i lhs_mat_23_0_sp2 = _mm512_shuffle_epi32(lhs_mat_23_0, 245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
const __m512i lhs_mat_01_0_sp2 = _mm512_shuffle_epi32(lhs_mat_01_0, (_MM_PERM_ENUM)245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
const __m512i lhs_mat_23_0_sp2 = _mm512_shuffle_epi32(lhs_mat_23_0, (_MM_PERM_ENUM)245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
const __m512i lhs_mat_01_1_sp2 = _mm512_shuffle_epi32(lhs_mat_01_1, 245); //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
const __m512i lhs_mat_23_1_sp2 = _mm512_shuffle_epi32(lhs_mat_23_1, 245); //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
const __m512i lhs_mat_01_1_sp2 = _mm512_shuffle_epi32(lhs_mat_01_1, (_MM_PERM_ENUM)245); //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
const __m512i lhs_mat_23_1_sp2 = _mm512_shuffle_epi32(lhs_mat_23_1, (_MM_PERM_ENUM)245); //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
const __m512i lhs_mat_01_2_sp2 = _mm512_shuffle_epi32(lhs_mat_01_2, 245); //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
const __m512i lhs_mat_23_2_sp2 = _mm512_shuffle_epi32(lhs_mat_23_2, 245); //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
const __m512i lhs_mat_01_2_sp2 = _mm512_shuffle_epi32(lhs_mat_01_2, (_MM_PERM_ENUM)245); //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
const __m512i lhs_mat_23_2_sp2 = _mm512_shuffle_epi32(lhs_mat_23_2, (_MM_PERM_ENUM)245); //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
const __m512i lhs_mat_01_3_sp2 = _mm512_shuffle_epi32(lhs_mat_01_3, 245); //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
const __m512i lhs_mat_23_3_sp2 = _mm512_shuffle_epi32(lhs_mat_23_3, 245); //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
const __m512i lhs_mat_01_3_sp2 = _mm512_shuffle_epi32(lhs_mat_01_3, (_MM_PERM_ENUM)245); //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
const __m512i lhs_mat_23_3_sp2 = _mm512_shuffle_epi32(lhs_mat_23_3, (_MM_PERM_ENUM)245); //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
// The values arranged in the shuffle patterns are combined with a dot product within each 32-bit lane, i.e. corresponding bytes are multiplied and their products accumulated into a 32-bit integer per lane
// Resembles the MMLAs into 2x2 matrices of the ARM version
@ -2671,10 +2708,10 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
// Straighten out to make 4 row vectors
__m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, 78));
__m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01);
__m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, 78));
__m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11);
__m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, (_MM_PERM_ENUM)78));
__m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, (_MM_PERM_ENUM)78), iacc_mat_01);
__m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, (_MM_PERM_ENUM)78));
__m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, (_MM_PERM_ENUM)78), iacc_mat_11);
// Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptrs[rp][b].d), loadMask), 68);
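For reference, the _mm512_shuffle_epi32 / _mm_shuffle_epi32 immediates used in this kernel select 32-bit elements with two bits per destination dword, applied independently within each 128-bit lane; a short decoding:
// 136 = 0x88 -> {0, 2, 0, 2}   even dwords, duplicated
// 221 = 0xDD -> {1, 3, 1, 3}   odd dwords, duplicated
// 160 = 0xA0 -> {0, 0, 2, 2}
// 245 = 0xF5 -> {1, 1, 3, 3}
//  78 = 0x4E -> {2, 3, 0, 1}   swap the two 64-bit halves
//  68 = 0x44 -> {0, 1, 0, 1}   broadcast the low 64 bits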
@ -2753,31 +2790,31 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
const __m512i rhs_mat_2367ABEF_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4bexpanded)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31) BA(24-31) BB(24-31) BE(24-31) BF(24-31)
// Shuffle pattern one - right side input
const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3)
const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3)
const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, (_MM_PERM_ENUM)136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3)
const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, (_MM_PERM_ENUM)136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3)
const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11)
const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11)
const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, (_MM_PERM_ENUM)136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11)
const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, (_MM_PERM_ENUM)136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11)
const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19)
const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19)
const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, (_MM_PERM_ENUM)136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19)
const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, (_MM_PERM_ENUM)136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19)
const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27)
const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27)
const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, (_MM_PERM_ENUM)136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27)
const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, (_MM_PERM_ENUM)136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27)
// Shuffle pattern two - right side input
const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7)
const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7)
const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, (_MM_PERM_ENUM)221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7)
const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, (_MM_PERM_ENUM)221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7)
const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15)
const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15)
const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, (_MM_PERM_ENUM)221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15)
const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, (_MM_PERM_ENUM)221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15)
const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23)
const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23)
const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, (_MM_PERM_ENUM)221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23)
const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, (_MM_PERM_ENUM)221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23)
const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31)
const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31)
const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, (_MM_PERM_ENUM)221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31)
const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, (_MM_PERM_ENUM)221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31)
// Scale values - Load the weight scale values of two block_q4_0x8
@ -2809,31 +2846,31 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
// Shuffle pattern one - left side input
const __m512i lhs_mat_01_0_sp1 = _mm512_shuffle_epi32(lhs_mat_01_0, 160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
const __m512i lhs_mat_23_0_sp1 = _mm512_shuffle_epi32(lhs_mat_23_0, 160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
const __m512i lhs_mat_01_0_sp1 = _mm512_shuffle_epi32(lhs_mat_01_0, (_MM_PERM_ENUM)160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
const __m512i lhs_mat_23_0_sp1 = _mm512_shuffle_epi32(lhs_mat_23_0, (_MM_PERM_ENUM)160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
const __m512i lhs_mat_01_1_sp1 = _mm512_shuffle_epi32(lhs_mat_01_1, 160); //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
const __m512i lhs_mat_23_1_sp1 = _mm512_shuffle_epi32(lhs_mat_23_1, 160); //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
const __m512i lhs_mat_01_1_sp1 = _mm512_shuffle_epi32(lhs_mat_01_1, (_MM_PERM_ENUM)160); //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
const __m512i lhs_mat_23_1_sp1 = _mm512_shuffle_epi32(lhs_mat_23_1, (_MM_PERM_ENUM)160); //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
const __m512i lhs_mat_01_2_sp1 = _mm512_shuffle_epi32(lhs_mat_01_2, 160); //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
const __m512i lhs_mat_23_2_sp1 = _mm512_shuffle_epi32(lhs_mat_23_2, 160); //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
const __m512i lhs_mat_01_2_sp1 = _mm512_shuffle_epi32(lhs_mat_01_2, (_MM_PERM_ENUM)160); //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
const __m512i lhs_mat_23_2_sp1 = _mm512_shuffle_epi32(lhs_mat_23_2, (_MM_PERM_ENUM)160); //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
const __m512i lhs_mat_01_3_sp1 = _mm512_shuffle_epi32(lhs_mat_01_3, 160); //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
const __m512i lhs_mat_23_3_sp1 = _mm512_shuffle_epi32(lhs_mat_23_3, 160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
const __m512i lhs_mat_01_3_sp1 = _mm512_shuffle_epi32(lhs_mat_01_3, (_MM_PERM_ENUM)160); //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
const __m512i lhs_mat_23_3_sp1 = _mm512_shuffle_epi32(lhs_mat_23_3, (_MM_PERM_ENUM)160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
// Shuffle pattern two - left side input
const __m512i lhs_mat_01_0_sp2 = _mm512_shuffle_epi32(lhs_mat_01_0, 245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
const __m512i lhs_mat_23_0_sp2 = _mm512_shuffle_epi32(lhs_mat_23_0, 245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
const __m512i lhs_mat_01_0_sp2 = _mm512_shuffle_epi32(lhs_mat_01_0, (_MM_PERM_ENUM)245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
const __m512i lhs_mat_23_0_sp2 = _mm512_shuffle_epi32(lhs_mat_23_0, (_MM_PERM_ENUM)245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
const __m512i lhs_mat_01_1_sp2 = _mm512_shuffle_epi32(lhs_mat_01_1, 245); //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
const __m512i lhs_mat_23_1_sp2 = _mm512_shuffle_epi32(lhs_mat_23_1, 245); //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
const __m512i lhs_mat_01_1_sp2 = _mm512_shuffle_epi32(lhs_mat_01_1, (_MM_PERM_ENUM)245); //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
const __m512i lhs_mat_23_1_sp2 = _mm512_shuffle_epi32(lhs_mat_23_1, (_MM_PERM_ENUM)245); //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
const __m512i lhs_mat_01_2_sp2 = _mm512_shuffle_epi32(lhs_mat_01_2, 245); //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
const __m512i lhs_mat_23_2_sp2 = _mm512_shuffle_epi32(lhs_mat_23_2, 245); //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
const __m512i lhs_mat_01_2_sp2 = _mm512_shuffle_epi32(lhs_mat_01_2, (_MM_PERM_ENUM)245); //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
const __m512i lhs_mat_23_2_sp2 = _mm512_shuffle_epi32(lhs_mat_23_2, (_MM_PERM_ENUM)245); //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
const __m512i lhs_mat_01_3_sp2 = _mm512_shuffle_epi32(lhs_mat_01_3, 245); //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
const __m512i lhs_mat_23_3_sp2 = _mm512_shuffle_epi32(lhs_mat_23_3, 245); //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
const __m512i lhs_mat_01_3_sp2 = _mm512_shuffle_epi32(lhs_mat_01_3, (_MM_PERM_ENUM)245); //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
const __m512i lhs_mat_23_3_sp2 = _mm512_shuffle_epi32(lhs_mat_23_3, (_MM_PERM_ENUM)245); //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
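// Note on the immediates: _mm512_shuffle_epi32 treats the 8-bit control as four 2-bit dword
// selectors applied per 128-bit lane, so 160 (0b10100000) selects (0,0,2,2), 245 (0b11110101)
// selects (1,1,3,3) and 221 (0b11011101) selects (1,3,1,3), giving the lane layouts listed above.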
// The values arranged in shuffle patterns are operated on with a dot product within each 32-bit lane, i.e. the corresponding bytes are multiplied and the products accumulated into a 32-bit integer per lane
// Resembles the MMLA instructions operating on 2x2 matrices in the ARM version
@ -2862,10 +2899,10 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
// Straighten out to make 4 row vectors
__m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, 78));
__m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01);
__m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, 78));
__m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11);
__m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, (_MM_PERM_ENUM)78));
__m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, (_MM_PERM_ENUM)78), iacc_mat_01);
__m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, (_MM_PERM_ENUM)78));
__m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, (_MM_PERM_ENUM)78), iacc_mat_11);
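// Here 78 (0b01001110) selects dwords (2,3,0,1), i.e. it swaps the two 64-bit halves of each
// 128-bit lane, and mask 0xCCCC takes dwords 2..3 of every lane from the second operand, so the
// blend stitches the 2x2 accumulator tiles back into full row vectors.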
// Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptr[b].d), loadMask), 68);
@ -3460,7 +3497,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
}
}
void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
static void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 4;
@ -3571,7 +3608,6 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void
}
}
// FIXME: this code is duplicated from ggml-aarch64.c
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
block_q4_0x4 out;
@ -3641,20 +3677,20 @@ static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_in
return out;
}
static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * restrict data, size_t data_size) {
static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
constexpr int nrows_interleaved = 4;
block_q4_0x4 * dst = (block_q4_0x4 *)t->data;
const block_q4_0 * src = (const block_q4_0 *)data;
block_q4_0 dst_tmp[4];
int nrow = t->ne[1]; // Number of rows
int nrows_interleaved = 4;
int nrow = ggml_nrows(t);
int nblocks = t->ne[0] / QK4_0;
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
return -1;
}
@ -3672,20 +3708,20 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block
GGML_UNUSED(data_size);
}
static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block, const void * restrict data, size_t data_size) {
static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
GGML_ASSERT(interleave_block == 8);
constexpr int nrows_interleaved = 8;
block_q4_0x8 * dst = (block_q4_0x8*)t->data;
const block_q4_0 * src = (const block_q4_0*) data;
block_q4_0 dst_tmp[8];
int nrow = t->ne[1]; // Number of rows
int nrows_interleaved = 8;
int nrow = ggml_nrows(t);
int nblocks = t->ne[0] / QK4_0;
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
return -1;
}
@ -3712,16 +3748,18 @@ static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_s
const int end = QK4_NL * 2 / blck_size_interleave;
if (blck_size_interleave == 8) {
for (int i = 0; i < end; ++i) {
int src_id = i % 4;
int src_offset = (i / 4) * blck_size_interleave;
int dst_offset = i * blck_size_interleave;
// TODO: this branch seems wrong
//if (blck_size_interleave == 8) {
// for (int i = 0; i < end; ++i) {
// int src_id = i % 4;
// int src_offset = (i / 4) * blck_size_interleave;
// int dst_offset = i * blck_size_interleave;
// Using memcpy to avoid unaligned memory accesses
memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
}
} else if (blck_size_interleave == 4) {
// // Using memcpy to avoid unaligned memory accesses
// memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
// }
//} else
if (blck_size_interleave == 4) {
for (int i = 0; i < end; ++i) {
int src_id = i % 4;
int src_offset = (i / 4) * blck_size_interleave;
@ -3736,20 +3774,21 @@ static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_s
return out;
}
static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * restrict data, size_t data_size) {
static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
//GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
GGML_ASSERT(interleave_block == 4);
block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data;
const block_iq4_nl * src = (const block_iq4_nl *)data;
block_iq4_nl dst_tmp[4];
int nrow = t->ne[1]; // Number of rows
int nrow = ggml_nrows(t);
int nrows_interleaved = 4;
int nblocks = t->ne[0] / QK4_0;
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
return -1;
}
@ -3767,57 +3806,457 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b
GGML_UNUSED(data_size);
}
// Prepare for optimized kernels if applicable
void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * restrict data, size_t data_size) {
if (cur->type == repack_type) {
memcpy(cur->data, data, data_size);
return;
}
namespace ggml::cpu::aarch64 {
// repack
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
int repack(struct ggml_tensor *, const void *, size_t);
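// Template parameters (as used by the specializations below): BLOC_TYPE is the source
// quantization block, INTER_SIZE is the interleave block size handed to the repack helpers,
// and NB_COLS is the number of rows interleaved per tile (e.g. <block_q4_0, 8, 4> maps to the
// 4x8 kernels).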
if (cur->type == GGML_TYPE_Q4_0) {
switch (repack_type) {
case GGML_TYPE_Q4_0_8_8:
repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size);
break;
case GGML_TYPE_Q4_0_4_8:
repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size);
break;
case GGML_TYPE_Q4_0_4_4:
repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size);
break;
default:
GGML_ABORT("Unsupported type");
}
} else if (cur->type == GGML_TYPE_IQ4_NL) {
switch (repack_type) {
case GGML_TYPE_IQ4_NL_4_4:
repack_iq4_nl_to_iq4_nl_4_bl(cur, 4, data, data_size);
break;
default:
GGML_ABORT("Unsupported type");
}
} else {
GGML_ABORT("Unsupported type");
}
// TODO: generalise.
template <> int repack<block_q4_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
return repack_q4_0_to_q4_0_4_bl(t, 4, data, data_size);
}
enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur) {
template <> int repack<block_q4_0, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
return repack_q4_0_to_q4_0_4_bl(t, 8, data, data_size);
}
template <> int repack<block_q4_0, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
return repack_q4_0_to_q4_0_8_bl(t, 8, data, data_size);
}
template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
}
// TODO: needs to be revisited
//template <> int repack<block_iq4_nl, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
// return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size);
//}
// gemv
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
void gemv(int, float *, size_t, const void *, const void *, int, int);
template <> void gemv<block_q4_0, 4, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemv_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}
template <> void gemv<block_q4_0, 8, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemv_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
}
template <> void gemv<block_q4_0, 8, 8>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
}
template <>
void gemv<block_iq4_nl, 4, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}
// gemm
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
void gemm(int, float *, size_t, const void *, const void *, int, int);
template <> void gemm<block_q4_0, 4, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemm_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}
template <> void gemm<block_q4_0, 8, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
}
template <> void gemm<block_q4_0, 8, 8>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
}
template <>
void gemm<block_iq4_nl, 4, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}
class tensor_traits_base : public ggml::cpu::tensor_traits {
public:
virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
};
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS> class tensor_traits : public tensor_traits_base {
bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
// not really a GGML_TYPE_Q8_0, but the same size.
switch (op->op) {
case GGML_OP_MUL_MAT:
size = ggml_row_size(GGML_TYPE_Q8_0, ggml_nelements(op->src[1]));
return true;
case GGML_OP_MUL_MAT_ID:
size = ggml_row_size(GGML_TYPE_Q8_0, ggml_nelements(op->src[1]));
size = GGML_PAD(size, sizeof(int64_t)); // + padding for next block.
size += sizeof(int64_t) * (1+op->src[0]->ne[2]) * op->src[1]->ne[2];
return true;
default:
// GGML_ABORT("fatal error");
break;
}
return false;
}
bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
switch (op->op) {
case GGML_OP_MUL_MAT:
forward_mul_mat(params, op);
return true;
case GGML_OP_MUL_MAT_ID:
forward_mul_mat_id(params, op);
return true;
default:
// GGML_ABORT("fatal error");
break;
}
return false;
}
void forward_mul_mat(ggml_compute_params * params, ggml_tensor * op) {
const ggml_tensor * src0 = op->src[0];
const ggml_tensor * src1 = op->src[1];
ggml_tensor * dst = op;
GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith;
const int nth = params->nth;
GGML_ASSERT(ne0 == ne01);
GGML_ASSERT(ne1 == ne11);
GGML_ASSERT(ne2 == ne12);
GGML_ASSERT(ne3 == ne13);
// dst cannot be transposed or permuted
GGML_ASSERT(nb0 == sizeof(float));
GGML_ASSERT(nb0 <= nb1);
GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_n_dims(op->src[0]) == 2);
// GGML_ASSERT(ggml_n_dims(op->src[1]) == 2);
char * wdata = static_cast<char *>(params->wdata);
const size_t nbw1 = ggml_row_size(GGML_TYPE_Q8_0, ne10);
assert(params->wsize >= nbw1 * ne11);
const ggml_from_float_t from_float = ggml_get_type_traits_cpu(GGML_TYPE_Q8_0)->from_float;
int64_t i11_processed = 0;
for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
quantize_mat_q8_0((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), 4, ne10,
INTER_SIZE);
}
i11_processed = ne11 - ne11 % 4;
for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
from_float((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10);
}
ggml_barrier(params->threadpool);
const void * src1_wdata = params->wdata;
const size_t src1_col_stride = ggml_row_size(GGML_TYPE_Q8_0, ne10);
int64_t src0_start = (ith * ne01) / nth;
int64_t src0_end = ((ith + 1) * ne01) / nth;
src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end;
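// Rounding example (hypothetical sizes): ne01 = 100, nth = 3, NB_COLS = 4 gives the per-thread
// row ranges [0,36), [36,68) and [68,100), i.e. every boundary is aligned up to a multiple of NB_COLS.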
if (src0_start >= src0_end) {
return;
}
// If there are more than three rows in src1, use gemm; otherwise, use gemv.
if (ne11 > 3) {
gemm<BLOC_TYPE, INTER_SIZE, NB_COLS>(ne00, (float *) ((char *) dst->data) + src0_start, ne01,
(const char *) src0->data + src0_start * nb01,
(const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
}
for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) {
gemv<BLOC_TYPE, INTER_SIZE, NB_COLS>(ne00, (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01,
(const char *) src0->data + src0_start * nb01,
(const char *) src1_wdata + (src1_col_stride * iter), 1,
src0_end - src0_start);
}
}
void forward_mul_mat_id(ggml_compute_params * params, ggml_tensor * op) {
const ggml_tensor * src0 = op->src[0];
const ggml_tensor * src1 = op->src[1];
const ggml_tensor * ids = op->src[2];
ggml_tensor * dst = op;
GGML_TENSOR_BINARY_OP_LOCALS
const int ith = params->ith;
const int nth = params->nth;
const ggml_from_float_t from_float = ggml_get_type_traits_cpu(GGML_TYPE_Q8_0)->from_float;
// we don't support permuted src0 or src1
GGML_ASSERT(nb00 == ggml_type_size(src0->type));
GGML_ASSERT(nb10 == ggml_type_size(src1->type));
// dst cannot be transposed or permuted
GGML_ASSERT(nb0 == sizeof(float));
GGML_ASSERT(nb0 <= nb1);
GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3);
GGML_ASSERT(ne03 == 1);
GGML_ASSERT(ne13 == 1);
GGML_ASSERT(ne3 == 1);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
// row groups
const int n_ids = ids->ne[0]; // n_expert_used
const int n_as = ne02; // n_expert
const size_t nbw1 = ggml_row_size(GGML_TYPE_Q8_0, ne10);
const size_t nbw2 = nbw1*ne11;
const size_t nbw3 = nbw2*ne12;
struct mmid_row_mapping {
int32_t i1;
int32_t i2;
};
GGML_ASSERT(params->wsize >= (GGML_PAD(nbw3, sizeof(int64_t)) + n_as * sizeof(int64_t) +
n_as * ne12 * sizeof(mmid_row_mapping)));
auto wdata = (char *) params->wdata;
auto wdata_src1_end = (char *) wdata + GGML_PAD(nbw3, sizeof(int64_t));
int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12]
// src1: float32 => block_q8_0
for (int64_t i12 = 0; i12 < ne12; ++i12) {
for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
from_float((float *)((char *) src1->data + i12 * nb12 + i11 * nb11),
(void *) (wdata + i12 * nbw2 + i11 * nbw1),
ne10);
}
}
#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id) * ne12 + (i1)]
if (ith == 0) {
// initialize matrix_row_counts
memset(matrix_row_counts, 0, n_as * sizeof(int64_t));
// group rows by src0 matrix
for (int32_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
for (int32_t id = 0; id < n_ids; ++id) {
const int32_t i02 =
*(const int32_t *) ((const char *) ids->data + iid1 * ids->nb[1] + id * ids->nb[0]);
GGML_ASSERT(i02 >= 0 && i02 < n_as);
MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = { id, iid1 };
matrix_row_counts[i02] += 1;
}
}
}
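// Example of the grouping (hypothetical values): with n_ids = 2 and ids = [[0,2],[1,2]],
// expert 0 receives row {i1=0,i2=0}, expert 1 receives {i1=0,i2=1}, and expert 2 receives
// {i1=1,i2=0} and {i1=1,i2=1}, so matrix_row_counts = {1,1,2}.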
ggml_barrier(params->threadpool);
// compute each matrix multiplication in sequence
for (int cur_a = 0; cur_a < n_as; ++cur_a) {
const int64_t cne1 = matrix_row_counts[cur_a];
if (cne1 == 0) {
continue;
}
auto src0_cur = (const char *) src0->data + cur_a*nb02;
//const int64_t nr0 = ne01; // src0 rows
const int64_t nr1 = cne1; // src1 rows
int64_t src0_cur_start = (ith * ne01) / nth;
int64_t src0_cur_end = ((ith + 1) * ne01) / nth;
src0_cur_start =
(src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start;
src0_cur_end = (src0_cur_end % NB_COLS) ? src0_cur_end + NB_COLS - (src0_cur_end % NB_COLS) : src0_cur_end;
if (src0_cur_start >= src0_cur_end) return;
for (int ir1 = 0; ir1 < nr1; ir1++) {
struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1);
const int id = row_mapping.i1; // selected expert index
const int64_t i11 = id % ne11;
const int64_t i12 = row_mapping.i2; // row index in src1
const int64_t i1 = id; // selected expert index
const int64_t i2 = i12; // row
auto src1_col = (const char *) wdata + (i11 * nbw1 + i12 * nbw2);
gemv<BLOC_TYPE, INTER_SIZE, NB_COLS>(
ne00, (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start,
ne01, src0_cur + src0_cur_start * nb01,
src1_col, 1, src0_cur_end - src0_cur_start);
}
}
#undef MMID_MATRIX_ROW
}
int repack(struct ggml_tensor * t, const void * data, size_t data_size) override {
GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type),
(int) NB_COLS, (int) INTER_SIZE);
return ggml::cpu::aarch64::repack<BLOC_TYPE, INTER_SIZE, NB_COLS>(t, data, data_size);
}
};
// instance for Q4
static const tensor_traits<block_q4_0, 4, 4> q4_0_4x4_q8_0;
static const tensor_traits<block_q4_0, 8, 4> q4_0_4x8_q8_0;
static const tensor_traits<block_q4_0, 8, 8> q4_0_8x8_q8_0;
// instance for IQ4
static const tensor_traits<block_iq4_nl, 4, 4> iq4_nl_4x4_q8_0;
} // namespace ggml::cpu::aarch64
static const ggml::cpu::tensor_traits * ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur) {
if (cur->type == GGML_TYPE_Q4_0) {
// TODO: enable for AVX2 - currently disabled due to bad gemv performance
if (/* ggml_cpu_has_avx2() || */ (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
return GGML_TYPE_Q4_0_8_8;
if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
if (cur->ne[1] % 8 == 0) {
return &ggml::cpu::aarch64::q4_0_8x8_q8_0;
}
}
if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
return GGML_TYPE_Q4_0_4_8;
if (cur->ne[1] % 4 == 0) {
return &ggml::cpu::aarch64::q4_0_4x8_q8_0;
}
}
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
return GGML_TYPE_Q4_0_4_4;
if (cur->ne[1] % 4 == 0) {
return &ggml::cpu::aarch64::q4_0_4x4_q8_0;
}
}
} else if (cur->type == GGML_TYPE_IQ4_NL) {
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
return GGML_TYPE_IQ4_NL_4_4;
if (cur->ne[1] % 4 == 0) {
return &ggml::cpu::aarch64::iq4_nl_4x4_q8_0;
}
}
}
return cur->type;
return nullptr;
}
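// Selection example (hypothetical): a 2-D Q4_0 weight with 4096 rows on a CPU with NEON and
// int8 matmul support picks q4_0_4x8_q8_0; when no repack variant applies, the nullptr return
// leaves the tensor in its original layout.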
static void ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
tensor->extra = (void *) const_cast<ggml::cpu::tensor_traits *>(ggml_aarch64_get_optimal_repack_type(tensor));
GGML_UNUSED(buffer);
}
static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
const void * data, size_t offset, size_t size) {
GGML_ASSERT(offset == 0);
GGML_ASSERT(size == ggml_nbytes(tensor));
auto tensor_traits = (ggml::cpu::aarch64::tensor_traits_base *) tensor->extra;
auto OK = tensor_traits->repack(tensor, data, size);
GGML_ASSERT(OK == 0);
GGML_UNUSED(buffer);
}
static const char * ggml_backend_cpu_aarch64_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
return "CPU_AARCH64";
GGML_UNUSED(buft);
}
static ggml_backend_buffer_t ggml_backend_cpu_aarch64_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
if (buffer == nullptr) {
return nullptr;
}
buffer->buft = buft;
buffer->iface.init_tensor = ggml_backend_cpu_aarch64_buffer_init_tensor;
buffer->iface.set_tensor = ggml_backend_cpu_aarch64_buffer_set_tensor;
return buffer;
}
static size_t ggml_backend_cpu_aarch64_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
return TENSOR_ALIGNMENT;
GGML_UNUSED(buft);
}
namespace ggml::cpu::aarch64 {
class extra_buffer_type : ggml::cpu::extra_buffer_type {
bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
if ( op->op == GGML_OP_MUL_MAT &&
op->src[0]->buffer &&
(ggml_n_dims(op->src[0]) == 2) &&
op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type() &&
ggml_aarch64_get_optimal_repack_type(op->src[0])
) {
if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
return false;
}
if (op->src[1]->type == GGML_TYPE_F32) {
return true;
}
//if (op->src[1]->type == GGML_TYPE_Q8_0) {
// return true;
//}
// may be possible if Q8_0 packed...
} else if (op->op == GGML_OP_MUL_MAT_ID
&& op->src[0]->buffer
&& (ggml_n_dims(op->src[0]) == 3)
&& op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type()
&& ggml_aarch64_get_optimal_repack_type(op->src[0])
) {
if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
return false;
}
if (op->src[1]->type == GGML_TYPE_F32) {
return true;
}
//if (op->src[1]->type == GGML_TYPE_Q8_0) {
// return true;
//}
}
return false;
}
ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID) {
if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type()) {
return (ggml::cpu::tensor_traits *) op->src[0]->extra;
}
}
return nullptr;
}
};
} // namespace ggml::cpu::aarch64
ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void) {
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_aarch64 = {
/* .iface = */ {
/* .get_name = */ ggml_backend_cpu_aarch64_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_cpu_aarch64_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_aarch64_buffer_type_get_alignment,
/* .get_max_size = */ nullptr, // defaults to SIZE_MAX
/* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes
/* .is_host = */ nullptr,
},
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
/* .context = */ new ggml::cpu::aarch64::extra_buffer_type(),
};
return &ggml_backend_cpu_buffer_type_aarch64;
}
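// Minimal usage sketch (assuming the public ggml-backend API; "ctx" holds the model weights):
//   ggml_backend_buffer_type_t buft = ggml_backend_cpu_aarch64_buffer_type();
//   ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); // init_tensor stores the traits in tensor->extra
//   ggml_backend_tensor_set(w_q4_0, data, 0, ggml_nbytes(w_q4_0));                   // set_tensor repacks the Q4_0 data on upload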

View File

@ -1,32 +1,8 @@
#pragma once
#include "ggml-cpu-traits.h"
#include "ggml.h"
// GGML internal header
#ifdef __cplusplus
extern "C" {
#endif
// Quantization
void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
// GEMV
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
// GEMM
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * data, size_t data_size);
enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur);
#ifdef __cplusplus
}
#endif
ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);

View File

@ -0,0 +1,55 @@
#ifdef GGML_USE_CPU_HBM
#include "ggml-backend.h"
#include "ggml-backend-impl.h"
#include "ggml-cpu.h"
#include "ggml-impl.h"
#include "ggml-cpu-hbm.h"
// buffer type HBM
#include <hbwmalloc.h>
static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
return "CPU_HBM";
GGML_UNUSED(buft);
}
static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
hbw_free(buffer->context);
}
static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
size_t size) {
void * ptr;
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
if (result != 0) {
GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
return NULL;
}
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
buffer->buft = buft;
buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
return buffer;
}
ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
/* .iface = */ {
/* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
/* .get_max_size = */ nullptr, // defaults to SIZE_MAX
/* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
},
/* .context = */ nullptr,
};
return &ggml_backend_cpu_buffer_type_hbm;
}
#endif

View File

@ -0,0 +1,8 @@
#pragma once
#include "ggml-backend.h"
#include "ggml.h"
// GGML CPU internal header
ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);

View File

@ -0,0 +1,36 @@
#include "ggml-cpu-traits.h"
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
namespace ggml::cpu {
tensor_traits::~tensor_traits() {}
extra_buffer_type::~extra_buffer_type() {}
} // namespace ggml::cpu
bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) {
for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
if (extra && extra->context) {
auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
auto tensor_traits = buf_extra->get_tensor_traits(op);
if (tensor_traits && tensor_traits->compute_forward(params, op)) {
return true;
}
}
}
return false;
}
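// The CPU backend calls this at the top of ggml_compute_forward (see further below): when a
// registered extra buffer type owns op->src[0], its tensor_traits computes the op instead of
// the generic path.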
bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size) {
for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
if (extra && extra->context) {
auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
auto tensor_traits = buf_extra->get_tensor_traits(op);
if (tensor_traits && tensor_traits->work_size(n_threads, op, *size)) {
return true;
}
}
}
return false;
}

View File

@ -0,0 +1,38 @@
#pragma once
#include "ggml-backend-impl.h"
#include "ggml-cpu-impl.h"
#include "ggml.h"
#ifdef __cplusplus
# include <vector>
extern "C" {
#endif
// return true if the op is handled by an extra "accelerator"
bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op);
bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size);
#ifdef __cplusplus
}
namespace ggml::cpu {
// register in tensor->extra
class tensor_traits {
public:
virtual ~tensor_traits();
virtual bool work_size(int n_threads, const struct ggml_tensor * op, size_t & size) = 0;
virtual bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) = 0;
};
class extra_buffer_type {
public:
virtual ~extra_buffer_type();
virtual bool supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) = 0;
virtual tensor_traits * get_tensor_traits(const struct ggml_tensor * op) = 0;
};
} // namespace ggml::cpu
// implemented in ggml-cpu.cpp.
std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type();
#endif

View File

@ -3,7 +3,7 @@
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-cpu-aarch64.h"
#include "ggml-cpu-traits.h"
#include "ggml-cpu-impl.h"
#include "ggml-cpu.h"
#include "ggml-impl.h"
@ -126,8 +126,7 @@ struct ggml_arm_arch_features_type {
#endif
#include <windows.h>
#if !defined(__clang__)
#if defined(_MSC_VER) && !defined(__clang__)
#define GGML_CACHE_ALIGN __declspec(align(GGML_CACHE_LINE))
typedef volatile LONG atomic_int;
@ -224,10 +223,6 @@ typedef void * thread_ret_t;
typedef pthread_t ggml_thread_t;
#ifdef GGML_USE_CPU_HBM
#include <hbwmalloc.h>
#endif
#if defined(__APPLE__)
#include <unistd.h>
#include <mach/mach.h>
@ -301,7 +296,6 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
},
[GGML_TYPE_Q8_0] = {
.from_float = quantize_row_q8_0,
.from_float_to_mat = quantize_mat_q8_0,
.vec_dot = ggml_vec_dot_q8_0_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0,
#if defined (__ARM_FEATURE_MATMUL_INT8)
@ -409,33 +403,6 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_BF16,
.nrows = 1,
},
[GGML_TYPE_Q4_0_4_4] = {
.from_float = NULL,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
.ncols = 4,
.gemv = ggml_gemv_q4_0_4x4_q8_0,
.gemm = ggml_gemm_q4_0_4x4_q8_0,
},
[GGML_TYPE_Q4_0_4_8] = {
.from_float = NULL,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
.ncols = 4,
.gemv = ggml_gemv_q4_0_4x8_q8_0,
.gemm = ggml_gemm_q4_0_4x8_q8_0,
},
[GGML_TYPE_Q4_0_8_8] = {
.from_float = NULL,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
.ncols = 8,
.gemv = ggml_gemv_q4_0_8x8_q8_0,
.gemm = ggml_gemm_q4_0_8x8_q8_0,
},
[GGML_TYPE_TQ1_0] = {
.from_float = quantize_row_tq1_0,
.vec_dot = ggml_vec_dot_tq1_0_q8_K,
@ -448,15 +415,6 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_IQ4_NL_4_4] = {
.from_float = NULL,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
.ncols = 4,
.gemv = ggml_gemv_iq4_nl_4x4_q8_0,
.gemm = ggml_gemm_iq4_nl_4x4_q8_0,
},
};
const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
@ -496,21 +454,21 @@ const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type
#define GGML_F32x4_ADD vaddq_f32
#define GGML_F32x4_MUL vmulq_f32
#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
#define GGML_F32x4_REDUCE(res, x) \
{ \
int offset = GGML_F32_ARR >> 1; \
for (int i = 0; i < offset; ++i) { \
(x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
(x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
(x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
} \
(res) = GGML_F32x4_REDUCE_ONE((x)[0]); \
#define GGML_F32x4_REDUCE(res, x) \
{ \
int offset = GGML_F32_ARR >> 1; \
for (int i = 0; i < offset; ++i) { \
(x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
(x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
(x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
} \
(res) = (ggml_float) GGML_F32x4_REDUCE_ONE((x)[0]); \
}
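// Pairwise tree reduction: each pass folds the upper half of the accumulator array into the
// lower half (up to three passes, covering GGML_F32_ARR up to 8), then the four lanes of x[0]
// are summed horizontally and widened to ggml_float.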
#define GGML_F32_VEC GGML_F32x4
@ -756,7 +714,7 @@ do { \
#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
#else
static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
float tmp[8];
for (int i = 0; i < 8; i++) {
@ -1374,7 +1332,10 @@ struct ggml_compute_state {
inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
@ -2425,7 +2386,7 @@ bool ggml_is_numa(void) {
#endif
#if !defined(HWCAP2_I8MM)
#define HWCAP2_I8MM 0
#define HWCAP2_I8MM (1 << 13)
#endif
static void ggml_init_arm_arch_features(void) {
@ -2434,7 +2395,7 @@ static void ggml_init_arm_arch_features(void) {
uint32_t hwcap2 = getauxval(AT_HWCAP2);
ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
ggml_arm_arch_features.has_dotprod = !!(hwcap && HWCAP_ASIMDDP);
ggml_arm_arch_features.has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE);
@ -4506,9 +4467,6 @@ static void ggml_compute_forward_add(
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
{
ggml_compute_forward_add_q_f32(params, dst);
} break;
@ -4886,9 +4844,6 @@ static void ggml_compute_forward_add1(
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
{
ggml_compute_forward_add1_q_f32(params, dst);
} break;
@ -5016,9 +4971,6 @@ static void ggml_compute_forward_acc(
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
default:
{
GGML_ABORT("fatal error");
@ -7434,27 +7386,9 @@ static void ggml_compute_forward_mul_mat(
const int ith = params->ith;
const int nth = params->nth;
enum ggml_type type = src0->type;
if (src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
type = (enum ggml_type)(intptr_t)src0->extra;
}
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
if (src0->buffer && ggml_backend_amx_buft_is_amx(src0->buffer->buft)) {
ggml_backend_amx_mul_mat(params, dst);
return;
}
#endif
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
enum ggml_type const vec_dot_type = type_traits_cpu[src0->type].vec_dot_type;
ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
ggml_from_float_to_mat_t const from_float_to_mat = type_traits_cpu[vec_dot_type].from_float_to_mat;
int64_t const vec_dot_num_rows = type_traits_cpu[type].nrows;
int64_t const matmul_num_cols = type_traits_cpu[type].ncols;
int64_t const blck_size_interleave = ggml_get_type_traits(type)->blck_size_interleave;
ggml_gemv_t const gemv = type_traits_cpu[type].gemv;
ggml_gemm_t const gemm = type_traits_cpu[type].gemm;
int64_t const vec_dot_num_rows = type_traits_cpu[src0->type].nrows;
GGML_ASSERT(ne0 == ne01);
GGML_ASSERT(ne1 == ne11);
@ -7462,7 +7396,7 @@ static void ggml_compute_forward_mul_mat(
GGML_ASSERT(ne3 == ne13);
// we don't support permuted src0 or src1
GGML_ASSERT(nb00 == ggml_type_size(type));
GGML_ASSERT(nb00 == ggml_type_size(src0->type));
GGML_ASSERT(nb10 == ggml_type_size(src1->type));
// dst cannot be transposed or permuted
@ -7474,6 +7408,7 @@ static void ggml_compute_forward_mul_mat(
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
// TODO: extract to "extra_op"
#if GGML_USE_LLAMAFILE
// broadcast factors
const int64_t r2 = ne12 / ne02;
@ -7484,15 +7419,15 @@ static void ggml_compute_forward_mul_mat(
if (src1_cont) {
for (int64_t i13 = 0; i13 < ne13; i13++)
for (int64_t i12 = 0; i12 < ne12; i12++)
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type),
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
nb01/ggml_type_size(type),
nb01/ggml_type_size(src0->type),
(const char *)src1->data + i12*nb12 + i13*nb13,
nb11/ggml_type_size(src1->type),
(char *)dst->data + i12*nb2 + i13*nb3,
nb1/ggml_type_size(dst->type),
ith, nth,
type,
src0->type,
src1->type,
dst->type))
goto UseGgmlGemm1;
@ -7513,19 +7448,10 @@ UseGgmlGemm1:;
for (int64_t i13 = 0; i13 < ne13; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) {
int64_t i11_processed = 0;
if ((ggml_n_dims(src1) == 2) && from_float_to_mat && gemm) {
for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
4, ne10, blck_size_interleave);
}
i11_processed = ne11 - ne11 % 4;
}
for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
ne10);
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
ne10);
}
}
}
@ -7545,15 +7471,15 @@ UseGgmlGemm1:;
for (int64_t i13 = 0; i13 < ne13; i13++)
for (int64_t i12 = 0; i12 < ne12; i12++)
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type),
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
nb01/ggml_type_size(type),
nb01/ggml_type_size(src0->type),
(const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
row_size/ggml_type_size(vec_dot_type),
(char *)dst->data + i12*nb2 + i13*nb3,
nb1/ggml_type_size(dst->type),
ith, nth,
type,
src0->type,
vec_dot_type,
dst->type))
goto UseGgmlGemm2;
@ -7595,28 +7521,6 @@ UseGgmlGemm2:;
const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
if ((ggml_n_dims(src0) == 2) && gemv) {
const void * src1_wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
const size_t src1_col_stride = ggml_is_contiguous(src1) || src1->type != vec_dot_type ? ggml_row_size(vec_dot_type, ne10) : nb11;
int64_t src0_start = (ith * ne01) / nth;
int64_t src0_end = ((ith + 1) * ne01) / nth;
src0_start = (src0_start % matmul_num_cols) ? src0_start + matmul_num_cols - (src0_start % matmul_num_cols): src0_start;
src0_end = (src0_end % matmul_num_cols) ? src0_end + matmul_num_cols - (src0_end % matmul_num_cols): src0_end;
if (src0_start >= src0_end) return;
// If there are more than three rows in src1, use gemm; otherwise, use gemv.
if (gemm && (ne11 > 3)) {
gemm(ne00, (float *)((char *) dst->data) + src0_start, ne01, (const char *) src0->data + src0_start * nb01,
(const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
}
for (int iter = gemm ? ne11 - ne11 % 4 : 0; iter < ne11; iter++) {
gemv(ne00, (float *)((char *) dst->data + (iter * nb1)) + src0_start, ne01,
(const char *) src0->data + src0_start * nb01, (const char *) src1_wdata + (src1_col_stride * iter), 1,
src0_end - src0_start);
}
return;
}
// The first chunk comes from our thread_id, the rest will get auto-assigned.
int current_chunk = ith;
@ -7639,7 +7543,7 @@ UseGgmlGemm2:;
num_rows_per_vec_dot = 1;
}
ggml_compute_forward_mul_mat_one_chunk(params, dst, type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
ggml_compute_forward_mul_mat_one_chunk(params, dst, src0->type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
if (nth >= nchunk0 * nchunk1) {
break;
@ -7671,8 +7575,6 @@ static void ggml_compute_forward_mul_mat_id(
ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
int64_t const matmul_num_cols = type_traits_cpu[type].ncols;
ggml_gemv_t const gemv = type_traits_cpu[type].gemv;
// we don't support permuted src0 or src1
GGML_ASSERT(nb00 == ggml_type_size(type));
@ -7758,34 +7660,6 @@ static void ggml_compute_forward_mul_mat_id(
const int64_t nr0 = ne01; // src0 rows
const int64_t nr1 = cne1; // src1 rows
if (((ggml_n_dims(src0) - 1) == 2) && gemv) {
int64_t src0_cur_start = (ith * ne01) / nth;
int64_t src0_cur_end = ((ith + 1) * ne01) / nth;
src0_cur_start = (src0_cur_start % matmul_num_cols) ? src0_cur_start + matmul_num_cols - (src0_cur_start % matmul_num_cols): src0_cur_start;
src0_cur_end = (src0_cur_end % matmul_num_cols) ? src0_cur_end + matmul_num_cols - (src0_cur_end % matmul_num_cols): src0_cur_end;
if (src0_cur_start >= src0_cur_end) return;
for (int ir1 = 0; ir1 < nr1; ir1++) {
struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1);
const int id = row_mapping.i1; // selected expert index
const int64_t i11 = id % ne11;
const int64_t i12 = row_mapping.i2; // row index in src1
const int64_t i1 = id; // selected expert index
const int64_t i2 = i12; // row
const char * src1_col = (const char *) wdata +
(src1_cont || src1->type != vec_dot_type
? (i11 + i12 * ne11) * row_size
: (i11 * nb11 + i12 * nb12));
gemv(ne00, (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01,
(const char *) src0_cur + src0_cur_start * nb01, src1_col, 1, src0_cur_end - src0_cur_start);
}
continue;
}
// distribute the thread work across the inner or outer loop based on which one is larger
const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
@ -8093,9 +7967,6 @@ static void ggml_compute_forward_out_prod(
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
{
ggml_compute_forward_out_prod_q_f32(params, dst);
} break;
@ -8248,6 +8119,77 @@ static void ggml_compute_forward_set_f32(
}
}
static void ggml_compute_forward_set_i32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
// view src0 and dst with these strides and data offset in bytes during set
// nb0 is implicitly element_size because src0 and dst are contiguous
size_t nb1 = ((int32_t *) dst->op_params)[0];
size_t nb2 = ((int32_t *) dst->op_params)[1];
size_t nb3 = ((int32_t *) dst->op_params)[2];
size_t offset = ((int32_t *) dst->op_params)[3];
bool inplace = (bool) ((int32_t *) dst->op_params)[4];
if (!inplace) {
if (params->ith == 0) {
// memcpy needs to be synchronized across threads to avoid race conditions.
// => do it in INIT phase
memcpy(
((char *) dst->data),
((char *) src0->data),
ggml_nbytes(dst));
}
ggml_barrier(params->threadpool);
}
const int ith = params->ith;
const int nth = params->nth;
const int nr = ggml_nrows(src1);
const int nc = src1->ne[0];
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne)
GGML_TENSOR_LOCALS(size_t, nb1, src1, nb)
// src0 and dst as viewed during set
const size_t nb0 = ggml_element_size(src0);
const int im0 = (ne10 == 0 ? 0 : ne10-1);
const int im1 = (ne11 == 0 ? 0 : ne11-1);
const int im2 = (ne12 == 0 ? 0 : ne12-1);
const int im3 = (ne13 == 0 ? 0 : ne13-1);
GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 <= ggml_nbytes(dst));
GGML_ASSERT(nb10 == sizeof(int32_t));
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
for (int ir = ir0; ir < ir1; ++ir) {
// src0 and dst are viewed with shape of src1 and offset
// => same indices
const int i3 = ir/(ne12*ne11);
const int i2 = (ir - i3*ne12*ne11)/ne11;
const int i1 = (ir - i3*ne12*ne11 - i2*ne11);
ggml_vec_cpy_i32(nc,
(int32_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset),
(int32_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
}
}
static void ggml_compute_forward_set(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
@ -8259,6 +8201,10 @@ static void ggml_compute_forward_set(
{
ggml_compute_forward_set_f32(params, dst);
} break;
case GGML_TYPE_I32:
{
ggml_compute_forward_set_i32(params, dst);
} break;
case GGML_TYPE_F16:
case GGML_TYPE_BF16:
case GGML_TYPE_Q4_0:
@ -8283,9 +8229,6 @@ static void ggml_compute_forward_set(
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
default:
{
GGML_ABORT("fatal error");
@ -8547,9 +8490,6 @@ static void ggml_compute_forward_get_rows(
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
{
ggml_compute_forward_get_rows_q(params, dst);
} break;
@ -9139,10 +9079,6 @@ static void ggml_compute_forward_clamp(
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q8_K:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
case GGML_TYPE_IQ4_NL_4_4:
case GGML_TYPE_I8:
case GGML_TYPE_I16:
case GGML_TYPE_I32:
@ -10439,6 +10375,40 @@ static void ggml_compute_forward_pad(
}
}
// ggml_compute_forward_pad_reflect_1d
static void ggml_compute_forward_pad_reflect_1d(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
const int ith = params->ith;
const int nth = params->nth;
const int32_t * opts = (const int32_t *) dst->op_params;
const int p0 = opts[0];
const int p1 = opts[1];
GGML_TENSOR_UNARY_OP_LOCALS
for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = 0; i2 < ne2; i2++) {
for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
float * left = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + p0*nb0);
float * right = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (ne0-p1-1)*nb0);
ggml_vec_cpy_f32(ne00, left, (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01));
for (int i0 = 1; i0 <= p0; i0++) { left[-i0] = left[i0]; }
for (int i0 = 1; i0 <= p1; i0++) { right[i0] = right[-i0]; }
}
}
}
}
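// Worked example (hypothetical values): ne00 = 4 with input row [a b c d], p0 = 2, p1 = 1
// produces the dst row [c b a b c d c].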
// ggml_compute_forward_arange
@ -12314,6 +12284,9 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
return;
}
// extra_buffer op?
if (ggml_cpu_extra_compute_forward(params, tensor)) return;
switch (tensor->op) {
case GGML_OP_DUP:
{
@ -12535,6 +12508,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_pad(params, tensor);
} break;
case GGML_OP_PAD_REFLECT_1D:
{
ggml_compute_forward_pad_reflect_1d(params, tensor);
} break;
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
@ -12877,6 +12854,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break;
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
@ -12966,7 +12944,7 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data);
#include "windows.h"
// TODO: support > 64 CPUs
bool ggml_thread_apply_affinity(bool * mask) {
static bool ggml_thread_apply_affinity(bool * mask) {
HANDLE h = GetCurrentThread();
uint64_t bitmask = 0ULL;
@ -13256,146 +13234,142 @@ struct ggml_cplan ggml_graph_plan(
size_t cur = 0;
switch (node->op) {
case GGML_OP_CPY:
case GGML_OP_DUP:
{
if (ggml_is_quantized(node->type) ||
// F16 -> BF16 and BF16 -> F16 copies go through intermediate F32
(node->src[0]->type == GGML_TYPE_F16 && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) ||
(node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16)) {
if (!ggml_cpu_extra_work_size(n_threads, node, &cur)) {
switch (node->op) {
case GGML_OP_CPY:
case GGML_OP_DUP:
{
if (ggml_is_quantized(node->type) ||
// F16 -> BF16 and BF16 -> F16 copies go through intermediate F32
(node->src[0]->type == GGML_TYPE_F16 && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) ||
(node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16)) {
cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
}
} break;
case GGML_OP_ADD:
case GGML_OP_ADD1:
{
if (ggml_is_quantized(node->src[0]->type)) {
cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
}
} break;
case GGML_OP_ACC:
    {
        if (ggml_is_quantized(node->src[0]->type)) {
            cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
        }
    } break;
case GGML_OP_COUNT_EQUAL:
    {
        cur = ggml_type_size(node->type)*n_tasks;
    } break;
case GGML_OP_MUL_MAT:
    {
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
        if (node->src[0]->buffer && ggml_backend_amx_buft_is_amx(node->src[0]->buffer->buft)) {
            cur = ggml_backend_amx_desired_wsize(node);
        }
#endif
        const enum ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type;
        if (node->src[1]->type != vec_dot_type) {
            size_t cur2 = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
            cur = MAX(cur, cur2);
        }
    } break;
case GGML_OP_MUL_MAT_ID:
    {
        cur = 0;
        const struct ggml_tensor * src0 = node->src[0];
        const struct ggml_tensor * src1 = node->src[1];
        const enum ggml_type vec_dot_type = type_traits_cpu[src0->type].vec_dot_type;
        if (src1->type != vec_dot_type) {
            cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
        }
        const int n_as = src0->ne[2];
        cur += GGML_PAD(cur, sizeof(int64_t));       // align
        cur += n_as * sizeof(int64_t);               // matrix_row_counts
        cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
    } break;
case GGML_OP_OUT_PROD:
    {
        if (ggml_is_quantized(node->src[0]->type)) {
            cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
        }
    } break;
case GGML_OP_SOFT_MAX:
case GGML_OP_ROPE:
    {
        cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
    } break;
case GGML_OP_CONV_TRANSPOSE_1D:
    {
        GGML_ASSERT(node->src[0]->ne[3] == 1);
        GGML_ASSERT(node->src[1]->ne[2] == 1);
        GGML_ASSERT(node->src[1]->ne[3] == 1);
        const int64_t ne00 = node->src[0]->ne[0]; // K
        const int64_t ne01 = node->src[0]->ne[1]; // Cout
        const int64_t ne02 = node->src[0]->ne[2]; // Cin
        const int64_t ne10 = node->src[1]->ne[0]; // L
        const int64_t ne11 = node->src[1]->ne[1]; // Cin
        if ((node->src[0]->type == GGML_TYPE_F16 ||
             node->src[0]->type == GGML_TYPE_BF16) &&
            node->src[1]->type == GGML_TYPE_F32) {
            cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02;
            cur += sizeof(ggml_fp16_t)*ne10*ne11;
        } else if (node->src[0]->type == GGML_TYPE_F32 &&
                   node->src[1]->type == GGML_TYPE_F32) {
            cur += sizeof(float)*ne00*ne01*ne02;
            cur += sizeof(float)*ne10*ne11;
        } else {
            GGML_ABORT("fatal error");
        }
    } break;
case GGML_OP_CONV_TRANSPOSE_2D:
    {
        const int64_t ne00 = node->src[0]->ne[0]; // W
        const int64_t ne01 = node->src[0]->ne[1]; // H
        const int64_t ne02 = node->src[0]->ne[2]; // Channels Out
        const int64_t ne03 = node->src[0]->ne[3]; // Channels In
        const int64_t ne10 = node->src[1]->ne[0]; // W
        const int64_t ne11 = node->src[1]->ne[1]; // H
        const int64_t ne12 = node->src[1]->ne[2]; // Channels In
        cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
        cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
    } break;
case GGML_OP_FLASH_ATTN_EXT:
    {
        const int64_t ne00 = node->src[0]->ne[0]; // D
        cur = 3*sizeof(float)*ne00*n_tasks; // 3x head size/thread
    } break;
case GGML_OP_FLASH_ATTN_BACK:
    {
        const int64_t    D = node->src[0]->ne[0];
        const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
        const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
        if (node->src[1]->type == GGML_TYPE_F32) {
            cur  = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
            cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
        } else if (node->src[1]->type == GGML_TYPE_F16) {
            cur  = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
            cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
        } else if (node->src[1]->type == GGML_TYPE_BF16) {
            cur  = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
            cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
        }
    } break;
case GGML_OP_CROSS_ENTROPY_LOSS:
    {
        cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
    } break;
case GGML_OP_COUNT:
    {
        GGML_ABORT("fatal error");
    }
default:
    break;
}

work_size = MAX(work_size, cur);
}
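For orientation, the MUL_MAT_ID branch above sizes the per-graph scratch area in three pieces: an optional conversion of src1 to the vec_dot type, an alignment step, and the per-expert row-count and row-index tables. A minimal host-side sketch of that arithmetic, with made-up tensor sizes and GGML_PAD written out the way ggml.h defines it (row_sz_conv stands in for the ggml_row_size(...) term):

// Illustrative sketch, not part of this commit: MUL_MAT_ID work-size arithmetic.
#include <cstdint>
#include <cstdio>

// same definition as ggml.h: round x up to the next multiple of n (n must be a power of two)
#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

int main() {
    const int64_t n_as        = 8;    // experts, src0->ne[2]  (made-up value)
    const int64_t n_batch     = 32;   // src1->ne[2]           (made-up value)
    const size_t  row_sz_conv = 4096; // stands in for ggml_row_size(vec_dot_type, ggml_nelements(src1))

    size_t cur = 0;
    cur += row_sz_conv;                      // src1 converted to vec_dot_type, if needed
    cur += GGML_PAD(cur, sizeof(int64_t));   // alignment step, written exactly as in the branch above
    cur += n_as * sizeof(int64_t);           // matrix_row_counts
    cur += n_as * n_batch * sizeof(int64_t); // matrix_rows
    printf("MUL_MAT_ID work size: %zu bytes\n", cur);
    return 0;
}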

View File

@ -2,12 +2,18 @@
#include "ggml-backend-impl.h"
#include "ggml-cpu.h"
#include "ggml-cpu-aarch64.h"
#include "ggml-cpu-traits.h"
#include "ggml-impl.h"
#include "amx/amx.h"
#include <cctype>
#include <string>
#include <vector>
#ifdef GGML_USE_CPU_HBM
#include "ggml-cpu-hbm.h"
#endif
#if defined(__APPLE__)
#include <sys/types.h>
#include <sys/sysctl.h>
@ -23,115 +29,7 @@
// ggml-backend interface
#ifdef GGML_USE_CPU_HBM
// buffer type HBM
#include <hbwmalloc.h>
static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
return "CPU_HBM";
GGML_UNUSED(buft);
}
static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
hbw_free(buffer->context);
}
static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
void * ptr;
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
if (result != 0) {
GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
return NULL;
}
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
buffer->buft = buft;
buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
return buffer;
}
ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
/* .iface = */ {
/* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
},
/* .context = */ NULL,
};
return &ggml_backend_cpu_buffer_type_hbm;
}
#endif
// buffer type AARCH64
static void ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
tensor->extra = (void *)ggml_aarch64_get_optimal_repack_type(tensor); // NOLINT
GGML_UNUSED(buffer);
}
static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
GGML_ASSERT(offset == 0);
GGML_ASSERT(size == ggml_nbytes(tensor));
enum ggml_type repack_type = (enum ggml_type)(intptr_t)tensor->extra;
ggml_aarch64_repack_tensor(tensor, repack_type, data, size);
GGML_UNUSED(buffer);
}
static const char * ggml_backend_cpu_aarch64_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
return "CPU_AARCH64";
GGML_UNUSED(buft);
}
static ggml_backend_buffer_t ggml_backend_cpu_aarch64_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
auto * buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
if (buffer == NULL) {
return NULL;
}
buffer->buft = buft;
buffer->iface.init_tensor = ggml_backend_cpu_aarch64_buffer_init_tensor;
buffer->iface.set_tensor = ggml_backend_cpu_aarch64_buffer_set_tensor;
return buffer;
}
ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void) {
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_aarch64 = {
/* .iface = */ {
/* .get_name = */ ggml_backend_cpu_aarch64_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_cpu_aarch64_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .is_host = */ NULL,
},
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
/* .context = */ NULL,
};
return &ggml_backend_cpu_buffer_type_aarch64;
}
bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft) {
return buft == ggml_backend_cpu_aarch64_buffer_type();
}
static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type() {
static std::vector<ggml_backend_buffer_type_t> bufts = []() {
std::vector<ggml_backend_buffer_type_t> bufts;
@ -152,11 +50,22 @@ static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backen
return bufts;
}();
return bufts.data();
return bufts;
}
static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_type(ggml_backend_dev_t device) {
return ggml_backend_cpu_get_extra_buffers_type().data();
GGML_UNUSED(device);
}
static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) {
for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
if (extra && extra == buft) return true;
}
return false;
}
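With the aarch64, AMX, and HBM specializations folded into one list of extra buffer types, outside code still reaches them only through the registry's proc-address hook, as before. A hedged sketch of that lookup, assuming the usual ggml-backend.h declarations and that the returned array is NULL-terminated the way the CPU backend builds it:

// Illustrative sketch, not part of this commit: enumerating the CPU backend's extra buffer types.
#include "ggml-backend.h"
#include <cstdio>

int main() {
    ggml_backend_reg_t reg = ggml_backend_reg_by_name("CPU");
    if (reg == nullptr) {
        return 1;
    }
    auto get_extra_bufts = (ggml_backend_dev_get_extra_bufts_t)
        ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts");
    if (get_extra_bufts == nullptr) {
        return 0; // hook not provided by this build
    }
    ggml_backend_dev_t dev = ggml_backend_reg_dev_get(reg, 0);
    // assumed NULL-terminated, matching how the CPU backend assembles the list
    for (ggml_backend_buffer_type_t * extra = get_extra_bufts(dev); extra && *extra; ++extra) {
        printf("extra buffer type: %s\n", ggml_backend_buft_name(*extra));
    }
    return 0;
}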
// CPU backend - backend (stream)
struct ggml_backend_cpu_context {
@ -465,25 +374,19 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
return true;
}
if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
if (op->op != GGML_OP_MUL_MAT || src0->type == ggml_aarch64_get_optimal_repack_type(src0)) {
return false;
// extra_buffer_op?
for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
if (extra) {
auto buf_extra = (ggml::cpu::extra_buffer_type*) extra->context;
if (buf_extra && buf_extra->supports_op(dev, op)) {
return true;
}
}
}
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
if (src0 && src0->buffer && ggml_backend_amx_buft_is_amx(src0->buffer->buft)) {
return ggml_backend_amx_device_supports_op(op);
}
for (int i = 1; i < GGML_MAX_SRC; i++) {
if (op->src[i] && op->src[i]->buffer && ggml_backend_amx_buft_is_amx(op->src[i]->buffer->buft)) {
return false;
}
}
#endif
for (int i = 1; i < GGML_MAX_SRC; i++) {
if (op->src[i] && op->src[i]->buffer && ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) {
// the other cases need a host buffer.
for (int i = 0; i < GGML_MAX_SRC; i++) {
if (op->src[i] && op->src[i]->buffer && !ggml_backend_buft_is_host(op->src[i]->buffer->buft)) {
return false;
}
}
@ -506,19 +409,10 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
default:
return true;
}
GGML_UNUSED(dev);
}
static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
bool supported = ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft);
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
supported = supported || ggml_backend_amx_buft_is_amx(buft);
#endif
return supported;
return ggml_backend_buft_is_host(buft) || ggml_backend_cpu_is_extra_buffer_type(buft);
GGML_UNUSED(dev);
}
@ -641,7 +535,15 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
if (ggml_cpu_has_llamafile()) {
features.push_back({ "LLAMAFILE", "1" });
}
// TODO: rename this
#ifdef GGML_USE_ACCELERATE
features.push_back({ "ACCELERATE", "1" });
#endif
#ifdef GGML_USE_CPU_HBM
features.push_back({ "CPU_HBM", "1" });
#endif
#ifdef GGML_USE_OPENMP
features.push_back({ "OPENMP", "1" });
#endif
#ifdef GGML_USE_CPU_AARCH64
features.push_back({ "AARCH64_REPACK", "1" });
#endif
@ -658,10 +560,12 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) {
if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
return (void *)ggml_backend_cpu_set_n_threads;
ggml_backend_set_n_threads_t fct = ggml_backend_cpu_set_n_threads;
return (void *)fct;
}
if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
return (void *)ggml_backend_cpu_get_extra_bufts;
ggml_backend_dev_get_extra_bufts_t fct = ggml_backend_cpu_device_get_extra_buffers_type;
return (void *)fct;
}
if (strcmp(name, "ggml_backend_get_features") == 0) {
return (void *)ggml_backend_cpu_get_features;
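The assignment to a named function-pointer type before the (void *) cast is there so the compiler verifies that the exported function still matches the public typedef; a direct cast would hide a signature drift. A small sketch of the same pattern, using hypothetical names rather than the ggml API:

// Illustrative sketch, not part of this commit: typed function-pointer cast in a proc-address hook.
#include <cstring>

// hypothetical typedef and function, not ggml API
typedef void (*set_n_threads_t)(void * backend, int n_threads);

static void my_backend_set_n_threads(void * backend, int n_threads) {
    (void) backend;
    (void) n_threads;
}

static void * my_get_proc_address(const char * name) {
    if (strcmp(name, "set_n_threads") == 0) {
        // this assignment fails to compile if the function ever stops matching the typedef
        set_n_threads_t fct = my_backend_set_n_threads;
        return (void *) fct;
    }
    return nullptr;
}

int main() {
    return my_get_proc_address("set_n_threads") != nullptr ? 0 : 1;
}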

View File

@ -41,28 +41,28 @@
#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
#define CUDART_HMASK 12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons
#define CC_PASCAL 600
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
#define CC_VOLTA 700
#define CC_TURING 750
#define CC_AMPERE 800
#define CC_OFFSET_AMD 1000000
#define GGML_CUDA_CC_PASCAL 600
#define GGML_CUDA_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
#define GGML_CUDA_CC_VOLTA 700
#define GGML_CUDA_CC_TURING 750
#define GGML_CUDA_CC_AMPERE 800
#define GGML_CUDA_CC_OFFSET_AMD 1000000
// GCN/CDNA, wave size is 64
#define CC_GCN4 (CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16
#define CC_VEGA (CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue
#define CC_VEGA20 (CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a
#define CC_CDNA (CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers
#define CC_CDNA2 (CC_OFFSET_AMD + 910) // MI210, minimum acc register renaming
#define CC_CDNA3 (CC_OFFSET_AMD + 942) // MI300
#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16
#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue
#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a
#define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers
#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 910) // MI210, minimum acc register renaming
#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 942) // MI300
// RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
#define CC_RDNA1 (CC_OFFSET_AMD + 1010) // RX 5000
#define CC_RDNA2 (CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
#define CC_RDNA3 (CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA
#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 1010) // RX 5000
#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA
#define CC_QY1 210
#define CC_QY2 220
#define GGML_CUDA_CC_QY1 210
#define GGML_CUDA_CC_QY2 220
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
@ -131,36 +131,36 @@ typedef float dfloat; // dequantize float
typedef float2 dfloat2;
#endif // GGML_CUDA_F16
#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
#define FP16_AVAILABLE
#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
#define FAST_FP16_AVAILABLE
#endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
#define FP16_MMA_AVAILABLE
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
#define INT8_MMA_AVAILABLE
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
#define FLASH_ATTN_AVAILABLE
#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
static constexpr bool fast_fp16_available(const int cc) {
return cc >= CC_PASCAL && cc != 610;
return cc >= GGML_CUDA_CC_PASCAL && cc != 610;
}
static constexpr bool fp16_mma_available(const int cc) {
return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA;
}
static constexpr bool int8_mma_available(const int cc) {
return cc < CC_OFFSET_AMD && cc >= CC_TURING;
return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING;
}
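The renamed GGML_CUDA_CC_* constants keep the previous numeric scheme: NVIDIA targets use the raw compute-capability value and AMD targets are offset by GGML_CUDA_CC_OFFSET_AMD, so plain integer comparisons work for both vendors. A standalone sketch that copies the constants and helpers above and evaluates them for a few sample devices:

// Illustrative sketch, not part of this commit: evaluating the availability helpers on the host.
#include <cstdio>

// constants copied from the header above
constexpr int GGML_CUDA_CC_PASCAL     = 600;
constexpr int GGML_CUDA_CC_VOLTA      = 700;
constexpr int GGML_CUDA_CC_TURING     = 750;
constexpr int GGML_CUDA_CC_OFFSET_AMD = 1000000;
constexpr int GGML_CUDA_CC_RDNA3      = GGML_CUDA_CC_OFFSET_AMD + 1100;

constexpr bool fast_fp16_available(int cc) { return cc >= GGML_CUDA_CC_PASCAL && cc != 610; }
constexpr bool fp16_mma_available (int cc) { return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA; }
constexpr bool int8_mma_available (int cc) { return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING; }

int main() {
    const int ccs[] = { 610 /* e.g. GTX 10xx */, 750 /* Turing */, GGML_CUDA_CC_RDNA3 };
    for (int cc : ccs) {
        printf("cc=%-7d fast_fp16=%d fp16_mma=%d int8_mma=%d\n",
               cc, fast_fp16_available(cc), fp16_mma_available(cc), int8_mma_available(cc));
    }
    return 0;
}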
[[noreturn]]
@ -187,7 +187,7 @@ static __device__ void no_device_code(
#endif // __CUDA_ARCH__
static __device__ __forceinline__ int warp_reduce_sum(int x) {
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
return __reduce_add_sync(0xffffffff, x);
#else
#pragma unroll
@ -195,7 +195,7 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) {
x += __shfl_xor_sync(0xffffffff, x, offset, 32);
}
return x;
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
}
static __device__ __forceinline__ float warp_reduce_sum(float x) {
@ -284,7 +284,7 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal
}
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
#pragma unroll
for (int offset = 16; offset > 0; offset >>= 1) {
x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, 32));
@ -293,7 +293,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
#else
GGML_UNUSED(x);
NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
}
#if CUDART_VERSION < CUDART_HMASK
@ -333,13 +333,13 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if __CUDA_ARCH__ >= MIN_CC_DP4A
#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
return __dp4a(a, b, c);
#else // __CUDA_ARCH__ >= MIN_CC_DP4A
#else // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
const int8_t * a8 = (const int8_t *) &a;
const int8_t * b8 = (const int8_t *) &b;
return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
}
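On targets without __dp4a, the fallback above reinterprets each 32-bit operand as four signed bytes and accumulates their pairwise products onto c. The same arithmetic as a plain host-side reference, useful for checking the packed dot product by hand:

// Illustrative sketch, not part of this commit: host-side equivalent of the dp4a fallback path.
#include <cstdint>
#include <cstdio>
#include <cstring>

static int dp4a_ref(int a, int b, int c) {
    const int8_t * a8 = (const int8_t *) &a;
    const int8_t * b8 = (const int8_t *) &b;
    return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
}

int main() {
    const int8_t av[4] = { 1, -2,  3, -4 };
    const int8_t bv[4] = { 5,  6, -7,  8 };
    int a, b;
    memcpy(&a, av, sizeof(a));
    memcpy(&b, bv, sizeof(b));
    // 1*5 + (-2)*6 + 3*(-7) + (-4)*8 = -60
    printf("dp4a = %d\n", dp4a_ref(a, b, 0));
    return 0;
}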

View File

@ -94,7 +94,9 @@ static void concat_f32_cuda(const float * x, const float * y, float * dst, int n
}
// non-contiguous kernel (slow)
static __global__ void concat_f32_non_cont(
template <int dim>
static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
concat_f32_non_cont(
const char * src0,
const char * src1,
char * dst,
@ -121,22 +123,28 @@ static __global__ void concat_f32_non_cont(
uint64_t nb0,
uint64_t nb1,
uint64_t nb2,
uint64_t nb3,
int32_t dim) {
uint64_t nb3){
static_assert(dim >= 0 && dim <= 3);
const int64_t i3 = blockIdx.z;
const int64_t i2 = blockIdx.y;
const int64_t i1 = blockIdx.x;
int64_t o[4] = {0, 0, 0, 0};
o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
const float * x;
for (int i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
for (int64_t i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
x = (const float *)(src0 + (i3 )*nb03 + (i2 )*nb02 + (i1 )*nb01 + (i0 )*nb00);
} else {
x = (const float *)(src1 + (i3 - o[3])*nb13 + (i2 - o[2])*nb12 + (i1 - o[1])*nb11 + (i0 - o[0])*nb10);
if constexpr (dim == 0) {
x = (const float *) (src1 + i3 * nb13 + i2 * nb12 + i1 * nb11 + (i0 - ne00) * nb10);
} else if constexpr (dim == 1) {
x = (const float *) (src1 + i3 * nb13 + i2 * nb12 + (i1 - ne01) * nb11 + i0 * nb10);
} else if constexpr (dim == 2) {
x = (const float *) (src1 + i3 * nb13 + (i2 - ne02) * nb12 + i1 * nb11 + i0 * nb10);
} else if constexpr (dim == 3) {
x = (const float *) (src1 + (i3 - ne03) * nb13 + i2 * nb12 + i1 * nb11 + i0 * nb10);
}
}
float * y = (float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
@ -182,15 +190,32 @@ void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
}
} else {
dim3 grid_dim(dst->ne[1], dst->ne[2], dst->ne[3]);
concat_f32_non_cont<<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
(const char *)src0->data,
(const char *)src1->data,
( char *)dst->data,
auto launch_kernel = [&](auto dim) {
concat_f32_non_cont<dim><<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
(const char *) src0->data, (const char *) src1->data, (char *) dst->data,
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3],
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3]);
};
switch (dim) {
case 0:
launch_kernel(std::integral_constant<int, 0>{});
break;
case 1:
launch_kernel(std::integral_constant<int, 1>{});
break;
case 2:
launch_kernel(std::integral_constant<int, 2>{});
break;
case 3:
launch_kernel(std::integral_constant<int, 3>{});
break;
default:
GGML_ABORT("Invalid dim: %d", dim);
break;
}
}
}
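The launch path now turns the runtime dim argument into a compile-time template parameter, so each concat direction gets its own specialized kernel without repeating the long argument list four times. The same switch-plus-std::integral_constant dispatch pattern in isolation, with a printf stand-in for the kernel launch:

// Illustrative sketch, not part of this commit: runtime-to-compile-time dispatch via std::integral_constant.
#include <cstdio>
#include <type_traits>

template <int dim>
static void do_work() { // stand-in for concat_f32_non_cont<dim><<<...>>>(...)
    printf("specialized for dim=%d\n", dim);
}

static void dispatch(int dim) {
    auto launch = [](auto d) { do_work<d>(); }; // d carries the value as a constant expression
    switch (dim) {
        case 0: launch(std::integral_constant<int, 0>{}); break;
        case 1: launch(std::integral_constant<int, 1>{}); break;
        case 2: launch(std::integral_constant<int, 2>{}); break;
        case 3: launch(std::integral_constant<int, 3>{}); break;
        default: break;
    }
}

int main() {
    dispatch(2); // prints: specialized for dim=2
    return 0;
}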

View File

@ -26,7 +26,7 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
template <bool need_check>
static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int64_t k) {
#if __CUDA_ARCH__ >= CC_PASCAL
#if __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
const int64_t i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
@ -64,7 +64,7 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
GGML_UNUSED(y);
GGML_UNUSED(k);
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_PASCAL
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
}
template<typename dst_t>
@ -599,7 +599,7 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
case GGML_TYPE_Q5_1:
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
case GGML_TYPE_Q8_0:
if (ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= CC_PASCAL) {
if (ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= GGML_CUDA_CC_PASCAL) {
return dequantize_block_q8_0_f16_cuda;
}
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;

View File

@ -220,7 +220,6 @@ static __global__ void flash_attn_vec_ext_f16(
for (int j = 0; j < ncols; ++j) {
half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j];
kqmax_new_j = warp_reduce_max(kqmax_new_j);
if (threadIdx.x == 0) {
kqmax_shared[j][threadIdx.y] = kqmax_new_j;
}

View File

@ -206,7 +206,6 @@ static __global__ void flash_attn_vec_ext_f32(
for (int j = 0; j < ncols; ++j) {
float kqmax_new_j = kqmax_new_arr[j];
kqmax_new_j = warp_reduce_max(kqmax_new_j);
if (threadIdx.x == 0) {
kqmax_shared[j][threadIdx.y] = kqmax_new_j;
}

Some files were not shown because too many files have changed in this diff.