diff --git a/.devops/cann.Dockerfile b/.devops/cann.Dockerfile index cd8f87b2ea..83182c9700 100644 --- a/.devops/cann.Dockerfile +++ b/.devops/cann.Dockerfile @@ -4,7 +4,7 @@ # Define the CANN base image for easier version updates later ARG CHIP_TYPE=910b -ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc1.alpha001-${CHIP_TYPE}-openeuler22.03-py3.11 +ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc2-${CHIP_TYPE}-openeuler24.03-py3.11 # ============================================================================== # BUILD STAGE @@ -111,7 +111,7 @@ ENTRYPOINT ["/app/tools.sh"] # ============================================================================== FROM base AS light -COPY --from=build /app/full/llama-cli /app +COPY --from=build /app/full/llama-cli /app/full/llama-completion /app ENTRYPOINT [ "/app/llama-cli" ] diff --git a/.devops/cpu.Dockerfile b/.devops/cpu.Dockerfile index 6e16ecda44..b9e84ab986 100644 --- a/.devops/cpu.Dockerfile +++ b/.devops/cpu.Dockerfile @@ -68,7 +68,7 @@ ENTRYPOINT ["/app/tools.sh"] ### Light, CLI only FROM base AS light -COPY --from=build /app/full/llama-cli /app +COPY --from=build /app/full/llama-cli /app/full/llama-completion /app WORKDIR /app diff --git a/.devops/cuda.Dockerfile b/.devops/cuda.Dockerfile index 54f793d0a3..fed5863157 100644 --- a/.devops/cuda.Dockerfile +++ b/.devops/cuda.Dockerfile @@ -74,7 +74,7 @@ ENTRYPOINT ["/app/tools.sh"] ### Light, CLI only FROM base AS light -COPY --from=build /app/full/llama-cli /app +COPY --from=build /app/full/llama-cli /app/full/llama-completion /app WORKDIR /app diff --git a/.devops/intel.Dockerfile b/.devops/intel.Dockerfile index d1a8fbed4c..adebf08229 100644 --- a/.devops/intel.Dockerfile +++ b/.devops/intel.Dockerfile @@ -73,7 +73,7 @@ ENTRYPOINT ["/app/tools.sh"] FROM base AS light COPY --from=build /app/lib/ /app -COPY --from=build /app/full/llama-cli /app +COPY --from=build /app/full/llama-cli /app/full/llama-completion /app WORKDIR /app diff --git a/.devops/musa.Dockerfile b/.devops/musa.Dockerfile index faa3500e61..34d6ad9f40 100644 --- a/.devops/musa.Dockerfile +++ b/.devops/musa.Dockerfile @@ -81,7 +81,7 @@ ENTRYPOINT ["/app/tools.sh"] ### Light, CLI only FROM base AS light -COPY --from=build /app/full/llama-cli /app +COPY --from=build /app/full/llama-cli /app/full/llama-completion /app WORKDIR /app diff --git a/.devops/rocm.Dockerfile b/.devops/rocm.Dockerfile index d6bf28b105..53c3ed8d88 100644 --- a/.devops/rocm.Dockerfile +++ b/.devops/rocm.Dockerfile @@ -94,7 +94,7 @@ ENTRYPOINT ["/app/tools.sh"] ### Light, CLI only FROM base AS light -COPY --from=build /app/full/llama-cli /app +COPY --from=build /app/full/llama-cli /app/full/llama-completion /app WORKDIR /app diff --git a/.devops/s390x.Dockerfile b/.devops/s390x.Dockerfile index b7c9457680..1e66f061d5 100644 --- a/.devops/s390x.Dockerfile +++ b/.devops/s390x.Dockerfile @@ -105,7 +105,7 @@ WORKDIR /llama.cpp/bin # Copy llama.cpp binaries and libraries COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin -COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin +COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin/llama-completion /llama.cpp/bin ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ] diff --git a/.devops/tools.sh b/.devops/tools.sh index 8a3a693400..cc5ee17dfd 100755 --- a/.devops/tools.sh +++ b/.devops/tools.sh @@ -13,6 +13,8 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then exec ./llama-quantize "$@" elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then exec ./llama-cli "$@" +elif [[ "$arg1" == '--run-legacy' 
|| "$arg1" == '-l' ]]; then + exec ./llama-completion "$@" elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then exec ./llama-bench "$@" elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then @@ -32,8 +34,10 @@ elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then else echo "Unknown command: $arg1" echo "Available commands: " - echo " --run (-r): Run a model previously converted into ggml" - echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512" + echo " --run (-r): Run a model (chat) previously converted into ggml" + echo " ex: -m /models/7B/ggml-model-q4_0.bin" + echo " --run-legacy (-l): Run a model (legacy completion) previously converted into ggml" + echo " ex: -m /models/7B/ggml-model-q4_0.bin -no-cnv -p \"Building a website can be done in 10 simple steps:\" -n 512" echo " --bench (-b): Benchmark the performance of the inference for various parameters." echo " ex: -m model.gguf" echo " --perplexity (-p): Measure the perplexity of a model over a given text." diff --git a/.devops/vulkan.Dockerfile b/.devops/vulkan.Dockerfile index fd7195c5be..b37b4f277d 100644 --- a/.devops/vulkan.Dockerfile +++ b/.devops/vulkan.Dockerfile @@ -68,7 +68,7 @@ ENTRYPOINT ["/app/tools.sh"] ### Light, CLI only FROM base AS light -COPY --from=build /app/full/llama-cli /app +COPY --from=build /app/full/llama-cli /app/full/llama-completion /app WORKDIR /app diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 383427f36f..9fe1401df4 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1400,25 +1400,54 @@ jobs: chip_type: ['910b', '310p'] build: ['Release'] runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }} - container: ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc1.alpha001-910b-openeuler22.03-py3.11' || '8.2.rc1-310p-openeuler22.03-py3.11' }} steps: - name: Checkout uses: actions/checkout@v4 + with: + fetch-depth: 0 - - name: Dependencies + - name: Free up disk space + uses: ggml-org/free-disk-space@v1.3.1 + with: + tool-cache: true + + - name: Set container image + id: cann-image run: | - yum update -y - yum install -y git gcc gcc-c++ make cmake libcurl-devel + image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}" + echo "image=${image}" >> "${GITHUB_OUTPUT}" + + - name: Pull container image + run: docker pull "${{ steps.cann-image.outputs.image }}" - name: Build + env: + BUILD_TYPE: ${{ matrix.build }} + SOC_TYPE: ascend${{ matrix.chip_type }} run: | - export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH} + HOST_UID=$(id -u) + HOST_GID=$(id -g) - cmake -S . -B build \ - -DCMAKE_BUILD_TYPE=${{ matrix.build }} \ - -DGGML_CANN=on \ - -DSOC_TYPE=ascend${{ matrix.chip_type }} - cmake --build build -j $(nproc) + docker run --rm \ + -v "${PWD}:/workspace" \ + -w /workspace \ + -e SOC_TYPE=${SOC_TYPE} \ + -e BUILD_TYPE=${BUILD_TYPE} \ + "${{ steps.cann-image.outputs.image }}" \ + bash -lc ' + set -e + yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake libcurl-devel + yum clean all && rm -rf /var/cache/yum + git config --global --add safe.directory "/workspace" + export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH} + cmake -S . 
-B build \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DGGML_CANN=on \ + -DSOC_TYPE=${SOC_TYPE} + cmake --build build -j $(nproc) + + chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build + ' # TODO: simplify the following workflows using a matrix # TODO: run lighter CI on PRs and the full CI only on master (if needed) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 77aec20c11..446cae9f84 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -731,6 +731,78 @@ jobs: path: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz name: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz + + openEuler-cann: + strategy: + matrix: + arch: [x86, aarch64] + chip_type: ['910b', '310p'] + build: ['Release'] + runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }} + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Free up disk space + uses: ggml-org/free-disk-space@v1.3.1 + with: + tool-cache: true + + - name: Set container image + id: cann-image + run: | + image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}" + echo "image=${image}" >> "${GITHUB_OUTPUT}" + + - name: Pull container image + run: docker pull "${{ steps.cann-image.outputs.image }}" + + - name: Build + env: + BUILD_TYPE: ${{ matrix.build }} + SOC_TYPE: ascend${{ matrix.chip_type }} + run: | + HOST_UID=$(id -u) + HOST_GID=$(id -g) + + docker run --rm \ + -v "${PWD}:/workspace" \ + -w /workspace \ + -e SOC_TYPE=${SOC_TYPE} \ + -e BUILD_TYPE=${BUILD_TYPE} \ + "${{ steps.cann-image.outputs.image }}" \ + bash -lc ' + set -e + yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake libcurl-devel + yum clean all && rm -rf /var/cache/yum + git config --global --add safe.directory "/workspace" + export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH} + cmake -S . -B build \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DGGML_CANN=on \ + -DSOC_TYPE=${SOC_TYPE} + cmake --build build -j $(nproc) + + chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build + ' + + - name: Determine tag name + id: tag + uses: ./.github/actions/get-tag-name + + - name: Pack artifacts + run: | + cp LICENSE ./build/bin/ + tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin . 
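          # (editorial illustration, not part of the workflow) the --transform above
          # prefixes every packed file with a versioned directory, so the archive is
          # expected to unpack roughly as:
          #   llama-<tag>/llama-cli
          #   llama-<tag>/llama-server
          #   llama-<tag>/LICENSE
          # assuming the usual build outputs land in ./build/bin; a quick local sanity
          # check would be: tar -tzf llama-<tag>-bin-910b-openEuler-aarch64.tar.gz | head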
+ + - name: Upload artifacts (tar) + uses: actions/upload-artifact@v4 + with: + path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz + name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz + release: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} @@ -752,6 +824,7 @@ jobs: - macOS-arm64 - macOS-x64 - ios-xcode-build + - openEuler-cann steps: - name: Clone @@ -844,6 +917,12 @@ jobs: - [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip) - [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip) + **openEuler:** + - [openEuler x86 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-x86.tar.gz) + - [openEuler x86 (910b)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-x86.tar.gz) + - [openEuler aarch64 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-aarch64.tar.gz) + - [openEuler aarch64 (910b)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-aarch64.tar.gz) + - name: Upload release id: upload_release uses: actions/github-script@v3 diff --git a/common/arg.cpp b/common/arg.cpp index a31dcbc689..19f22f883f 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -105,6 +105,16 @@ bool common_arg::is_exclude(enum llama_example ex) { bool common_arg::get_value_from_env(std::string & output) const { if (env == nullptr) return false; + if (!args_neg.empty()) { + // for compatibility, we need to check LLAMA_ARG_NO_ env as well + std::string neg_env = env; + string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_"); + char * neg_value = std::getenv(neg_env.c_str()); + if (neg_value) { + output = "0"; // falsey + return true; + } + } char * value = std::getenv(env); if (value) { output = value; @@ -114,6 +124,14 @@ bool common_arg::get_value_from_env(std::string & output) const { } bool common_arg::has_value_from_env() const { + if (env != nullptr && !args_neg.empty()) { + // for compatibility, we need to check LLAMA_ARG_NO_ env as well + std::string neg_env = env; + string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_"); + if (std::getenv(neg_env.c_str())) { + return true; + } + } return env != nullptr && std::getenv(env); } @@ -151,9 +169,10 @@ std::string common_arg::to_string() const { std::string leading_spaces(n_leading_spaces, ' '); std::ostringstream ss; - for (const auto arg : args) { - if (arg == args.front()) { - if (args.size() == 1) { + auto all_args = get_args(); // also contains args_neg + for (const auto & arg : all_args) { + if (arg == all_args.front()) { + if (all_args.size() == 1) { ss << arg; } else { // first arg is usually abbreviation, we need padding to make it more beautiful @@ -162,7 +181,7 @@ std::string common_arg::to_string() const { ss << tmp << spaces; } } else { - ss << arg << (arg != args.back() ? ", " : ""); + ss << arg << (arg != all_args.back() ? 
", " : ""); } } if (value_hint) ss << " " << value_hint; @@ -181,6 +200,31 @@ std::string common_arg::to_string() const { return ss.str(); } +std::vector common_arg::get_args() const { + std::vector result; + for (const auto & arg : args) { + result.push_back(std::string(arg)); + } + for (const auto & arg : args_neg) { + result.push_back(std::string(arg)); + } + return result; +} + +std::vector common_arg::get_env() const { + std::vector result; + if (env) { + result.push_back(std::string(env)); + } + if (!args_neg.empty() && env) { + // for compatibility, we need to add LLAMA_ARG_NO_ variant + std::string neg_env = env; + string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_"); + result.push_back(neg_env); + } + return result; +} + // // utils // @@ -316,6 +360,16 @@ static std::string get_all_kv_cache_types() { return msg.str(); } +static bool parse_bool_value(const std::string & value) { + if (is_truthy(value)) { + return true; + } else if (is_falsey(value)) { + return false; + } else { + throw std::invalid_argument("invalid boolean value"); + } +} + // // CLI argument parsing functions // @@ -323,10 +377,13 @@ static std::string get_all_kv_cache_types() { static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) { common_params & params = ctx_arg.params; - std::unordered_map arg_to_options; + std::unordered_map> arg_to_options; for (auto & opt : ctx_arg.options) { for (const auto & arg : opt.args) { - arg_to_options[arg] = &opt; + arg_to_options[arg] = {&opt, /* is_positive */ true}; + } + for (const auto & arg : opt.args_neg) { + arg_to_options[arg] = {&opt, /* is_positive */ false}; } } @@ -335,12 +392,15 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context std::string value; if (opt.get_value_from_env(value)) { try { - if (opt.handler_void && (value == "1" || value == "true")) { + if (opt.handler_void && is_truthy(value)) { opt.handler_void(params); } if (opt.handler_int) { opt.handler_int(params, std::stoi(value)); } + if (opt.handler_bool) { + opt.handler_bool(params, parse_bool_value(value)); + } if (opt.handler_string) { opt.handler_string(params, value); continue; @@ -369,7 +429,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context if (arg_to_options.find(arg) == arg_to_options.end()) { throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str())); } - auto opt = *arg_to_options[arg]; + auto & tmp = arg_to_options[arg]; + auto opt = *tmp.first; + bool is_positive = tmp.second; if (opt.has_value_from_env()) { fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str()); } @@ -378,6 +440,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context opt.handler_void(params); continue; } + if (opt.handler_bool) { + opt.handler_bool(params, is_positive); + continue; + } // arg with single value check_arg(i); @@ -402,7 +468,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context throw std::invalid_argument(string_format( "error while handling argument \"%s\": %s\n\n" "usage:\n%s\n\nto show complete usage, run with -h", - arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str())); + arg.c_str(), e.what(), opt.to_string().c_str())); } } @@ -438,7 +504,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context // model is required (except for server) // TODO @ngxson : maybe show a list of available models 
in CLI in this case - if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage) { + if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage && !params.completion) { throw std::invalid_argument("error: --model is required\n"); } @@ -573,6 +639,7 @@ static void common_params_print_completion(common_params_context & ctx_arg) { "llama-batched-bench", "llama-bench", "llama-cli", + "llama-completion", "llama-convert-llama2c-to-ggml", "llama-cvector-generator", "llama-embedding", @@ -750,11 +817,11 @@ static std::string list_builtin_chat_templates() { } bool common_arg_utils::is_truthy(const std::string & value) { - return value == "on" || value == "enabled" || value == "1"; + return value == "on" || value == "enabled" || value == "true" || value == "1"; } bool common_arg_utils::is_falsey(const std::string & value) { - return value == "off" || value == "disabled" || value == "0"; + return value == "off" || value == "disabled" || value == "false" || value == "0"; } bool common_arg_utils::is_autoy(const std::string & value) { @@ -839,10 +906,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } )); add_opt(common_arg( + {"--display-prompt"}, {"--no-display-prompt"}, - string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"), - [](common_params & params) { - params.display_prompt = false; + string_format("whether to print prompt at generation (default: %s)", params.display_prompt ? "true" : "false"), + [](common_params & params, bool value) { + params.display_prompt = value; } ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( @@ -1055,18 +1123,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.kv_unified = true; } ).set_env("LLAMA_ARG_KV_UNIFIED")); - add_opt(common_arg( - {"--no-context-shift"}, - string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"), - [](common_params & params) { - params.ctx_shift = false; - } - ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); add_opt(common_arg( {"--context-shift"}, - string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"), - [](common_params & params) { - params.ctx_shift = true; + {"--no-context-shift"}, + string_format("whether to use context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"), + [](common_params & params, bool value) { + params.ctx_shift = value; } ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT")); add_opt(common_arg( @@ -1106,20 +1168,22 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION})); add_opt(common_arg( + {"--perf"}, {"--no-perf"}, - string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), - [](common_params & params) { - params.no_perf = true; - params.sampling.no_perf = true; + string_format("whether to enable internal libllama performance timings (default: %s)", params.no_perf ? 
"true" : "false"), + [](common_params & params, bool value) { + params.no_perf = !value; + params.sampling.no_perf = !value; } - ).set_env("LLAMA_ARG_NO_PERF")); + ).set_env("LLAMA_ARG_PERF")); add_opt(common_arg( + {"--show-timings"}, {"--no-show-timings"}, - string_format("disable timing information after each response (default: %s)", params.show_timings ? "true" : "false"), - [](common_params & params) { - params.show_timings = false; + string_format("whether to show timing information after each response (default: %s)", params.show_timings ? "true" : "false"), + [](common_params & params, bool value) { + params.show_timings = value; } - ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_NO_SHOW_TIMINGS")); + ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SHOW_TIMINGS")); add_opt(common_arg( {"-f", "--file"}, "FNAME", "a file containing the prompt (default: none)", @@ -1171,16 +1235,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_excludes({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"-e", "--escape"}, - string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"), - [](common_params & params) { - params.escape = true; - } - )); - add_opt(common_arg( {"--no-escape"}, - "do not process escape sequences", - [](common_params & params) { - params.escape = false; + string_format("whether to process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"), + [](common_params & params, bool value) { + params.escape = value; } )); add_opt(common_arg( @@ -1227,19 +1285,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"-cnv", "--conversation"}, - "run in conversation mode:\n" + {"-no-cnv", "--no-conversation"}, + "whether to run in conversation mode:\n" "- does not print special tokens and suffix/prefix\n" "- interactive mode is also enabled\n" "(default: auto enabled if chat template is available)", - [](common_params & params) { - params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED; - } - ).set_examples({LLAMA_EXAMPLE_COMPLETION})); - add_opt(common_arg( - {"-no-cnv", "--no-conversation"}, - "force disable conversation mode (default: false)", - [](common_params & params) { - params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED; + [](common_params & params, bool value) { + params.conversation_mode = value ? COMMON_CONVERSATION_MODE_ENABLED : COMMON_CONVERSATION_MODE_DISABLED; } ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( @@ -1297,10 +1349,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_COMPLETION})); add_opt(common_arg( + {"--warmup"}, {"--no-warmup"}, - "skip warming up the model with an empty run", - [](common_params & params) { - params.warmup = false; + string_format("whether to perform warmup with an empty run (default: %s)", params.warmup ? 
"enabled" : "disabled"), + [](common_params & params, bool value) { + params.warmup = value; } ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( @@ -1702,19 +1755,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_COMPLETION})); add_opt(common_arg( + {"-kvo", "--kv-offload"}, {"-nkvo", "--no-kv-offload"}, - "disable KV offload", - [](common_params & params) { - params.no_kv_offload = true; + string_format("whether to enable KV cache offloading (default: %s)", params.no_kv_offload ? "disabled" : "enabled"), + [](common_params & params, bool value) { + params.no_kv_offload = !value; } - ).set_env("LLAMA_ARG_NO_KV_OFFLOAD")); + ).set_env("LLAMA_ARG_KV_OFFLOAD")); add_opt(common_arg( + {"--repack"}, {"-nr", "--no-repack"}, - "disable weight repacking", - [](common_params & params) { - params.no_extra_bufts = true; + string_format("whether to enable weight repacking (default: %s)", params.no_extra_bufts ? "disabled" : "enabled"), + [](common_params & params, bool value) { + params.no_extra_bufts = !value; } - ).set_env("LLAMA_ARG_NO_REPACK")); + ).set_env("LLAMA_ARG_REPACK")); add_opt(common_arg( {"--no-host"}, "bypass host buffer allowing extra buffers to be used", @@ -1843,20 +1898,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_PARALLEL})); add_opt(common_arg( {"-cb", "--cont-batching"}, - string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), - [](common_params & params) { - params.cont_batching = true; + {"-nocb", "--no-cont-batching"}, + string_format("whether to enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), + [](common_params & params, bool value) { + params.cont_batching = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING")); add_opt(common_arg( - {"-nocb", "--no-cont-batching"}, - "disable continuous batching", - [](common_params & params) { - params.cont_batching = false; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING")); - add_opt(common_arg( - {"--mmproj"}, "FILE", + {"-mm", "--mmproj"}, "FILE", "path to a multimodal projector file. see tools/mtmd/README.md\n" "note: if -hf is used, this argument can be omitted", [](common_params & params, const std::string & value) { @@ -1864,26 +1913,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ")); add_opt(common_arg( - {"--mmproj-url"}, "URL", + {"-mmu", "--mmproj-url"}, "URL", "URL to a multimodal projector file. see tools/mtmd/README.md", [](common_params & params, const std::string & value) { params.mmproj.url = value; } ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL")); add_opt(common_arg( - {"--no-mmproj"}, - "explicitly disable multimodal projector, useful when using -hf", - [](common_params & params) { - params.no_mmproj = true; + {"--mmproj-auto"}, + {"--no-mmproj", "--no-mmproj-auto"}, + string_format("whether to use multimodal projector file (if available), useful when using -hf (default: %s)", params.no_mmproj ? 
"disabled" : "enabled"), + [](common_params & params, bool value) { + params.no_mmproj = !value; } - ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ")); + ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_AUTO")); add_opt(common_arg( + {"--mmproj-offload"}, {"--no-mmproj-offload"}, - "do not offload multimodal projector to GPU", - [](common_params & params) { - params.mmproj_use_gpu = false; + string_format("whether to enable GPU offloading for multimodal projector (default: %s)", params.mmproj_use_gpu ? "enabled" : "disabled"), + [](common_params & params, bool value) { + params.mmproj_use_gpu = value; } - ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD")); + ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD")); add_opt(common_arg( {"--image", "--audio"}, "FILE", "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n", @@ -1923,12 +1974,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_env("LLAMA_ARG_MLOCK")); add_opt(common_arg( + {"--mmap"}, {"--no-mmap"}, - "do not memory-map model (slower load but may reduce pageouts if not using mlock)", - [](common_params & params) { - params.use_mmap = false; + string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"), + [](common_params & params, bool value) { + params.use_mmap = value; } - ).set_env("LLAMA_ARG_NO_MMAP")); + ).set_env("LLAMA_ARG_MMAP")); add_opt(common_arg( {"--numa"}, "TYPE", "attempt optimizations that help on some NUMA systems\n" @@ -2116,10 +2168,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } )); add_opt(common_arg( + {"--op-offload"}, {"--no-op-offload"}, - string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"), - [](common_params & params) { - params.no_op_offload = true; + string_format("whether to offload host tensor operations to device (default: %s)", params.no_op_offload ? "false" : "true"), + [](common_params & params, bool value) { + params.no_op_offload = !value; } )); add_opt(common_arg( @@ -2315,10 +2368,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( + {"--ppl"}, {"--no-ppl"}, - string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), - [](common_params & params) { - params.compute_ppl = false; + string_format("whether to compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), + [](common_params & params, bool value) { + params.compute_ppl = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( @@ -2437,12 +2491,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX")); add_opt(common_arg( + {"--webui"}, {"--no-webui"}, - string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"), - [](common_params & params) { - params.webui = false; + string_format("whether to enable the Web UI (default: %s)", params.webui ? 
"enabled" : "disabled"), + [](common_params & params, bool value) { + params.webui = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI")); + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI")); add_opt(common_arg( {"--embedding", "--embeddings"}, string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), @@ -2547,18 +2602,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS")); add_opt(common_arg( {"--slots"}, - string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), - [](common_params & params) { - params.endpoint_slots = true; + {"--no-slots"}, + string_format("expose slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), + [](common_params & params, bool value) { + params.endpoint_slots = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS")); - add_opt(common_arg( - {"--no-slots"}, - "disables slots monitoring endpoint", - [](common_params & params) { - params.endpoint_slots = false; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS")); add_opt(common_arg( {"--slot-save-path"}, "PATH", "path to save slot kv cache (default: disabled)", @@ -2609,26 +2658,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX")); add_opt(common_arg( + {"--models-autoload"}, {"--no-models-autoload"}, - "disables automatic loading of models (default: enabled)", - [](common_params & params) { - params.models_autoload = false; + string_format("for router server, whether to automatically load models (default: %s)", params.models_autoload ? "enabled" : "disabled"), + [](common_params & params, bool value) { + params.models_autoload = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_MODELS_AUTOLOAD")); + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_AUTOLOAD")); add_opt(common_arg( {"--jinja"}, - string_format("use jinja template for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"), - [](common_params & params) { - params.use_jinja = true; + {"--no-jinja"}, + string_format("whether to use jinja template engine for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"), + [](common_params & params, bool value) { + params.use_jinja = value; } ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA")); - add_opt(common_arg( - {"--no-jinja"}, - string_format("disable jinja template for chat (default: %s)", params.use_jinja ? 
"disabled" : "enabled"), - [](common_params & params) { - params.use_jinja = false; - } - ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_NO_JINJA")); add_opt(common_arg( {"--reasoning-format"}, "FORMAT", "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n" @@ -2673,15 +2717,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE")); add_opt(common_arg( + {"--prefill-assistant"}, {"--no-prefill-assistant"}, string_format( "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n" "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n" ), - [](common_params & params) { - params.prefill_assistant = false; + [](common_params & params, bool value) { + params.prefill_assistant = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT")); + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PREFILL_ASSISTANT")); add_opt(common_arg( {"-sps", "--slot-prompt-similarity"}, "SIMILARITY", string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), diff --git a/common/arg.h b/common/arg.h index 219c115e63..6db38da488 100644 --- a/common/arg.h +++ b/common/arg.h @@ -16,6 +16,7 @@ struct common_arg { std::set examples = {LLAMA_EXAMPLE_COMMON}; std::set excludes = {}; std::vector args; + std::vector args_neg; // for negated args like --no-xxx const char * value_hint = nullptr; // help text or example for arg value const char * value_hint_2 = nullptr; // for second arg value const char * env = nullptr; @@ -25,6 +26,7 @@ struct common_arg { void (*handler_string) (common_params & params, const std::string &) = nullptr; void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr; void (*handler_int) (common_params & params, int) = nullptr; + void (*handler_bool) (common_params & params, bool) = nullptr; common_arg() = default; @@ -48,6 +50,13 @@ struct common_arg { void (*handler)(common_params & params) ) : args(args), help(help), handler_void(handler) {} + common_arg( + const std::initializer_list & args, + const std::initializer_list & args_neg, + const std::string & help, + void (*handler)(common_params & params, bool) + ) : args(args), args_neg(args_neg), help(help), handler_bool(handler) {} + // support 2 values for arg common_arg( const std::initializer_list & args, @@ -80,6 +89,10 @@ struct common_arg { } return strcmp(args[0], other.args[0]) == 0; } + + // get all args and env vars (including negated args/env) + std::vector get_args() const; + std::vector get_env() const; }; namespace common_arg_utils { diff --git a/common/download.cpp b/common/download.cpp index ab68c53b43..ef87472560 100644 --- a/common/download.cpp +++ b/common/download.cpp @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include #include @@ -472,36 +474,79 @@ std::pair> common_remote_get_content(const std::string & #elif defined(LLAMA_USE_HTTPLIB) -static bool is_output_a_tty() { +class ProgressBar { + static inline std::mutex mutex; + static inline std::map lines; + static inline 
int max_line = 0; + + static void cleanup(const ProgressBar * line) { + lines.erase(line); + if (lines.empty()) { + max_line = 0; + } + } + + static bool is_output_a_tty() { #if defined(_WIN32) - return _isatty(_fileno(stdout)); + return _isatty(_fileno(stdout)); #else - return isatty(1); + return isatty(1); #endif -} - -static void print_progress(size_t current, size_t total) { - if (!is_output_a_tty()) { - return; } - if (!total) { - return; +public: + ProgressBar() = default; + + ~ProgressBar() { + std::lock_guard lock(mutex); + cleanup(this); } - size_t width = 50; - size_t pct = (100 * current) / total; - size_t pos = (width * current) / total; + void update(size_t current, size_t total) { + if (!is_output_a_tty()) { + return; + } - std::cout << "[" - << std::string(pos, '=') - << (pos < width ? ">" : "") - << std::string(width - pos, ' ') - << "] " << std::setw(3) << pct << "% (" - << current / (1024 * 1024) << " MB / " - << total / (1024 * 1024) << " MB)\r"; - std::cout.flush(); -} + if (!total) { + return; + } + + std::lock_guard lock(mutex); + + if (lines.find(this) == lines.end()) { + lines[this] = max_line++; + std::cout << "\n"; + } + int lines_up = max_line - lines[this]; + + size_t width = 50; + size_t pct = (100 * current) / total; + size_t pos = (width * current) / total; + + std::cout << "\033[s"; + + if (lines_up > 0) { + std::cout << "\033[" << lines_up << "A"; + } + std::cout << "\033[2K\r[" + << std::string(pos, '=') + << (pos < width ? ">" : "") + << std::string(width - pos, ' ') + << "] " << std::setw(3) << pct << "% (" + << current / (1024 * 1024) << " MB / " + << total / (1024 * 1024) << " MB) " + << "\033[u"; + + std::cout.flush(); + + if (current == total) { + cleanup(this); + } + } + + ProgressBar(const ProgressBar &) = delete; + ProgressBar & operator=(const ProgressBar &) = delete; +}; static bool common_pull_file(httplib::Client & cli, const std::string & resolve_path, @@ -523,6 +568,7 @@ static bool common_pull_file(httplib::Client & cli, const char * func = __func__; // avoid __func__ inside a lambda size_t downloaded = existing_size; size_t progress_step = 0; + ProgressBar bar; auto res = cli.Get(resolve_path, headers, [&](const httplib::Response &response) { @@ -554,7 +600,7 @@ static bool common_pull_file(httplib::Client & cli, progress_step += len; if (progress_step >= total_size / 1000 || downloaded == total_size) { - print_progress(downloaded, total_size); + bar.update(downloaded, total_size); progress_step = 0; } return true; @@ -562,8 +608,6 @@ static bool common_pull_file(httplib::Client & cli, nullptr ); - std::cout << "\n"; - if (!res) { LOG_ERR("%s: error during download. Status: %d\n", __func__, res ? res->status : -1); return false; diff --git a/common/preset.cpp b/common/preset.cpp index 09ac171b72..729c27f2cf 100644 --- a/common/preset.cpp +++ b/common/preset.cpp @@ -23,8 +23,14 @@ std::vector common_preset::to_args() const { if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) { // flag option, no value if (common_arg_utils::is_falsey(value)) { - // skip the flag - args.pop_back(); + // use negative arg if available + if (!opt.args_neg.empty()) { + args.back() = opt.args_neg.back(); + } else { + // otherwise, skip the flag + // TODO: maybe throw an error instead? 
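                // (editorial illustration, not part of the patch) with paired flags this
                // fallback only triggers for options that lack a negated form; e.g. a
                // hypothetical preset entry "context-shift = off" for the paired
                // {"--context-shift"}/{"--no-context-shift"} option is rewritten to
                // "--no-context-shift" by the branch above instead of being dropped.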
+ args.pop_back(); + } } } if (opt.value_hint != nullptr) { @@ -141,10 +147,10 @@ static std::map> parse_ini_from_ static std::map get_map_key_opt(common_params_context & ctx_params) { std::map mapping; for (const auto & opt : ctx_params.options) { - if (opt.env != nullptr) { - mapping[opt.env] = opt; + for (const auto & env : opt.get_env()) { + mapping[env] = opt; } - for (const auto & arg : opt.args) { + for (const auto & arg : opt.get_args()) { mapping[rm_leading_dashes(arg)] = opt; } } diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 143fd82ee0..80262d141c 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -7370,6 +7370,10 @@ class DeepseekV2Model(TextModel): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) + + # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] + # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul + # ref https://github.com/ggml-org/llama.cpp/pull/17945 self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_scaling["mscale_all_dim"]) self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6)) @@ -10131,6 +10135,10 @@ class MistralMoeModel(DeepseekV2Model): MistralModel.set_mistral_config(self.gguf_writer, self.hparams) yarn_params = self.hparams["yarn"] self.gguf_writer.add_attn_temperature_length(yarn_params["original_max_position_embeddings"]) + + # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] + # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul + # ref https://github.com/ggml-org/llama.cpp/pull/17945 self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1) # mscale_all_dim * 0.1 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): diff --git a/docs/docker.md b/docs/docker.md index 98502a0c50..b9e5015396 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -56,7 +56,7 @@ docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:light -m /model or with a server image: ```bash -docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 +docker run -v /path/to/models:/models -p 8080:8080 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512 ``` ## Docker With CUDA @@ -91,7 +91,7 @@ After building locally, Usage is similar to the non-CUDA examples, but you'll ne ```bash docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 -docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1 +docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512 --n-gpu-layers 1 ``` ## Docker With MUSA @@ -125,5 +125,5 @@ After building locally, Usage is similar to the non-MUSA examples, but you'll ne ```bash docker run -v /path/to/models:/models local/llama.cpp:full-musa 
--run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 docker run -v /path/to/models:/models local/llama.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 -docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1 +docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512 --n-gpu-layers 1 ``` diff --git a/examples/gen-docs/gen-docs.cpp b/examples/gen-docs/gen-docs.cpp index 420195f198..e9f7bf9313 100644 --- a/examples/gen-docs/gen-docs.cpp +++ b/examples/gen-docs/gen-docs.cpp @@ -14,12 +14,13 @@ static void write_table_header(std::ofstream & file) { static void write_table_entry(std::ofstream & file, const common_arg & opt) { file << "| `"; // args - for (const auto & arg : opt.args) { - if (arg == opt.args.front()) { + auto all_args = opt.get_args(); + for (const auto & arg : all_args) { + if (arg == all_args.front()) { file << arg; - if (opt.args.size() > 1) file << ", "; + if (all_args.size() > 1) file << ", "; } else { - file << arg << (arg != opt.args.back() ? ", " : ""); + file << arg << (arg != all_args.back() ? ", " : ""); } } // value hint diff --git a/examples/model-conversion/scripts/causal/compare-logits.py b/examples/model-conversion/scripts/causal/compare-logits.py index afa0d5b263..894302c69e 100755 --- a/examples/model-conversion/scripts/causal/compare-logits.py +++ b/examples/model-conversion/scripts/causal/compare-logits.py @@ -1,10 +1,13 @@ #!/usr/bin/env python3 -import numpy as np import sys -import os +import numpy as np from pathlib import Path +# Add utils directory to path for direct script execution +sys.path.insert(0, str(Path(__file__).parent.parent / "utils")) +from common import get_model_name_from_env_path # type: ignore[import-not-found] + def quick_logits_check(pytorch_file, llamacpp_file): """Lightweight sanity check before NMSE""" @@ -32,27 +35,16 @@ def quick_logits_check(pytorch_file, llamacpp_file): print(f"Top 10 llama.cpp logits: {llamacpp_logits[llamacpp_top10]}") print(f"Max absolute difference: {max_diff:.4f}") - if max_diff > 1.0: - print(f"❌ NOK: Large differences detected - max diff: {max_diff:.4f}") - return False - return True def main(): - model_path = os.getenv('MODEL_PATH') - if not model_path: - print("Error: MODEL_PATH environment variable not set") - sys.exit(1) - - if not os.path.exists(model_path): - print(f"Error: Model file not found: {model_path}") - sys.exit(1) - - model_name = os.path.basename(model_path) + model_name = get_model_name_from_env_path('MODEL_PATH') data_dir = Path("data") - pytorch_file = data_dir / f"pytorch-{model_name}.bin" - llamacpp_file = data_dir / f"llamacpp-{model_name}.bin" + + llamacpp_model_name = get_model_name_from_env_path('CONVERTED_MODEL') + print(f"Using converted model: {llamacpp_model_name}") + llamacpp_file = data_dir / f"llamacpp-{llamacpp_model_name}.bin" if not pytorch_file.exists(): print(f"Error: PyTorch logits file not found: {pytorch_file}") diff --git a/examples/model-conversion/scripts/utils/__init__.py b/examples/model-conversion/scripts/utils/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/model-conversion/scripts/utils/check-nmse.py b/examples/model-conversion/scripts/utils/check-nmse.py index 939e3153cc..83f63f9ff3 100755 --- 
a/examples/model-conversion/scripts/utils/check-nmse.py +++ b/examples/model-conversion/scripts/utils/check-nmse.py @@ -5,6 +5,7 @@ import sys import os import argparse from pathlib import Path +from common import get_model_name_from_env_path # type: ignore[import-not-found] def calculate_nmse(reference, test): mse = np.mean((test - reference) ** 2) @@ -67,11 +68,13 @@ def main(): parser.add_argument('-m', '--model-path', required=True, help='Path to the model directory') args = parser.parse_args() - model_name = os.path.basename(args.model_path) + model_name = get_model_name_from_env_path('MODEL_PATH') data_dir = Path("data") pytorch_file = data_dir / f"pytorch-{model_name}.bin" - llamacpp_file = data_dir / f"llamacpp-{model_name}.bin" + + llamacpp_model_name = get_model_name_from_env_path('CONVERTED_MODEL') + llamacpp_file = data_dir / f"llamacpp-{llamacpp_model_name}.bin" print(f"Model name: {model_name}") print(f"PyTorch logits file: {pytorch_file}") diff --git a/examples/model-conversion/scripts/utils/common.py b/examples/model-conversion/scripts/utils/common.py new file mode 100644 index 0000000000..945f9a1a1d --- /dev/null +++ b/examples/model-conversion/scripts/utils/common.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 + +import os +import sys + +def get_model_name_from_env_path(env_path_name): + model_path = os.getenv(env_path_name) + if not model_path: + print(f"Error: {env_path_name} environment variable not set") + sys.exit(1) + + if not os.path.exists(model_path): + print(f"Error: Model file not found: {model_path}") + sys.exit(1) + + name = os.path.basename(os.path.normpath(model_path)) + if name.endswith(".gguf"): + name = name[:-5] + + return name diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp index a8e53f28eb..0d11d0f803 100644 --- a/examples/speculative-simple/speculative-simple.cpp +++ b/examples/speculative-simple/speculative-simple.cpp @@ -255,6 +255,8 @@ int main(int argc, char ** argv) { LOG_INF("target:\n\n"); common_perf_print(ctx_tgt, smpl); + llama_batch_free(batch_tgt); + common_sampler_free(smpl); common_speculative_free(spec); diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index 9edd485136..4f3b99c8d0 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -99,6 +99,7 @@ extern "C" { GGML_BACKEND_API int ggml_cpu_has_sme (void); // other GGML_BACKEND_API int ggml_cpu_has_riscv_v (void); + GGML_BACKEND_API int ggml_cpu_get_rvv_vlen (void); // risc-v vector length in bytes GGML_BACKEND_API int ggml_cpu_has_vsx (void); GGML_BACKEND_API int ggml_cpu_has_vxe (void); GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void); diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c index a5995fdc2c..ec16cbda9f 100644 --- a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -312,16 +312,9 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al } // this is a very naive implementation, but for our case the number of free blocks should be very small -static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size, const struct ggml_tensor * tensor) { +static void ggml_dyn_tallocr_free_bytes(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size) { size = aligned_offset(NULL, size, alloc->alignment); - AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n", - __func__, tensor->name, addr.chunk, addr.offset, size, 
alloc->chunks[addr.chunk]->n_free_blocks); - -#ifdef GGML_ALLOCATOR_DEBUG - remove_allocated_tensor(alloc, addr, tensor); -#endif - struct tallocr_chunk * chunk = alloc->chunks[addr.chunk]; // see if we can merge with an existing block @@ -357,8 +350,6 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct } // otherwise, add a new block ggml_dyn_tallocr_insert_block(chunk, addr.offset, size); - - GGML_UNUSED(tensor); } static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) { @@ -616,13 +607,17 @@ static void ggml_gallocr_free_extra_space(ggml_gallocr_t galloc, struct ggml_ten GGML_ASSERT(parent_size >= node_size); + // note: we want after the freeing the chunks to continue to be aligned + struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id]; + parent_size = aligned_offset(NULL, parent_size, p_alloc->alignment); + node_size = aligned_offset(NULL, node_size, p_alloc->alignment); + if (parent_size > node_size) { - struct ggml_dyn_tallocr * p_alloc = galloc->buf_tallocs[p_hn->buffer_id]; struct buffer_address p_addr = p_hn->addr; p_addr.offset += node_size; size_t extra_size = parent_size - node_size; AT_PRINTF("freeing extra %zu bytes from parent %s for %s\n", extra_size, parent->name, node->name); - ggml_dyn_tallocr_free_tensor(p_alloc, p_addr, extra_size, parent); + ggml_dyn_tallocr_free_bytes(p_alloc, p_addr, extra_size); } } @@ -706,7 +701,14 @@ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * n struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id]; ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id]; size_t size = ggml_backend_buft_get_alloc_size(buft, node); - ggml_dyn_tallocr_free_tensor(alloc, hn->addr, size, node); + + AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n", + __func__, node->name, hn->addr.chunk, hn->addr.offset, size, alloc->chunks[hn->addr.chunk]->n_free_blocks); +#ifdef GGML_ALLOCATOR_DEBUG + remove_allocated_tensor(alloc, hn->addr, node); +#endif + + ggml_dyn_tallocr_free_bytes(alloc, hn->addr, size); hn->allocated = false; } diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 81288464c7..da624c587c 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -2548,6 +2548,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten case GGML_OP_ARGSORT: case GGML_OP_ACC: case GGML_OP_GROUP_NORM: + return true; case GGML_OP_PAD: // TODO: add circular padding support for cann, see https://github.com/ggml-org/llama.cpp/pull/16985 return ggml_get_op_params_i32(op, 8) == 0; diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index c47511adcb..a59b518938 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -81,6 +81,11 @@ struct ggml_arm_arch_features_type { } ggml_arm_arch_features = { 0 }; #endif +#if defined(__riscv) +struct ggml_riscv_arch_features_type { + int rvv_vlen; +} ggml_riscv_arch_features = { 0 }; +#endif #if defined(_WIN32) @@ -703,6 +708,15 @@ static void ggml_init_arm_arch_features(void) {} #endif #endif // __ARM_ARCH +#if defined(__riscv) && defined(__riscv_v_intrinsic) +#include +static void ggml_init_riscv_arch_features(void) { + ggml_riscv_arch_features.rvv_vlen = __riscv_vlenb(); +} +#else +static void ggml_init_riscv_arch_features(void) {} +#endif + struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) { GGML_ASSERT(!ggml_get_no_alloc(ctx)); @@ 
-3459,6 +3473,14 @@ int ggml_cpu_has_riscv_v(void) { #endif } +int ggml_cpu_get_rvv_vlen(void) { +#if defined(__riscv) && defined(__riscv_v_intrinsic) + return ggml_riscv_arch_features.rvv_vlen; +#else + return 0; +#endif +} + int ggml_cpu_has_f16c(void) { #if defined(__F16C__) return 1; @@ -3625,6 +3647,10 @@ void ggml_cpu_init(void) { ggml_init_arm_arch_features(); #endif +#if defined(__riscv) + ggml_init_riscv_arch_features(); +#endif + is_first_call = false; } diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index 3191faaa4c..f4713a4218 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -583,6 +583,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r if (ggml_cpu_has_riscv_v()) { features.push_back({ "RISCV_V", "1" }); } + if (ggml_cpu_get_rvv_vlen() > 0) { + static std::string rvv_vlen = std::to_string(ggml_cpu_get_rvv_vlen()); + features.push_back({ "RVV_VLEN", rvv_vlen.c_str() }); + } if (ggml_cpu_has_vsx()) { features.push_back({ "VSX", "1" }); } diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index 9f0d449bd6..b70ea7d78b 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -2169,7 +2169,8 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons static const ggml::cpu::repack::tensor_traits iq4_nl_8x8_q8_0; if (cur->type == GGML_TYPE_Q4_0) { - if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) { + if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) + || (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) { if (cur->ne[1] % 8 == 0) { return &q4_0_8x8_q8_0; } diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index c4529f5d94..9fcb2f9fd2 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -67,19 +67,22 @@ #define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000 #define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a #define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA +#define GGML_CUDA_CC_RDNA3_5 (GGML_CUDA_CC_OFFSET_AMD + 0x1150) // AI 370, AI Max 395 laptops. 
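// (editorial note, not part of the patch) classification example: a device such as
// gfx1151 (cc = GGML_CUDA_CC_OFFSET_AMD + 0x1151, the "AI Max" laptop APUs) falls in
// [RDNA3_5, RDNA4) and so matches GGML_CUDA_CC_IS_RDNA3_5 and GGML_CUDA_CC_IS_RDNA3
// below, while gfx1100 (cc = offset + 0x1100) only matches GGML_CUDA_CC_IS_RDNA3_0.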
#define GGML_CUDA_CC_RDNA4 (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000 -#define GGML_CUDA_CC_IS_AMD(cc) (cc >= GGML_CUDA_CC_OFFSET_AMD) -#define GGML_CUDA_CC_IS_RDNA(cc) (cc >= GGML_CUDA_CC_RDNA1) -#define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2) -#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3) -#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4) -#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4) -#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1) -#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1) -#define GGML_CUDA_CC_IS_CDNA1(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_CDNA2) -#define GGML_CUDA_CC_IS_CDNA2(cc) (cc >= GGML_CUDA_CC_CDNA2 && cc < GGML_CUDA_CC_CDNA3) -#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1) +#define GGML_CUDA_CC_IS_AMD(cc) (cc >= GGML_CUDA_CC_OFFSET_AMD) +#define GGML_CUDA_CC_IS_RDNA(cc) (cc >= GGML_CUDA_CC_RDNA1) +#define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2) +#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3) +#define GGML_CUDA_CC_IS_RDNA3_0(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA3_5) +#define GGML_CUDA_CC_IS_RDNA3_5(cc) (cc >= GGML_CUDA_CC_RDNA3_5 && cc < GGML_CUDA_CC_RDNA4) +#define GGML_CUDA_CC_IS_RDNA3(cc) (GGML_CUDA_CC_IS_RDNA3_0(cc) || GGML_CUDA_CC_IS_RDNA3_5(cc)) +#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4) +#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1) +#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1) +#define GGML_CUDA_CC_IS_CDNA1(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_CDNA2) +#define GGML_CUDA_CC_IS_CDNA2(cc) (cc >= GGML_CUDA_CC_CDNA2 && cc < GGML_CUDA_CC_CDNA3) +#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1) // Moore Threads #define MUSART_HMASK 40300 // MUSA rc4.3, min. ver. for half2 -> uint mask comparisons diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index 2750117aa9..8dc82a9d3b 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -642,8 +642,8 @@ static __global__ void flash_attn_stream_k_fixup( const int iter_k = (ne11 + (nbatch_fa - 1)) / nbatch_fa; const int iter_j = (ne01 + (ncols1 - 1)) / ncols1; - const int kbc0 = (bidx0 + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; - const int kbc0_stop = (bidx0 + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; + const int kbc0 = int64_t(bidx0 + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; + const int kbc0_stop = int64_t(bidx0 + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; const bool did_not_have_any_data = kbc0 == kbc0_stop; const bool wrote_beginning_of_tile = kbc0 % iter_k == 0; @@ -679,7 +679,7 @@ static __global__ void flash_attn_stream_k_fixup( int bidx = bidx0 - 1; int kbc_stop = kbc0; while(true) { - const int kbc = bidx*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; + const int kbc = int64_t(bidx)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; if (kbc == kbc_stop) { // Did not have any data. 
bidx--; kbc_stop = kbc; diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh index d51537f7d0..7bd1044c19 100644 --- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh @@ -1380,8 +1380,8 @@ static __global__ void flash_attn_ext_f16( const int iter_j = (ne01.z + (ncols1 - 1)) / ncols1; // kbc == k block continuous, current index in continuous ijk space. - int kbc = (blockIdx.x + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; - const int kbc_stop = (blockIdx.x + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; + int kbc = int64_t(blockIdx.x + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; + const int kbc_stop = int64_t(blockIdx.x + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; // If the seams of 2 CUDA blocks fall within an output tile their results need to be combined. // For this we need to track both the block that starts the tile (needs_fixup) and the block that finishes the tile (is_fixup). @@ -1401,7 +1401,7 @@ static __global__ void flash_attn_ext_f16( const float2 * Q_f2 = (const float2 *) (Q + nb03*sequence + nb02* head0); const half2 * K_h2 = (const half2 *) (K + nb13*sequence + nb12*(head0 / gqa_ratio)); const half * mask_h = ncols2 == 1 && !mask ? nullptr : - (const half *) (mask + nb33*(sequence % ne33)); + (const half *) (mask + nb33*(sequence % ne33)); float2 * dstk = ((float2 *) dst) + (sequence*ne01.z*ne02 + head0) * (DV/2); const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio)); diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 8d17bc669a..ab0f6fe9ce 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -4630,9 +4630,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_CUMSUM: case GGML_OP_TRI: case GGML_OP_DIAG: - return true; case GGML_OP_SOLVE_TRI: - return op->src[0]->ne[0] <= 64 && op->src[1]->ne[0] <= 32; + return true; + default: return false; } diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index 0b13293da9..dcfa40f4d5 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh @@ -189,6 +189,9 @@ namespace ggml_cuda_mma { return 8 * (threadIdx.x / 16) + l; #elif defined(RDNA3) return 2 * l + (threadIdx.x / 16); +#else + NO_DEVICE_CODE; + return -1; #endif // defined(RDNA4) } else { NO_DEVICE_CODE; @@ -290,8 +293,12 @@ namespace ggml_cuda_mma { } } #elif defined(AMD_WMMA_AVAILABLE) - +#if defined(RDNA3) + // RDNA3 has duplicated data as input. + static constexpr int ne = I * J / 32 * 2; +#else static constexpr int ne = I * J / 32; +#endif // defined(RDNA3) half2 x[ne] = {{0.0f, 0.0f}}; static constexpr __device__ bool supported() { @@ -310,7 +317,14 @@ namespace ggml_cuda_mma { static __device__ __forceinline__ int get_j(const int l) { if constexpr (I == 16 && J == 8) { +#if defined(RDNA4) return 4 * (threadIdx.x / 16) + l; +#elif defined(RDNA3) + return l; +#else + NO_DEVICE_CODE; + return -1; +#endif // defined(RDNA4) } else { NO_DEVICE_CODE; return -1; @@ -366,11 +380,16 @@ namespace ggml_cuda_mma { static constexpr int I = I_; static constexpr int J = J_; static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR; - static constexpr int ne = I * J / WARP_SIZE; - - nv_bfloat162 x[ne] = {{0.0f, 0.0f}}; #if defined(AMD_WMMA_AVAILABLE) +#if defined(RDNA3) + // RDNA3 has duplicated data as input. 
+ static constexpr int ne = I * J / 32 * 2; +#else + static constexpr int ne = I * J / 32; +#endif // defined(RDNA3) + nv_bfloat162 x[ne] = {{0.0f, 0.0f}}; + static constexpr __device__ bool supported() { if (I == 16 && J == 8) return true; return false; @@ -387,13 +406,23 @@ namespace ggml_cuda_mma { static __device__ __forceinline__ int get_j(const int l) { if constexpr (I == 16 && J == 8) { +#if defined(RDNA4) return 4 * (threadIdx.x / 16) + l; +#elif defined(RDNA3) + return l; +#else + NO_DEVICE_CODE; + return -1; +#endif // defined(RDNA4) } else { NO_DEVICE_CODE; return -1; } } #else + static constexpr int ne = I * J / WARP_SIZE; + nv_bfloat162 x[ne] = {{0.0f, 0.0f}}; + static constexpr __device__ bool supported() { if (I == 8 && J == 8) return true; if (I == 16 && J == 4) return true; @@ -546,8 +575,14 @@ namespace ggml_cuda_mma { } #elif defined(AMD_WMMA_AVAILABLE) if constexpr (std::is_same_v || std::is_same_v) { - ggml_cuda_memcpy_1(t.x, xs0 + t.get_i(0) * stride + t.get_j(0)); - +#if defined(RDNA4) + ggml_cuda_memcpy_1(t.x, xs0 + t.get_i(0) * stride + t.get_j(0)); +#elif defined(RDNA3) + ggml_cuda_memcpy_1(t.x, xs0 + t.get_i(0) * stride + t.get_j(0)); + ggml_cuda_memcpy_1(t.x + t.ne/2, xs0 + t.get_i(0) * stride + t.get_j(t.ne/2)); +#else + NO_DEVICE_CODE; +#endif // defined(RDNA4) } else if constexpr (std::is_same_v) { if constexpr (I == 16 && J == 4) { int64_t * xi = (int64_t *) t.x; @@ -888,6 +923,16 @@ namespace ggml_cuda_mma { const halfx8_t& a_frag = reinterpret_cast(A.x[0]); const halfx8_t& b_frag = reinterpret_cast(B.x[0]); acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a_frag, b_frag, acc_frag); +#elif defined(RDNA3) + using halfx16_t = __attribute__((ext_vector_type(16))) _Float16; + using floatx8_t = __attribute__((ext_vector_type(8))) float; + floatx8_t& acc_frag = reinterpret_cast(D.x[0]); + const halfx16_t& a_frag = reinterpret_cast(A.x[0]); + const halfx16_t& b_frag = reinterpret_cast(B.x[0]); + acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a_frag, b_frag, acc_frag); +#else + GGML_UNUSED_VARS(D, A, B); + NO_DEVICE_CODE; #endif // RDNA4 #else GGML_UNUSED_VARS(D, A, B); @@ -905,6 +950,16 @@ namespace ggml_cuda_mma { const bf16x8_t& a_frag = reinterpret_cast(A.x[0]); const bf16x8_t& b_frag = reinterpret_cast(B.x[0]); acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a_frag, b_frag, acc_frag); +#elif defined(RDNA3) + using bf16x16_t = __attribute__((ext_vector_type(16))) __bf16; + using floatx8_t = __attribute__((ext_vector_type(8))) float; + floatx8_t& acc_frag = reinterpret_cast(D.x[0]); + const bf16x16_t& a_frag = reinterpret_cast(A.x[0]); + const bf16x16_t& b_frag = reinterpret_cast(B.x[0]); + acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32(a_frag, b_frag, acc_frag); +#else + GGML_UNUSED_VARS(D, A, B); + NO_DEVICE_CODE; #endif // RDNA4 #else GGML_UNUSED_VARS(D, A, B); diff --git a/ggml/src/ggml-cuda/mmf.cu b/ggml/src/ggml-cuda/mmf.cu index 7cf33f0ddf..6643f243b1 100644 --- a/ggml/src/ggml-cuda/mmf.cu +++ b/ggml/src/ggml-cuda/mmf.cu @@ -151,7 +151,9 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const return false; } } else { - if (src1_ncols > 16) { + if (GGML_CUDA_CC_IS_RDNA3_0(cc) && src1_ncols > 8) { + return false; + } else if (src1_ncols > 16) { return false; } } @@ -160,9 +162,9 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const case GGML_TYPE_F32: return ampere_mma_available(cc); case GGML_TYPE_F16: - return volta_mma_available(cc) || turing_mma_available(cc) 
|| (amd_wmma_available(cc) && GGML_CUDA_CC_IS_RDNA4(cc)); + return volta_mma_available(cc) || turing_mma_available(cc) || amd_wmma_available(cc); case GGML_TYPE_BF16: - return ampere_mma_available(cc) || (amd_wmma_available(cc) && GGML_CUDA_CC_IS_RDNA4(cc)); + return ampere_mma_available(cc) || amd_wmma_available(cc); default: return false; } diff --git a/ggml/src/ggml-cuda/mmvf.cu b/ggml/src/ggml-cuda/mmvf.cu index 6238ce7ebd..32948e4d7a 100644 --- a/ggml/src/ggml-cuda/mmvf.cu +++ b/ggml/src/ggml-cuda/mmvf.cu @@ -765,7 +765,10 @@ bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0 return ne11 <= 8; } else if (GGML_CUDA_CC_IS_AMD(cc)) { if (fp16_mma_hardware_available(cc)) { - if (GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) { + if (GGML_CUDA_CC_IS_RDNA3(cc)) { + return ne11 <= 3; + } + if (GGML_CUDA_CC_IS_RDNA4(cc)) { return ne11 <= 5; } return ne11 <= 2; diff --git a/ggml/src/ggml-cuda/solve_tri.cu b/ggml/src/ggml-cuda/solve_tri.cu index e161d4dc43..177ffc268f 100644 --- a/ggml/src/ggml-cuda/solve_tri.cu +++ b/ggml/src/ggml-cuda/solve_tri.cu @@ -3,6 +3,80 @@ #include "solve_tri.cuh" #define MAX_N_FAST 64 +#define MAX_K_FAST 32 + +static __global__ void get_batch_pointers(const float * A, + float * X, + const float ** A_ptrs, + float ** X_ptrs, + int64_t ne02, + int64_t total_batches, + size_t s02, + size_t s03, + size_t s2, + size_t s3) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= total_batches) { + return; + } + + const int64_t i3 = idx / ne02; + const int64_t i2 = idx % ne02; + + A_ptrs[idx] = A + i3 * s03 + i2 * s02; + X_ptrs[idx] = X + i3 * s3 + i2 * s2; +} + +static void solve_tri_f32_cublas(ggml_backend_cuda_context & ctx, + const float * A, + const float * B, + float * X, + int n, + int k, + int64_t ne02, + int64_t ne03, + size_t s02, + size_t s03, + size_t s12, + size_t s13, + size_t s2, + size_t s3, + cudaStream_t stream) { + const float alpha = 1.0f; + const int64_t total_batches = ne02 * ne03; + if (total_batches == 0) { + return; + } + + // Bulk copy B -> X (contiguous tensors) + if (X != B) { + const int64_t total_elements_BX = n * k * total_batches; + CUDA_CHECK(cudaMemcpyAsync(X, B, total_elements_BX * sizeof(float), cudaMemcpyDeviceToDevice, stream)); + } + + const int id = ggml_cuda_get_device(); + + ggml_cuda_pool_alloc A_ptrs_alloc(ctx.pool(id), total_batches); + ggml_cuda_pool_alloc X_ptrs_alloc(ctx.pool(id), total_batches); + + const float ** A_ptrs_dev = A_ptrs_alloc.get(); + float ** X_ptrs_dev = X_ptrs_alloc.get(); + + get_batch_pointers<<<(total_batches + 255) / 256, 256, 0, stream>>>(A, X, A_ptrs_dev, X_ptrs_dev, ne02, + total_batches, s02, s03, s2, s3); + + CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream)); + + // Yes, this is necessary, without this we get RMSE errors + CUBLAS_CHECK(cublasSetMathMode(ctx.cublas_handle(id), CUBLAS_DEFAULT_MATH)); + CUBLAS_CHECK(cublasStrsmBatched(ctx.cublas_handle(id), CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, + CUBLAS_DIAG_NON_UNIT, k, n, &alpha, A_ptrs_dev, n, X_ptrs_dev, k, total_batches)); + + // revert to standard mode from common.cuh + CUBLAS_CHECK(cublasSetMathMode(ctx.cublas_handle(id), CUBLAS_TF32_TENSOR_OP_MATH)); + + GGML_UNUSED_VARS(s12, s13); +} // ====================== // Fast Kernel (n <= 64, k <= 32) - Warp-based parallel reduction @@ -63,7 +137,7 @@ static __global__ void solve_tri_f32_fast(const float * __restrict__ A, float x_low = (lane < n) ? 
B_batch[lane * k + col_idx] : 0.0f; float x_high = (WARP_SIZE + lane < n) ? B_batch[(WARP_SIZE + lane) * k + col_idx] : 0.0f; - const int half = WARP_SIZE; + const int half = WARP_SIZE; const int nrows_low = (n < half) ? n : half; #pragma unroll @@ -81,8 +155,8 @@ static __global__ void solve_tri_f32_fast(const float * __restrict__ A, #pragma unroll for (int row = half; row < n; ++row) { - float sum = sA[row * n + lane] * x_low; - const int j = half + lane; + float sum = sA[row * n + lane] * x_low; + const int j = half + lane; if (j < row) { sum += sA[row * n + j] * x_high; } @@ -97,7 +171,7 @@ static __global__ void solve_tri_f32_fast(const float * __restrict__ A, for (int rr = 0; rr < 2; ++rr) { const int row = rr * WARP_SIZE + lane; if (row < n) { - const float val = (row < half) ? x_low : x_high; + const float val = (row < half) ? x_low : x_high; X_batch[row * k + col_idx] = val; } } @@ -176,20 +250,26 @@ static void solve_tri_f32_cuda(const float * A, } void ggml_cuda_op_solve_tri(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * src0 = dst->src[0]; // A (triangular n x x matrix) - const ggml_tensor * src1 = dst->src[1]; // B (right hand side of n x k equation columns) + const ggml_tensor * src0 = dst->src[0]; // A (n×n, lower triangular) + const ggml_tensor * src1 = dst->src[1]; // B (n×k) ggml_is_contiguous(src0); ggml_is_contiguous(src1); - const int64_t n = src0->ne[0]; - const int64_t k = src1->ne[0]; + const int64_t n = src0->ne[0]; + const int64_t k = src1->ne[0]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; - GGML_ASSERT(n <= 64); - GGML_ASSERT(k <= 32); - - solve_tri_f32_cuda((const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k, src0->ne[2], - src0->ne[3], src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float), - src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float), - dst->nb[3] / sizeof(float), ctx.stream()); + if (n <= MAX_N_FAST && k <= MAX_K_FAST) { + solve_tri_f32_cuda((const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k, + src0->ne[2], src0->ne[3], src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float), + src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float), + dst->nb[3] / sizeof(float), ctx.stream()); + } else { + solve_tri_f32_cublas(ctx, (const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k, + ne02, ne03, src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float), + src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float), + dst->nb[3] / sizeof(float), ctx.stream()); + } } diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h index b7d6edf7fc..951a88d567 100644 --- a/ggml/src/ggml-cuda/vendors/hip.h +++ b/ggml/src/ggml-cuda/vendors/hip.h @@ -19,6 +19,9 @@ #define CUDA_R_16F HIPBLAS_R_16F #define CUDA_R_16BF HIPBLAS_R_16B #define CUDA_R_32F HIPBLAS_R_32F +#define CUBLAS_SIDE_RIGHT HIPBLAS_SIDE_RIGHT +#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER +#define CUBLAS_DIAG_NON_UNIT HIPBLAS_DIAG_NON_UNIT #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported #define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended #define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned @@ -30,6 +33,7 @@ #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) #define __all_sync(mask, var) __all(var) #define 
__any_sync(mask, var) __any(var) +#define cublasStrsmBatched hipblasStrsmBatched #define cublasCreate hipblasCreate #define cublasDestroy hipblasDestroy #define cublasGemmEx hipblasGemmEx diff --git a/ggml/src/ggml-cuda/vendors/musa.h b/ggml/src/ggml-cuda/vendors/musa.h index 8c55a2e4e5..221e67f96a 100644 --- a/ggml/src/ggml-cuda/vendors/musa.h +++ b/ggml/src/ggml-cuda/vendors/musa.h @@ -12,11 +12,16 @@ #define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT #define CUBLAS_OP_N MUBLAS_OP_N #define CUBLAS_OP_T MUBLAS_OP_T +#define CUBLAS_DEFAULT_MATH MUBLAS_DEFAULT_MATH +#define CUBLAS_SIDE_RIGHT MUBLAS_SIDE_RIGHT +#define CUBLAS_FILL_MODE_UPPER MUBLAS_FILL_MODE_UPPER +#define CUBLAS_DIAG_NON_UNIT MUBLAS_DIAG_NON_UNIT #define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS #define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_TENSOR_OP_MATH #define CUDA_R_16F MUSA_R_16F #define CUDA_R_16BF MUSA_R_16BF #define CUDA_R_32F MUSA_R_32F +#define cublasStrsmBatched mublasStrsmBatched #define cublasComputeType_t cudaDataType_t #define cublasCreate mublasCreate #define cublasDestroy mublasDestroy diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index c6f5809ccd..34ec09d403 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -659,6 +659,7 @@ struct vk_device_struct { vk_pipeline pipeline_cos_f32; vk_pipeline pipeline_log[2]; vk_pipeline pipeline_tri[2]; + vk_pipeline pipeline_diag[2]; vk_pipeline pipeline_clamp_f32; vk_pipeline pipeline_pad_f32; vk_pipeline pipeline_roll_f32; @@ -722,6 +723,11 @@ struct vk_device_struct { vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16; vk_pipeline pipeline_soft_max_f32_wg512, pipeline_soft_max_f32_f16_wg512; vk_pipeline pipeline_soft_max_back_f32; + + vk_pipeline pipeline_soft_max_large1_f32, pipeline_soft_max_large1_f32_f16; + vk_pipeline pipeline_soft_max_large2_f32, pipeline_soft_max_large2_f32_f16; + vk_pipeline pipeline_soft_max_large3_f32, pipeline_soft_max_large3_f32_f16; + vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16, pipeline_rope_norm_f32_f16; vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16, pipeline_rope_neox_f32_f16; vk_pipeline pipeline_rope_multi_f32, pipeline_rope_multi_f16; @@ -757,7 +763,8 @@ struct vk_device_struct { vk_pipeline pipeline_flash_attn_split_k_reduce; - vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][TOPK_MOE_COUNT]; + // [2] is for whether to take n_experts from spec constant (0) or push constant (1) + vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][TOPK_MOE_COUNT][2]; std::vector all_pipelines; @@ -1149,6 +1156,7 @@ static_assert(sizeof(vk_op_multi_add_push_constants) <= 256); struct vk_op_topk_moe_push_constants { uint32_t n_rows; + uint32_t n_experts_push; uint32_t n_expert_used; float clamp_min; float clamp_max; @@ -3730,6 +3738,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_XS], "get_rows_iq4_xs", get_rows_iq4_xs_len, get_rows_iq4_xs_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl", get_rows_iq4_nl_len, get_rows_iq4_nl_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_MXFP4], "get_rows_mxfp4", get_rows_mxfp4_len, get_rows_mxfp4_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + 
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_I32], "get_rows_i32", get_rows_i32_len, get_rows_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); @@ -3917,6 +3926,9 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_tri[0], "tri_f32", tri_f32_len, tri_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_tri[1], "tri_f16", tri_f16_len, tri_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_diag[0], "diag_f32", diag_f32_len, diag_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_diag[1], "diag_f16", diag_f16_len, diag_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_pad_push_constants), {512, 1, 1}, {}, 1); @@ -3996,6 +4008,13 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16_wg512, "soft_max_f32_f16_wg512", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1); ggml_vk_create_pipeline(device, device->pipeline_soft_max_back_f32, "soft_max_back_f32", soft_max_back_f32_len, soft_max_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_large1_f32, "soft_max_large1_f32", soft_max_large1_f32_len, soft_max_large1_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_large2_f32, "soft_max_large2_f32", soft_max_large2_f32_len, soft_max_large2_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_large3_f32, "soft_max_large3_f32", soft_max_large3_f32_len, soft_max_large3_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_large1_f32_f16, "soft_max_large1_f32_f16", soft_max_large1_f32_f16_len, soft_max_large1_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_large2_f32_f16, "soft_max_large2_f32_f16", soft_max_large2_f32_f16_len, soft_max_large2_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_large3_f32_f16, "soft_max_large3_f32_f16", soft_max_large3_f32_f16_len, 
soft_max_large3_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f32, "rope_multi_f32", rope_multi_f32_len, rope_multi_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); @@ -4204,10 +4223,12 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f16_f32, "conv2d_dw_whcn_f16_f32", conv2d_dw_whcn_f16_f32_len, conv2d_dw_whcn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f16_f32, "conv2d_dw_cwhn_f16_f32", conv2d_dw_cwhn_f16_f32_len, conv2d_dw_cwhn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1); - for (uint32_t i = 0; i < num_topk_moe_pipelines; ++i) { - ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX], "topk_moe_f32_early_softmax_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<subgroup_size); - ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX_NORM], "topk_moe_f32_early_softmax_norm"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<subgroup_size); - ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_LATE_SOFTMAX], "topk_moe_f32_late_softmax"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<subgroup_size); + for (uint32_t use_push = 0; use_push < 2; ++use_push) { + for (uint32_t i = 0; i < num_topk_moe_pipelines; ++i) { + ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX][use_push], "topk_moe_f32_early_softmax_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<subgroup_size); + ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX_NORM][use_push], "topk_moe_f32_early_softmax_norm"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<subgroup_size); + ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_LATE_SOFTMAX][use_push], "topk_moe_f32_late_softmax"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<subgroup_size); + } } for (auto &c : compiles) { @@ -8274,6 +8295,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const switch (op) { case GGML_OP_GET_ROWS: GGML_ASSERT(src1->type == GGML_TYPE_I32); + if (src0->type == GGML_TYPE_I32) { + // i32 src only supports i32 result + GGML_ASSERT(dst->type == GGML_TYPE_I32); + return ctx->device->pipeline_get_rows[src0->type]; + } if (dst->type == GGML_TYPE_F16) { return 
ctx->device->pipeline_get_rows[src0->type]; } @@ -8400,6 +8426,12 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_tri[dst->type == GGML_TYPE_F16]; } return nullptr; + case GGML_OP_DIAG: + if (src0->type == dst->type && + (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) { + return ctx->device->pipeline_diag[dst->type == GGML_TYPE_F16]; + } + return nullptr; case GGML_OP_CLAMP: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_clamp_f32; @@ -8554,7 +8586,9 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const uint32_t idx = (uint32_t)ceilf(log2f(float(dst->ne[0]))); GGML_ASSERT(idx < num_topk_moe_pipelines); topk_moe_mode mode = ggml_vk_num_additional_ops_to_topk_moe_mode(ctx->num_additional_fused_ops); - return ctx->device->pipeline_topk_moe[idx][mode]; + // use n_experts from push constant if it's not equal to the power of two spec constant + bool use_push = dst->ne[0] != (1u << idx); + return ctx->device->pipeline_topk_moe[idx][mode][use_push]; } if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) { @@ -9091,6 +9125,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co case GGML_OP_COS: case GGML_OP_LOG: case GGML_OP_TRI: + case GGML_OP_DIAG: case GGML_OP_CLAMP: case GGML_OP_PAD: case GGML_OP_ROLL: @@ -9778,6 +9813,12 @@ static void ggml_vk_tri(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_TRI, std::move(p)); } +static void ggml_vk_diag(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst)); + + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_DIAG, std::move(p)); +} + static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); p.param1 = ggml_get_op_params_f32(dst, 0); @@ -10111,7 +10152,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - ggml_vk_op_f32(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_SOFT_MAX, { + vk_op_soft_max_push_constants pc { ncols, src1 != nullptr ? nrows_y : (uint32_t)0, (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], @@ -10122,7 +10163,55 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, n_head_log2, nrows_x, src2 != nullptr - }); + }; + + if (ncols <= 16384) { + ggml_vk_op_f32(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_SOFT_MAX, std::move(pc)); + } else { + + vk_subbuffer buf_a = ggml_vk_tensor_subbuffer(ctx, src0); + vk_subbuffer buf_b = src1 ? ggml_vk_tensor_subbuffer(ctx, src1) : buf_a; + vk_subbuffer buf_c = src2 ? 
ggml_vk_tensor_subbuffer(ctx, src2) : buf_a; + vk_subbuffer buf_d = ggml_vk_tensor_subbuffer(ctx, dst); + + uint32_t elems_per_wg = 128 * 4; + uint32_t num_wgs = CEIL_DIV(ncols, elems_per_wg); + size_t tmp_size = num_wgs * nrows_x * sizeof(float); + + if (ctx->prealloc_size_x < tmp_size) { + ctx->prealloc_size_x = tmp_size; + ggml_vk_preallocate_buffers(ctx, subctx); + } + if (ctx->prealloc_size_y < tmp_size) { + ctx->prealloc_size_y = tmp_size; + ggml_vk_preallocate_buffers(ctx, subctx); + } + if (ctx->prealloc_x_need_sync || ctx->prealloc_y_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + + vk_subbuffer buf_x = { ctx->prealloc_x, 0, tmp_size }; + vk_subbuffer buf_y = { ctx->prealloc_y, 0, tmp_size }; + + std::array elements = { num_wgs, nrows_x, 1 }; + + vk_pipeline pipeline1 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large1_f32_f16 : ctx->device->pipeline_soft_max_large1_f32; + vk_pipeline pipeline2 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large2_f32_f16 : ctx->device->pipeline_soft_max_large2_f32; + vk_pipeline pipeline3 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large3_f32_f16 : ctx->device->pipeline_soft_max_large3_f32; + + ggml_pipeline_request_descriptor_sets(ctx, pipeline1, 1); + ggml_pipeline_request_descriptor_sets(ctx, pipeline2, 1); + ggml_pipeline_request_descriptor_sets(ctx, pipeline3, 1); + + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline1, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements); + ggml_vk_sync_buffers(ctx, subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline2, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements); + ggml_vk_sync_buffers(ctx, subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline3, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements); + + ctx->prealloc_x_need_sync = true; + ctx->prealloc_y_need_sync = true; + } } static void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -10158,6 +10247,7 @@ static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx, vk_op_topk_moe_push_constants pc {}; pc.n_rows = n_rows; + pc.n_experts_push = n_experts; pc.n_expert_used = n_expert_used; if (mode == TOPK_MOE_EARLY_SOFTMAX_NORM) { ggml_tensor * clamp = cgraph->nodes[node_idx + 7]; @@ -11857,6 +11947,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_TRI: ggml_vk_tri(ctx, compute_ctx, src0, node); + break; + case GGML_OP_DIAG: + ggml_vk_diag(ctx, compute_ctx, src0, node); + break; case GGML_OP_CLAMP: ggml_vk_clamp(ctx, compute_ctx, src0, node); @@ -12832,8 +12926,7 @@ static bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struc } const int n_expert = softmax->ne[0]; - // n_expert must be a power of 2 - if (!is_pow2(n_expert) || n_expert > (1 << (num_topk_moe_pipelines-1))) { + if (n_expert > (1 << (num_topk_moe_pipelines-1))) { return false; } @@ -13877,6 +13970,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_NL: case GGML_TYPE_MXFP4: + case GGML_TYPE_I32: return true; default: return false; @@ -14001,6 +14095,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; case GGML_OP_LOG: case GGML_OP_TRI: + case GGML_OP_DIAG: return (op->src[0]->type == GGML_TYPE_F32 || 
op->src[0]->type == GGML_TYPE_F16) && op->type == op->src[0]->type; case GGML_OP_ARGSORT: @@ -14591,6 +14686,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * tensor_clone = ggml_log(ggml_ctx, src_clone[0]); } else if (tensor->op == GGML_OP_TRI) { tensor_clone = ggml_tri(ggml_ctx, src_clone[0], ggml_get_op_params_i32(tensor, 0)); + } else if (tensor->op == GGML_OP_DIAG) { + tensor_clone = ggml_diag(ggml_ctx, src_clone[0]); } else if (tensor->op == GGML_OP_CLAMP) { const float * params = (const float *)tensor->op_params; tensor_clone = ggml_clamp(ggml_ctx, src_clone[0], params[0], params[1]); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp b/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp new file mode 100644 index 0000000000..cd3f42f491 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp @@ -0,0 +1,29 @@ +#version 450 + +#include "rte.glsl" +#include "types.glsl" +#include "generic_unary_head.glsl" + +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + +void main() { + const uint idx = get_idx(); + + if (idx >= p.ne) { + return; + } + + const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L); + const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10; + const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L); + const uint i12_offset = i12*p.ne11*p.ne10; + const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L); + const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10; + + if (i10 == i11) { + const float val = float(data_a[get_aoffset() + i13*p.nb03 + i12*p.nb02 + 0*p.nb01 + i10*p.nb00]); + data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val); + } else { + data_d[get_doffset() + dst_idx(idx)] = D_TYPE(0); + } +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp b/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp index 76d83041ce..e88bdd057e 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp @@ -26,9 +26,9 @@ void main() { const uint d_offset = get_doffset() + i10*p.nb21 + i11*p.nb22 + i12*p.nb23; #if defined(DATA_A_BF16) - FLOAT_TYPE v = FLOAT_TYPE(bf16_to_fp32(data_a[a_offset + i00])); + TEMP_TYPE v = TEMP_TYPE(bf16_to_fp32(data_a[a_offset + i00])); #else - FLOAT_TYPE v = FLOAT_TYPE(data_a[a_offset + i00]); + TEMP_TYPE v = TEMP_TYPE(data_a[a_offset + i00]); #endif #ifndef OPTIMIZATION_ERROR_WORKAROUND data_d[d_offset + i00] = D_TYPE(v); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp new file mode 100644 index 0000000000..39c4663912 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp @@ -0,0 +1,62 @@ +#version 450 + +#include "soft_max_large_common.glsl" + +void main() { + const uint tid = gl_LocalInvocationID.x; + const uint rowx = gl_WorkGroupID.y; + const uint wg_start = gl_WorkGroupID.x * BLOCK_SIZE * num_iters; + + const uint32_t i03 = rowx / (p.ne01 * p.ne02); + const uint32_t i02 = (rowx - i03 * p.ne01 * p.ne02) / p.ne01; + const uint32_t i01 = rowx % p.ne01; + + uint rowy_start = 0; + if (p.KY > 0) { + rowy_start = i01 * p.nb11 + (i02 % p.ne12) * p.nb12 + (i03 % p.ne13) * p.nb13; + } + + if (rowx >= p.nrows_x) { + return; + } + + float slope = get_slope(rowx); + + // Find max + FLOAT_TYPE max_val = p.has_sinks == 0 ? 
uintBitsToFloat(0xFF800000) : data_c[i02]; + + [[unroll]] for (uint col0 = wg_start, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) { + const uint col = col0 + tid; + + FLOAT_TYPE a = FLOAT_TYPE(0); + if (col < p.KX) { + a = data_a[rowx * p.KX + col]; + } + + FLOAT_TYPE b = FLOAT_TYPE(0); + if (p.KY > 0 && col < p.KX) { + b = data_b[rowy_start + col]; + } + + FLOAT_TYPE v = a * p.scale + slope * b; + + if (col < p.KX) { + max_val = max(max_val, v); + } + } + + // reduce across the workgroup + vals[tid] = max_val; + barrier(); + [[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) { + if (tid < s) { + vals[tid] = max(vals[tid], vals[tid + s]); + } + barrier(); + } + + if (tid == 0) { + max_val = vals[0]; + data_m[rowx * gl_NumWorkGroups.x + gl_WorkGroupID.x] = max_val; + } +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp new file mode 100644 index 0000000000..69524f5f75 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp @@ -0,0 +1,79 @@ +#version 450 + +#include "soft_max_large_common.glsl" + +void main() { + const uint tid = gl_LocalInvocationID.x; + const uint rowx = gl_WorkGroupID.y; + const uint wg_start = gl_WorkGroupID.x * BLOCK_SIZE * num_iters; + + const uint32_t i03 = rowx / (p.ne01 * p.ne02); + const uint32_t i02 = (rowx - i03 * p.ne01 * p.ne02) / p.ne01; + const uint32_t i01 = rowx % p.ne01; + + uint rowy_start = 0; + if (p.KY > 0) { + rowy_start = i01 * p.nb11 + (i02 % p.ne12) * p.nb12 + (i03 % p.ne13) * p.nb13; + } + + if (rowx >= p.nrows_x) { + return; + } + + float slope = get_slope(rowx); + + // Find max + FLOAT_TYPE max_val = p.has_sinks == 0 ? uintBitsToFloat(0xFF800000) : data_c[i02]; + + [[unroll]] for (uint i = 0; i < gl_NumWorkGroups.x; i += BLOCK_SIZE) { + if (i + tid < gl_NumWorkGroups.x) { + max_val = max(max_val, data_m[rowx * gl_NumWorkGroups.x + i + tid]); + } + } + + // reduce across the workgroup + vals[tid] = max_val; + barrier(); + [[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) { + if (tid < s) { + vals[tid] = max(max_val, vals[tid + s]); + } + barrier(); + } + + max_val = vals[0]; + barrier(); + + FLOAT_TYPE sum = FLOAT_TYPE(0.0f); + + // Compute sum{exp(x - max)} + [[unroll]] for (uint col0 = wg_start, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) { + const uint col = col0 + tid; + + if (col >= p.KX) { + break; + } + + // compute exp(a*scale+b*slope), add it to sum + const uint i = rowx * p.KX + col; + FLOAT_TYPE val; + val = exp(FLOAT_TYPE(data_a[i]) * p.scale + (p.KY > 0 ? 
slope * FLOAT_TYPE(data_b[rowy_start + col]) : FLOAT_TYPE(0.0f)) - max_val); + sum += val; + data_d[i] = D_TYPE(val); + } + + // reduce across the workgroup + vals[tid] = sum; + barrier(); + [[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) { + if (tid < s) { + vals[tid] += vals[tid + s]; + } + barrier(); + } + + if (tid == 0) { + sum = vals[0]; + data_s[rowx * gl_NumWorkGroups.x + gl_WorkGroupID.x] = sum; + } +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp new file mode 100644 index 0000000000..06efd7d9fb --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp @@ -0,0 +1,65 @@ +#version 450 + +#include "soft_max_large_common.glsl" + +shared FLOAT_TYPE sumsh[BLOCK_SIZE]; + +void main() { + const uint tid = gl_LocalInvocationID.x; + const uint rowx = gl_WorkGroupID.y; + const uint wg_start = gl_WorkGroupID.x * BLOCK_SIZE * num_iters; + + const uint32_t i03 = rowx / (p.ne01 * p.ne02); + const uint32_t i02 = (rowx - i03 * p.ne01 * p.ne02) / p.ne01; + const uint32_t i01 = rowx % p.ne01; + + uint rowy_start = 0; + if (p.KY > 0) { + rowy_start = i01 * p.nb11 + (i02 % p.ne12) * p.nb12 + (i03 % p.ne13) * p.nb13; + } + + if (rowx >= p.nrows_x) { + return; + } + + FLOAT_TYPE max_val = p.has_sinks == 0 ? uintBitsToFloat(0xFF800000) : data_c[i02]; + FLOAT_TYPE sum = FLOAT_TYPE(0.0f); + + [[unroll]] for (uint i = 0; i < gl_NumWorkGroups.x; i += BLOCK_SIZE) { + if (i + tid < gl_NumWorkGroups.x) { + max_val = max(max_val, data_m[rowx * gl_NumWorkGroups.x + i + tid]); + sum += data_s[rowx * gl_NumWorkGroups.x + i + tid]; + } + } + + // reduce across the workgroup + vals[tid] = max_val; + sumsh[tid] = sum; + barrier(); + [[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) { + if (tid < s) { + vals[tid] = max(max_val, vals[tid + s]); + sumsh[tid] += sumsh[tid + s]; + } + barrier(); + } + + max_val = vals[0]; + sum = sumsh[0]; + + if (p.has_sinks != 0) { + sum += FLOAT_TYPE(exp(FLOAT_TYPE(data_c[i02]) - max_val)); + } + + FLOAT_TYPE rcpdivisor = 1.0/sum; + + [[unroll]] for (uint col0 = wg_start, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) { + const uint col = col0 + tid; + + if (col >= p.KX) { + continue; + } + + data_d[rowx*p.KX + col] *= D_TYPE(rcpdivisor); + } +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl new file mode 100644 index 0000000000..6636d1f8de --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl @@ -0,0 +1,53 @@ +#extension GL_EXT_control_flow_attributes : enable + +layout (push_constant) uniform parameter +{ + uint KX; + uint KY; + uint ne00; + uint ne01; + uint ne02; + uint ne12; + uint ne13; + uint nb11; + uint nb12; + uint nb13; + float scale; + float max_bias; + float m0; + float m1; + uint n_head_log2; + uint nrows_x; + uint has_sinks; +} p; + +#include "types.glsl" + +layout(constant_id = 0) const uint BLOCK_SIZE = 128; +layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; +layout(constant_id = 1) const uint num_iters = 4; + +layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; +layout (binding = 1) readonly buffer Y {B_TYPE data_b[];}; +layout (binding = 2) readonly buffer Z {float data_c[];}; +layout (binding = 3) buffer D {D_TYPE data_d[];}; +layout (binding = 4) buffer M {float data_m[];}; +layout (binding = 5) buffer S {float data_s[];}; + +shared FLOAT_TYPE vals[BLOCK_SIZE]; + +float get_slope(uint rowx) { + 
float slope = 1.0f; + + // ALiBi + if (p.max_bias > 0.0f) { + const uint h = (rowx / p.ne01) % p.ne02; // head index + + const float base = h < p.n_head_log2 ? p.m0 : p.m1; + const uint exp = h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1; + + slope = pow(base, exp); + } + + return slope; +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp b/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp index 5cd0785d20..b83a2b9d2d 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp @@ -10,6 +10,7 @@ layout (push_constant) uniform parameter { uint n_rows; + uint n_experts_push; uint n_expert_used; float clamp_min; float clamp_max; @@ -18,11 +19,16 @@ layout (push_constant) uniform parameter layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in; layout(constant_id = 0) const uint WARP_SIZE = 32; -layout(constant_id = 1) const uint n_experts = 512; +layout(constant_id = 1) const uint n_experts_spec = 512; layout(constant_id = 2) const bool with_norm = true; layout(constant_id = 3) const bool late_softmax = false; +layout(constant_id = 4) const bool nexperts_use_push = false; -const uint experts_per_thread = (n_experts > WARP_SIZE) ? n_experts / WARP_SIZE : 1; +uint n_experts = nexperts_use_push ? n_experts_push : n_experts_spec; + +#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b)) + +const uint experts_per_thread = CEIL_DIV(n_experts_spec, WARP_SIZE); layout (binding = 0, std430) readonly buffer Logits {float logits[];}; layout (binding = 1, std430) writeonly buffer Weights {float weights[];}; @@ -94,7 +100,7 @@ void main() { } if (!late_softmax) { - softmax_warp_inplace(wt, n_experts, lane, false); + softmax_warp_inplace(wt, n_experts, lane, nexperts_use_push); } // at this point, each thread holds a portion of softmax, diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 92bae088b2..b0ade078c7 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -704,13 +704,15 @@ void process_shaders() { shader = (tname == "f32" || tname == "f16" || tname == "bf16") ? 
"get_rows.comp" : "get_rows_quant.comp"; if (tname == "f16") { - string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}})); + string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{"TEMP_TYPE", "FLOAT_TYPE"}, {data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}})); } else { - string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}})); + string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{"TEMP_TYPE", "FLOAT_TYPE"}, {data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}})); } - string_to_spv("get_rows_" + tname + "_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}})); + string_to_spv("get_rows_" + tname + "_f32", shader, merge_maps(base_dict, {{"TEMP_TYPE", "FLOAT_TYPE"}, {data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}})); } + string_to_spv("get_rows_i32", "get_rows.comp", {{"TEMP_TYPE", "uint"}, {"A_TYPE", "uint"}, {"B_TYPE", "int"}, {"D_TYPE", "uint"}}); + string_to_spv("mul_mat_vec_p021_f16_f32_subgroup_add", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}}); string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}); string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}); @@ -854,6 +856,8 @@ void process_shaders() { string_to_spv("tri_f16", "tri.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); string_to_spv("tri_f32", "tri.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + string_to_spv("diag_f16", "diag.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); + string_to_spv("diag_f32", "diag.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); string_to_spv("softplus_f16", "softplus.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); string_to_spv("softplus_f32", "softplus.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); @@ -899,6 +903,13 @@ void process_shaders() { string_to_spv("soft_max_f32_f16", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}})); string_to_spv("soft_max_back_f32", "soft_max_back.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}})); + string_to_spv("soft_max_large1_f32", "soft_max_large1.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}})); + string_to_spv("soft_max_large2_f32", "soft_max_large2.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}})); + string_to_spv("soft_max_large3_f32", "soft_max_large3.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}})); + string_to_spv("soft_max_large1_f32_f16", "soft_max_large1.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}})); + string_to_spv("soft_max_large2_f32_f16", "soft_max_large2.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}})); + string_to_spv("soft_max_large3_f32_f16", 
"soft_max_large3.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}})); + string_to_spv("rope_norm_f32", "rope_norm.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float"}}); string_to_spv("rope_norm_f16", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}}); string_to_spv("rope_norm_f16_rte", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}, {"RTE16", "1"}}); diff --git a/pyrightconfig.json b/pyrightconfig.json index 5320fe5864..a7bc007bdc 100644 --- a/pyrightconfig.json +++ b/pyrightconfig.json @@ -1,5 +1,5 @@ { - "extraPaths": ["gguf-py"], + "extraPaths": ["gguf-py", "examples/model-conversion/scripts"], "pythonVersion": "3.9", "pythonPlatform": "All", "reportUnusedImport": "warning", diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp index 86a1a4ba18..386fab04ac 100644 --- a/src/llama-batch.cpp +++ b/src/llama-batch.cpp @@ -695,6 +695,8 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector & idxs, u udata->seq_idx .resize(LLAMA_MAX_SEQ, -1); udata->output .resize(n_tokens); + udata->seq_id_data.reserve(n_tokens); + seq_set_t seq_set_unq; for (size_t i = 0; i < idxs.size(); ++i) { @@ -716,11 +718,13 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector & idxs, u } udata->n_seq_id[i] = batch.n_seq_id[idxs[i]]; - udata->seq_id[i] = batch.seq_id[idxs[i]]; udata->output[i] = batch.logits[idxs[i]]; for (int s = 0; s < udata->n_seq_id[i]; ++s) { - seq_set_unq.set(udata->seq_id[i][s]); + const llama_seq_id seq_id = batch.seq_id[idxs[i]][s]; + + udata->seq_id_data.push_back(seq_id); + seq_set_unq.set(seq_id); } if (udata->output[i]) { @@ -728,6 +732,12 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector & idxs, u } } + llama_seq_id * seq_id_ptr = udata->seq_id_data.data(); + for (size_t i = 0; i < idxs.size(); ++i) { + udata->seq_id[i] = seq_id_ptr; + seq_id_ptr += udata->n_seq_id[i]; + } + for (uint32_t s = 0; s < n_seq_max; ++s) { if (seq_set_unq.test(s)) { udata->seq_idx[s] = udata->seq_id_unq.size(); diff --git a/src/llama-batch.h b/src/llama-batch.h index 209cf3699d..8e6fac0efa 100644 --- a/src/llama-batch.h +++ b/src/llama-batch.h @@ -56,13 +56,15 @@ struct llama_ubatch { std::vector embd; std::vector pos; std::vector n_seq_id; - std::vector seq_id; + std::vector seq_id; // these point into the seq_id_data below std::vector seq_id_unq; std::vector seq_idx; std::vector output; + + std::vector seq_id_data; }; - // the llama_ubatch pointers above point to this data if set. otherwise - points to non-owning data + // the llama_ubatch pointers above point to this data if set. 
otherwise - point to external non-owning data std::shared_ptr<data_t> data; }; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 47484d9d97..97fec908b5 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -574,7 +574,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : freq_base (cparams.rope_freq_base), freq_scale (cparams.rope_freq_scale), ext_factor (cparams.yarn_ext_factor), - attn_factor (cparams.yarn_attn_factor), + attn_factor (llama_hparams::yarn_attn_factor_adjust(cparams.yarn_attn_factor, cparams.rope_freq_scale, cparams.yarn_ext_factor)), beta_fast (cparams.yarn_beta_fast), beta_slow (cparams.yarn_beta_slow), norm_eps (hparams.f_norm_eps), diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 8cdbaf69fc..277d0bcfd3 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -1,7 +1,9 @@ #include "llama-hparams.h" #include "ggml.h" + #include +#include <cmath> void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) { if (dense_first) { @@ -229,3 +231,13 @@ bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama return false; } + +float llama_hparams::yarn_attn_factor_adjust(float attn_factor, float freq_scale, float ext_factor) { + GGML_ASSERT(ext_factor >= 0.0f); + + if (ext_factor != 0.0f) { + attn_factor *= 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)); + } + + return attn_factor; +} diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 6eff334a5f..c9960e9169 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -107,6 +107,7 @@ struct llama_hparams { float rope_freq_base_train_swa; float rope_freq_scale_train; float rope_freq_scale_train_swa; + uint32_t n_ctx_orig_yarn; float rope_yarn_log_mul = 0.0f; @@ -267,7 +268,13 @@ struct llama_hparams { // TODO: think of a better place for this function // TODO: pack the SWA params in a struct?
static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1); + + // when YARN is applied with yarn_ext_factor != 0.0f, we need to cancel this factor: + // https://github.com/ggml-org/llama.cpp/blob/a81a569577cc38b32558958b048228150be63eae/ggml/src/ggml-cpu/ops.cpp#L5541-L5544 + // + // ref: https://github.com/ggml-org/llama.cpp/discussions/7416 + // https://github.com/ggml-org/llama.cpp/pull/17945 + static float yarn_attn_factor_adjust(float attn_factor, float freq_scale, float ext_factor); }; static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable"); - diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 58f4a198bb..51005ab31a 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -1369,9 +1369,10 @@ ggml_tensor * llama_kv_cache::build_rope_shift( float freq_scale) const { const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; - const auto & yarn_ext_factor = cparams.yarn_ext_factor; - const auto & yarn_beta_fast = cparams.yarn_beta_fast; - const auto & yarn_beta_slow = cparams.yarn_beta_slow; + const auto & yarn_ext_factor = cparams.yarn_ext_factor; + const auto & yarn_beta_fast = cparams.yarn_beta_fast; + const auto & yarn_beta_slow = cparams.yarn_beta_slow; + const auto & yarn_attn_factor = llama_hparams::yarn_attn_factor_adjust(cparams.yarn_attn_factor, cparams.rope_freq_scale, cparams.yarn_ext_factor); const auto & n_rot = hparams.n_rot; const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE @@ -1384,9 +1385,11 @@ ggml_tensor * llama_kv_cache::build_rope_shift( // See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly. // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation. + /* const float yarn_attn_factor = (model.arch == LLM_ARCH_DEEPSEEK2 || model.arch == LLM_ARCH_DEEPSEEK2OCR) ?
1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) : cparams.yarn_attn_factor; + */ ggml_tensor * tmp; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index d94354da20..ffaf54f038 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1639,7 +1639,12 @@ void llama_model::load_hparams(llama_model_loader & ml) { // that have no expert_gating_func model parameter set hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX; } - ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false); + + if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) { + // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] + // cancel the factor from the convert script + hparams.rope_yarn_log_mul /= 0.1f; + } // (optional) temperature tuning - used by mistral-large ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false); @@ -2272,9 +2277,9 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false); - ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false); - ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false); - ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false); + ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false); + ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false); + ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f); // TODO: maybe add n_attn_temp_floor_scale as a separate KV? if (hparams.f_attn_temp_scale != 0.0f) { @@ -2284,18 +2289,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { } } - // TODO: this seems to be correct with the case of mscale == mscale_all_dims == 1.0f - // but may need further verification with other values - if (hparams.rope_yarn_log_mul != 0.0f) { - float factor = 1.0f / hparams.rope_freq_scale_train; - float mscale = 1.0f; - float mscale_all_dims = hparams.rope_yarn_log_mul; - static auto get_mscale = [](float scale, float mscale) { - return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f); - }; - hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims); - } - switch (hparams.n_layer) { case 26: type = LLM_TYPE_3B; break; case 34: type = LLM_TYPE_8B; break; @@ -2306,6 +2299,32 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: throw std::runtime_error("unsupported model architecture"); } + // ref: https://github.com/huggingface/transformers/blob/6d00f6b0a5679c36510f203e4226e36f517c3032/src/transformers/modeling_rope_utils.py#L336-L348 + if (hparams.rope_yarn_log_mul != 0.0f) { + const float factor = 1.0f / hparams.rope_freq_scale_train; + + // note: here we assume `mscale == 1.0f` + // TODO: start reading the actual value of mscale and handle the case where it is not 1.0f + float mscale = 1.0f; + const float mscale_all_dims = hparams.rope_yarn_log_mul; + + // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] + // special-case DEEPSEEK v2: + // https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat/blob/main/config.json#L42-L43 + if (arch == LLM_ARCH_DEEPSEEK2 && mscale_all_dims != 1.0f) { + mscale = mscale_all_dims; + } + + static auto get_mscale = [](float scale, float mscale) { + return scale <= 1.0f ? 
1.0f : (0.1f * mscale * logf(scale) + 1.0f); + }; + + hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims); + + LLAMA_LOG_WARN("%s: setting new yarn_attn_factor = %.4f (mscale == %.1f, mscale_all_dim = %.1f)\n", + __func__, hparams.yarn_attn_factor, mscale, mscale_all_dims); + } + pimpl->n_bytes = ml.n_bytes; pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name(); @@ -6842,6 +6861,7 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train); LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn); + LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul); LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown"); // MRoPE (Multi-axis Rotary Position Embedding) sections if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) { @@ -6905,7 +6925,6 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm); LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func)); - LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul); } if (arch == LLM_ARCH_QWEN2MOE) { diff --git a/src/models/deepseek2.cpp b/src/models/deepseek2.cpp index 1599ea0d96..2ac31610ed 100644 --- a/src/models/deepseek2.cpp +++ b/src/models/deepseek2.cpp @@ -1,7 +1,5 @@ #include "models.h" - - llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B @@ -21,9 +19,15 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly. // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation. 
- const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale)); - const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k)); - const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)); + // And also: https://github.com/ggml-org/llama.cpp/pull/17945 [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] + + // first cancel the adjustment from llama_hparams::yarn_attn_factor_adjust to get the original attn_factor + GGML_ASSERT(ext_factor >= 0.0f); + const float attn_factor_org = attn_factor * (1.0f + 0.1f * logf(1.0f / freq_scale)); + + // use the original attn_factor to pre-scale the kq_scale + const float mscale = attn_factor_org * (1.0f + 0.1f * hparams.rope_yarn_log_mul * logf(1.0f / freq_scale)); + const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k)); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp index a60ca12fe5..90750b20c2 100644 --- a/tests/test-arg-parser.cpp +++ b/tests/test-arg-parser.cpp @@ -20,20 +20,20 @@ int main(void) { std::unordered_set seen_env_vars; for (const auto & opt : ctx_arg.options) { // check for args duplications - for (const auto & arg : opt.args) { + for (const auto & arg : opt.get_args()) { if (seen_args.find(arg) == seen_args.end()) { seen_args.insert(arg); } else { - fprintf(stderr, "test-arg-parser: found different handlers for the same argument: %s", arg); + fprintf(stderr, "test-arg-parser: found different handlers for the same argument: %s", arg.c_str()); exit(1); } } // check for env var duplications - if (opt.env) { - if (seen_env_vars.find(opt.env) == seen_env_vars.end()) { - seen_env_vars.insert(opt.env); + for (const auto & env : opt.get_env()) { + if (seen_env_vars.find(env) == seen_env_vars.end()) { + seen_env_vars.insert(env); } else { - fprintf(stderr, "test-arg-parser: found different handlers for the same env var: %s", opt.env); + fprintf(stderr, "test-arg-parser: found different handlers for the same env var: %s", env.c_str()); exit(1); } } @@ -115,6 +115,14 @@ int main(void) { assert(params.model.path == "blah.gguf"); assert(params.cpuparams.n_threads == 1010); + printf("test-arg-parser: test negated environment variables\n\n"); + + setenv("LLAMA_ARG_MMAP", "0", true); + setenv("LLAMA_ARG_NO_PERF", "1", true); // legacy format + argv = {"binary_name"}; + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.use_mmap == false); + assert(params.no_perf == true); printf("test-arg-parser: test environment variables being overwritten\n\n"); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 7be1f66038..416218b5b8 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -7652,6 +7652,9 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, true, GGML_TYPE_F32, {1, 1}, 0.1f, 8.0f)); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, true, GGML_TYPE_F16, {1, 1}, 0.1f, 8.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {200001, 2, 3, 1}, true, true, GGML_TYPE_F32, {1, 1}, 0.1f, 8.0f)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {200001, 2, 3, 1}, true, true, GGML_TYPE_F16, {1, 1}, 0.1f, 8.0f)); + for (float max_bias : {0.0f, 8.0f}) { for (float scale : {1.0f, 0.1f}) { for (int64_t ne0 : {16, 1024}) { @@ -7861,9 +7864,24 @@ static std::vector> make_test_cases_eval() { 
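(Editor's illustration, not part of the patch: the negated-environment-variable behaviour exercised by the tests/test-arg-parser.cpp hunk above. A boolean option's env var can now be set to "0" to switch the option off, while the legacy "set it to 1" form keeps working. The sketch below only mirrors what the added assertions expect; the parse call is left as a placeholder comment rather than a real invocation.)

#include <cstdlib>

// what the new test asserts (paraphrased): boolean args can be negated via their
// env var value, e.g. LLAMA_ARG_MMAP=0 turns mmap off, while the older
// "presence means true" style (LLAMA_ARG_NO_PERF=1) keeps working.
int main() {
    setenv("LLAMA_ARG_MMAP",    "0", 1); // expect params.use_mmap == false after parsing
    setenv("LLAMA_ARG_NO_PERF", "1", 1); // legacy format, expect params.no_perf == true
    // ... call common_params_parse() as in the test above, then check the two fields ...
    return 0;
}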
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 30, 30, 7, 1 }, { 8, 30, 7, 1 })); test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 42, 42, 5, 2 }, { 10, 42, 5, 2 })); test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 10, 64, 2, 2 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 64, 64, 2, 2 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 79, 79, 5, 3 }, { 417, 79, 5, 3 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 2 }, { 32, 128, 4, 2 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 80, 80, 2, 8 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 79, 80, 2, 8 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 2, 8 }, { 81, 80, 2, 8 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 80, 80, 8, 8 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 79, 80, 8, 8 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 80, 80, 8, 8 }, { 81, 80, 8, 8 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 84, 84, 4, 4 }, { 32, 84, 4, 4 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 95, 95, 8, 8 }, { 40, 95, 8, 8 })); test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 100, 100, 4, 4 }, { 41, 100, 4, 4 })); test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 4 }, { 31, 128, 4, 4 })); - test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 300, 64, 4, 4 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 4 }, { 32, 128, 4, 4 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 3, 4 }, { 32, 128, 3, 4 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 1 }, { 32, 128, 4, 1 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 200, 64, 4, 4 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 384, 64, 4, 4 })); for (bool v : {false, true}) { for (bool circular : {false, true}) { @@ -7956,8 +7974,12 @@ static std::vector> make_test_cases_eval() { for (bool with_norm : {false, true}) { test_cases.emplace_back(new test_topk_moe({8, 22, 1, 1}, 4, with_norm)); + test_cases.emplace_back(new test_topk_moe({31, 22, 1, 1}, 8, with_norm)); test_cases.emplace_back(new test_topk_moe({32, 22, 1, 1}, 8, with_norm)); + test_cases.emplace_back(new test_topk_moe({40, 22, 1, 1}, 8, with_norm)); + test_cases.emplace_back(new test_topk_moe({71, 22, 1, 1}, 8, with_norm)); test_cases.emplace_back(new test_topk_moe({128, 1, 1, 1}, 128, with_norm)); + test_cases.emplace_back(new test_topk_moe({129, 1, 1, 1}, 128, with_norm)); } test_cases.emplace_back(new test_topk_moe({ 8, 22, 1, 1 }, 4, /*with_norm*/ false, /*delayed_softmax*/ true)); @@ -8064,12 +8086,13 @@ static std::vector> make_test_cases_perf() { test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 16416, 1, 128, {8, 1}, {4, 1}, {0, 2, 1, 3})); test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8, 1}, {4, 1}, {0, 1, 2, 3}, 2*16416)); - test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 2 }, { 6, 64, 4, 2 })); - test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 1 }, { 8, 128, 4, 1 })); + test_cases.emplace_back(new 
test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 32, 64, 4, 4 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 2 }, { 32, 128, 4, 2 })); // qwen3next with CHUNK_SIZE 64 test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 8, 32 }, { 64, 64, 8, 32 })); // qwen3next with CHUNK_SIZE 128 test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 32 }, { 128, 128, 4, 32 })); + test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 256, 256, 4, 2 }, { 128, 256, 4, 2 })); test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_LOWER, GGML_TYPE_F32, { 256, 256, 4, 4 })); test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_UPPER_DIAG, GGML_TYPE_F32, { 1024, 1024, 8, 4 })); diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 13ab7c78f4..6b80fe45f7 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -6,11 +6,26 @@ add_library(mtmd mtmd.cpp mtmd-audio.cpp mtmd.h + mtmd-helper.cpp + mtmd-helper.h clip.cpp clip.h clip-impl.h - mtmd-helper.cpp - mtmd-helper.h + clip-model.h + clip-graph.h + models/models.h + models/cogvlm.cpp + models/internvl.cpp + models/kimivl.cpp + models/llama4.cpp + models/llava.cpp + models/minicpmv.cpp + models/pixtral.cpp + models/qwen2vl.cpp + models/qwen3vl.cpp + models/siglip.cpp + models/whisper-enc.cpp + models/deepseekocr.cpp ) set_target_properties(mtmd PROPERTIES @@ -53,6 +68,15 @@ if (TARGET BUILD_INFO) add_dependencies(mtmd-helper BUILD_INFO) endif() +# if mtmd is linked against common, we throw an error +if (TARGET mtmd) + get_target_property(libs mtmd LINK_LIBRARIES) + if (libs AND "common" IN_LIST libs) + message(FATAL_ERROR "mtmd is designed to be a public library.\n" + "It must not link against common") + endif() +endif() + add_executable(llama-llava-cli deprecation-warning.cpp) add_executable(llama-gemma3-cli deprecation-warning.cpp) add_executable(llama-minicpmv-cli deprecation-warning.cpp) diff --git a/tools/mtmd/clip-graph.h b/tools/mtmd/clip-graph.h new file mode 100644 index 0000000000..6d303b4e48 --- /dev/null +++ b/tools/mtmd/clip-graph.h @@ -0,0 +1,115 @@ +#pragma once + +#include "ggml.h" +#include "ggml-cpp.h" +#include "clip.h" +#include "clip-impl.h" +#include "clip-model.h" + +#include +#include + +struct clip_graph { + const clip_model & model; + const clip_hparams & hparams; + projector_type proj_type; + + // we only support single image per batch + const clip_image_f32 & img; + + const int patch_size; + const int n_patches_x; + const int n_patches_y; + const int n_patches; + const int n_embd; + const int n_head; + const int d_head; + const int n_layer; + const int n_mmproj_embd; + const float eps; + const float kq_scale; + const clip_flash_attn_type flash_attn_type; + + // for debugging + const bool debug_graph; + std::vector & debug_print_tensors; + + ggml_context_ptr ctx0_ptr; + ggml_context * ctx0; + ggml_cgraph * gf; + + clip_graph(clip_ctx * ctx, const clip_image_f32 & img); + + virtual ~clip_graph() = default; + virtual ggml_cgraph * build() = 0; + + // + // utility functions + // + void cb(ggml_tensor * cur0, const char * name, int il) const; + + // siglip2 naflex + ggml_tensor * resize_position_embeddings(); + + // build vision transformer (ViT) cgraph + // this function should cover most of the models + // if your model has specific features, you should probably duplicate this function + ggml_tensor * build_vit( + ggml_tensor * inp, + int64_t n_pos, + norm_type norm_t, + ffn_op_type ffn_t, + ggml_tensor * learned_pos_embd, + 
std::function add_pos); + + // build the input after conv2d (inp_raw --> patches) + // returns tensor with shape [n_embd, n_patches] + ggml_tensor * build_inp(); + + ggml_tensor * build_inp_raw(int channels = 3); + + ggml_tensor * build_norm( + ggml_tensor * cur, + ggml_tensor * mw, + ggml_tensor * mb, + norm_type type, + float norm_eps, + int il) const; + + ggml_tensor * build_ffn( + ggml_tensor * cur, + ggml_tensor * up, + ggml_tensor * up_b, + ggml_tensor * gate, + ggml_tensor * gate_b, + ggml_tensor * down, + ggml_tensor * down_b, + ffn_op_type type_op, + int il) const; + + ggml_tensor * build_attn( + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_mask, + float kq_scale, + int il) const; + + // implementation of the 2D RoPE without adding a new op in ggml + // this is not efficient (use double the memory), but works on all backends + // TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065 + ggml_tensor * build_rope_2d( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * pos_a, // first half + ggml_tensor * pos_b, // second half + const float freq_base, + const bool interleave_freq + ); + + // aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL) + // support dynamic resolution + ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor); +}; diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index f8c2a105df..4859376922 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -1,3 +1,5 @@ +#pragma once + #include "ggml.h" #include "gguf.h" #include "clip.h" @@ -14,6 +16,8 @@ // Internal header for clip.cpp +#define MTMD_INTERNAL_HEADER + #define KEY_FTYPE "general.file_type" #define KEY_NAME "general.name" #define KEY_DESCRIPTION "general.description" @@ -150,6 +154,10 @@ // align x to upper multiple of n #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n)) +// forward declaration +// TODO: improve this later +struct clip_ctx; + enum projector_type { PROJECTOR_TYPE_MLP, PROJECTOR_TYPE_MLP_NORM, diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h new file mode 100644 index 0000000000..46577a2a4a --- /dev/null +++ b/tools/mtmd/clip-model.h @@ -0,0 +1,324 @@ +#pragma once + +#include "ggml.h" +#include "clip.h" +#include "clip-impl.h" + +#include +#include +#include +#include + +enum ffn_op_type { + FFN_GELU, + FFN_GELU_ERF, + FFN_SILU, + FFN_GELU_QUICK, +}; + +enum norm_type { + NORM_TYPE_NORMAL, + NORM_TYPE_RMS, +}; + +enum patch_merge_type { + PATCH_MERGE_FLAT, + PATCH_MERGE_SPATIAL_UNPAD, +}; + +struct clip_hparams { + int32_t image_size = 0; + int32_t patch_size = 0; + int32_t n_embd = 0; + int32_t n_ff = 0; + int32_t projection_dim = 0; + int32_t n_head = 0; + int32_t n_layer = 0; + // idefics3 + int32_t image_longest_edge = 0; + int32_t image_min_pixels = -1; + int32_t image_max_pixels = -1; + int32_t n_merge = 0; // number of patch merges **per-side** + + float image_mean[3]; + float image_std[3]; + + // for models using dynamic image size, we need to have a smaller image size to warmup + // otherwise, user will get OOM everytime they load the model + int32_t warmup_image_size = 0; + int32_t warmup_audio_size = 3000; + + ffn_op_type ffn_op = FFN_GELU; + + patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT; + + 
float eps = 1e-6; + float rope_theta = 0.0; + + std::vector image_res_candidates; // for llava-uhd style models + int32_t image_crop_resolution; + std::unordered_set vision_feature_layer; + int32_t attn_window_size = 0; + int32_t n_wa_pattern = 0; + + // deepseek-ocr (sam) + int32_t sam_n_layer = 0; + int32_t sam_n_head = 0; + int32_t sam_n_embd = 0; + + // audio + int32_t n_mel_bins = 0; // whisper preprocessor + int32_t proj_stack_factor = 0; // ultravox + + // legacy + bool has_llava_projector = false; + int minicpmv_version = 0; + int32_t minicpmv_query_num = 0; // MiniCPM-V query number + + // custom value provided by user, can be undefined if not set + int32_t custom_image_min_tokens = -1; + int32_t custom_image_max_tokens = -1; + + void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) { + const int cur_merge = n_merge == 0 ? 1 : n_merge; + const int patch_area = patch_size * patch_size * cur_merge * cur_merge; + image_min_pixels = (custom_image_min_tokens > 0 ? custom_image_min_tokens : n_tokens_min) * patch_area; + image_max_pixels = (custom_image_max_tokens > 0 ? custom_image_max_tokens : n_tokens_max) * patch_area; + warmup_image_size = static_cast(std::sqrt(image_max_pixels)); + } + + void set_warmup_n_tokens(int n_tokens) { + int n_tok_per_side = static_cast(std::sqrt(n_tokens)); + GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n"); + const int cur_merge = n_merge == 0 ? 1 : n_merge; + warmup_image_size = n_tok_per_side * patch_size * cur_merge; + // TODO: support warmup size for custom token numbers + } + // sam vit deepseek-ocr + std::vector global_attn_indices() const { + return { 2, 5, 8, 11 }; + } + bool is_global_attn(int32_t layer) const { + const auto indices = global_attn_indices(); + + for (const auto & idx : indices) { + if (layer == idx) { + return true; + } + } + + return false; + } +}; + +struct clip_layer { + // attention + ggml_tensor * k_w = nullptr; + ggml_tensor * k_b = nullptr; + ggml_tensor * q_w = nullptr; + ggml_tensor * q_b = nullptr; + ggml_tensor * v_w = nullptr; + ggml_tensor * v_b = nullptr; + ggml_tensor * qkv_w = nullptr; + ggml_tensor * qkv_b = nullptr; + + ggml_tensor * o_w = nullptr; + ggml_tensor * o_b = nullptr; + + ggml_tensor * k_norm = nullptr; + ggml_tensor * q_norm = nullptr; + + // layernorm 1 + ggml_tensor * ln_1_w = nullptr; + ggml_tensor * ln_1_b = nullptr; + + ggml_tensor * ff_up_w = nullptr; + ggml_tensor * ff_up_b = nullptr; + ggml_tensor * ff_gate_w = nullptr; + ggml_tensor * ff_gate_b = nullptr; + ggml_tensor * ff_down_w = nullptr; + ggml_tensor * ff_down_b = nullptr; + + // layernorm 2 + ggml_tensor * ln_2_w = nullptr; + ggml_tensor * ln_2_b = nullptr; + + // layer scale (no bias) + ggml_tensor * ls_1_w = nullptr; + ggml_tensor * ls_2_w = nullptr; + + // qwen3vl deepstack merger + ggml_tensor * deepstack_norm_w = nullptr; + ggml_tensor * deepstack_norm_b = nullptr; + ggml_tensor * deepstack_fc1_w = nullptr; + ggml_tensor * deepstack_fc1_b = nullptr; + ggml_tensor * deepstack_fc2_w = nullptr; + ggml_tensor * deepstack_fc2_b = nullptr; + + // sam rel_pos + ggml_tensor * rel_pos_w = nullptr; + ggml_tensor * rel_pos_h = nullptr; + + bool has_deepstack() const { + return deepstack_fc1_w != nullptr; + } +}; + +struct clip_model { + clip_modality modality = CLIP_MODALITY_VISION; + projector_type proj_type = PROJECTOR_TYPE_MLP; + clip_hparams hparams; + + // embeddings + ggml_tensor * class_embedding = nullptr; + ggml_tensor * patch_embeddings_0 = nullptr; + ggml_tensor * patch_embeddings_1 = 
nullptr; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL) + ggml_tensor * patch_bias = nullptr; + ggml_tensor * position_embeddings = nullptr; + + ggml_tensor * pre_ln_w = nullptr; + ggml_tensor * pre_ln_b = nullptr; + + std::vector layers; + + int32_t n_deepstack_layers = 0; // used by Qwen3-VL, calculated from clip_layer + + ggml_tensor * post_ln_w; + ggml_tensor * post_ln_b; + + ggml_tensor * fc_w; + ggml_tensor * fc_b; + ggml_tensor * mm_fc_w; + ggml_tensor * mm_fc_b; + + // LLaVA projection + ggml_tensor * mm_input_norm_w = nullptr; + ggml_tensor * mm_input_norm_b = nullptr; + ggml_tensor * mm_0_w = nullptr; + ggml_tensor * mm_0_b = nullptr; + ggml_tensor * mm_2_w = nullptr; + ggml_tensor * mm_2_b = nullptr; + + ggml_tensor * image_newline = nullptr; + ggml_tensor * view_seperator = nullptr; + + + // Yi type models with mlp+normalization projection + ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4 + ggml_tensor * mm_1_b = nullptr; + ggml_tensor * mm_3_w = nullptr; + ggml_tensor * mm_3_b = nullptr; + ggml_tensor * mm_4_w = nullptr; + ggml_tensor * mm_4_b = nullptr; + + // GLMV-Edge projection + ggml_tensor * mm_model_adapter_conv_w = nullptr; + ggml_tensor * mm_model_adapter_conv_b = nullptr; + + // MobileVLM projection + ggml_tensor * mm_model_mlp_1_w = nullptr; + ggml_tensor * mm_model_mlp_1_b = nullptr; + ggml_tensor * mm_model_mlp_3_w = nullptr; + ggml_tensor * mm_model_mlp_3_b = nullptr; + ggml_tensor * mm_model_block_1_block_0_0_w = nullptr; + ggml_tensor * mm_model_block_1_block_0_1_w = nullptr; + ggml_tensor * mm_model_block_1_block_0_1_b = nullptr; + ggml_tensor * mm_model_block_1_block_1_fc1_w = nullptr; + ggml_tensor * mm_model_block_1_block_1_fc1_b = nullptr; + ggml_tensor * mm_model_block_1_block_1_fc2_w = nullptr; + ggml_tensor * mm_model_block_1_block_1_fc2_b = nullptr; + ggml_tensor * mm_model_block_1_block_2_0_w = nullptr; + ggml_tensor * mm_model_block_1_block_2_1_w = nullptr; + ggml_tensor * mm_model_block_1_block_2_1_b = nullptr; + ggml_tensor * mm_model_block_2_block_0_0_w = nullptr; + ggml_tensor * mm_model_block_2_block_0_1_w = nullptr; + ggml_tensor * mm_model_block_2_block_0_1_b = nullptr; + ggml_tensor * mm_model_block_2_block_1_fc1_w = nullptr; + ggml_tensor * mm_model_block_2_block_1_fc1_b = nullptr; + ggml_tensor * mm_model_block_2_block_1_fc2_w = nullptr; + ggml_tensor * mm_model_block_2_block_1_fc2_b = nullptr; + ggml_tensor * mm_model_block_2_block_2_0_w = nullptr; + ggml_tensor * mm_model_block_2_block_2_1_w = nullptr; + ggml_tensor * mm_model_block_2_block_2_1_b = nullptr; + + // MobileVLM_V2 projection + ggml_tensor * mm_model_mlp_0_w = nullptr; + ggml_tensor * mm_model_mlp_0_b = nullptr; + ggml_tensor * mm_model_mlp_2_w = nullptr; + ggml_tensor * mm_model_mlp_2_b = nullptr; + ggml_tensor * mm_model_peg_0_w = nullptr; + ggml_tensor * mm_model_peg_0_b = nullptr; + + // MINICPMV projection + ggml_tensor * mm_model_pos_embed_k = nullptr; + ggml_tensor * mm_model_query = nullptr; + ggml_tensor * mm_model_proj = nullptr; + ggml_tensor * mm_model_kv_proj = nullptr; + ggml_tensor * mm_model_attn_q_w = nullptr; + ggml_tensor * mm_model_attn_q_b = nullptr; + ggml_tensor * mm_model_attn_k_w = nullptr; + ggml_tensor * mm_model_attn_k_b = nullptr; + ggml_tensor * mm_model_attn_v_w = nullptr; + ggml_tensor * mm_model_attn_v_b = nullptr; + ggml_tensor * mm_model_attn_o_w = nullptr; + ggml_tensor * mm_model_attn_o_b = nullptr; + ggml_tensor * mm_model_ln_q_w = nullptr; + ggml_tensor * mm_model_ln_q_b = nullptr; + 
ggml_tensor * mm_model_ln_kv_w = nullptr; + ggml_tensor * mm_model_ln_kv_b = nullptr; + ggml_tensor * mm_model_ln_post_w = nullptr; + ggml_tensor * mm_model_ln_post_b = nullptr; + + // gemma3 + ggml_tensor * mm_input_proj_w = nullptr; + ggml_tensor * mm_soft_emb_norm_w = nullptr; + + // pixtral + ggml_tensor * token_embd_img_break = nullptr; + ggml_tensor * mm_patch_merger_w = nullptr; + + // ultravox / whisper encoder + ggml_tensor * conv1d_1_w = nullptr; + ggml_tensor * conv1d_1_b = nullptr; + ggml_tensor * conv1d_2_w = nullptr; + ggml_tensor * conv1d_2_b = nullptr; + ggml_tensor * mm_norm_pre_w = nullptr; + ggml_tensor * mm_norm_mid_w = nullptr; + + // cogvlm + ggml_tensor * mm_post_fc_norm_w = nullptr; + ggml_tensor * mm_post_fc_norm_b = nullptr; + ggml_tensor * mm_h_to_4h_w = nullptr; + ggml_tensor * mm_gate_w = nullptr; + ggml_tensor * mm_4h_to_h_w = nullptr; + ggml_tensor * mm_boi = nullptr; + ggml_tensor * mm_eoi = nullptr; + + // deepseek ocr sam + ggml_tensor * patch_embed_proj_w = nullptr; + ggml_tensor * patch_embed_proj_b = nullptr; + ggml_tensor * pos_embed = nullptr; + + ggml_tensor * neck_0_w; + ggml_tensor * neck_1_w; + ggml_tensor * neck_1_b; + ggml_tensor * neck_2_w; + ggml_tensor * neck_3_w; + ggml_tensor * neck_3_b; + ggml_tensor * net_2; + ggml_tensor * net_3; + + int32_t n_sam_layers = 12; // used by deepseek-ocr sam encoder + + std::vector sam_layers; + + bool audio_has_avgpool() const { + return proj_type == PROJECTOR_TYPE_QWEN2A + || proj_type == PROJECTOR_TYPE_VOXTRAL; + } + + bool audio_has_stack_frames() const { + return proj_type == PROJECTOR_TYPE_ULTRAVOX + || proj_type == PROJECTOR_TYPE_VOXTRAL; + } +}; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 10324e165a..507c2d3407 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1,9 +1,9 @@ -// NOTE: This is modified from clip.cpp only for LLaVA, -// so there might be still unnecessary artifacts hanging around -// I'll gradually clean and extend it -// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch #include "clip.h" #include "clip-impl.h" +#include "clip-model.h" +#include "clip-graph.h" +#include "models/models.h" + #include "ggml.h" #include "ggml-cpp.h" #include "ggml-alloc.h" @@ -27,18 +27,6 @@ struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL}; -enum ffn_op_type { - FFN_GELU, - FFN_GELU_ERF, - FFN_SILU, - FFN_GELU_QUICK, -}; - -enum norm_type { - NORM_TYPE_NORMAL, - NORM_TYPE_RMS, -}; - //#define CLIP_DEBUG_FUNCTIONS #ifdef CLIP_DEBUG_FUNCTIONS @@ -150,313 +138,6 @@ static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u #endif -// -// clip layers -// - -enum patch_merge_type { - PATCH_MERGE_FLAT, - PATCH_MERGE_SPATIAL_UNPAD, -}; - -struct clip_hparams { - int32_t image_size = 0; - int32_t patch_size = 0; - int32_t n_embd = 0; - int32_t n_ff = 0; - int32_t projection_dim = 0; - int32_t n_head = 0; - int32_t n_layer = 0; - // idefics3 - int32_t image_longest_edge = 0; - int32_t image_min_pixels = -1; - int32_t image_max_pixels = -1; - int32_t n_merge = 0; // number of patch merges **per-side** - - float image_mean[3]; - float image_std[3]; - - // for models using dynamic image size, we need to have a smaller image size to warmup - // otherwise, user will get OOM everytime they load the model - int32_t warmup_image_size = 0; - int32_t warmup_audio_size = 3000; - - ffn_op_type ffn_op = FFN_GELU; - - patch_merge_type 
mm_patch_merge_type = PATCH_MERGE_FLAT; - - float eps = 1e-6; - float rope_theta = 0.0; - - std::vector image_res_candidates; // for llava-uhd style models - int32_t image_crop_resolution; - std::unordered_set vision_feature_layer; - int32_t attn_window_size = 0; - int32_t n_wa_pattern = 0; - - // deepseek-ocr (sam) - int32_t sam_n_layer = 0; - int32_t sam_n_head = 0; - int32_t sam_n_embd = 0; - - // audio - int32_t n_mel_bins = 0; // whisper preprocessor - int32_t proj_stack_factor = 0; // ultravox - - // legacy - bool has_llava_projector = false; - int minicpmv_version = 0; - int32_t minicpmv_query_num = 0; // MiniCPM-V query number - - // custom value provided by user, can be undefined if not set - int32_t custom_image_min_tokens = -1; - int32_t custom_image_max_tokens = -1; - - void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) { - const int cur_merge = n_merge == 0 ? 1 : n_merge; - const int patch_area = patch_size * patch_size * cur_merge * cur_merge; - image_min_pixels = (custom_image_min_tokens > 0 ? custom_image_min_tokens : n_tokens_min) * patch_area; - image_max_pixels = (custom_image_max_tokens > 0 ? custom_image_max_tokens : n_tokens_max) * patch_area; - warmup_image_size = static_cast(std::sqrt(image_max_pixels)); - } - - void set_warmup_n_tokens(int n_tokens) { - int n_tok_per_side = static_cast(std::sqrt(n_tokens)); - GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n"); - const int cur_merge = n_merge == 0 ? 1 : n_merge; - warmup_image_size = n_tok_per_side * patch_size * cur_merge; - // TODO: support warmup size for custom token numbers - } - - // sam vit deepseek-ocr - std::vector global_attn_indices() const { - return { 2, 5, 8, 11 }; - } - - bool is_global_attn(int32_t layer) const { - const auto indices = global_attn_indices(); - - for (const auto & idx : indices) { - if (layer == idx) { - return true; - } - } - - return false; - } -}; - -struct clip_layer { - // attention - ggml_tensor * k_w = nullptr; - ggml_tensor * k_b = nullptr; - ggml_tensor * q_w = nullptr; - ggml_tensor * q_b = nullptr; - ggml_tensor * v_w = nullptr; - ggml_tensor * v_b = nullptr; - ggml_tensor * qkv_w = nullptr; - ggml_tensor * qkv_b = nullptr; - - ggml_tensor * o_w = nullptr; - ggml_tensor * o_b = nullptr; - - ggml_tensor * k_norm = nullptr; - ggml_tensor * q_norm = nullptr; - - // layernorm 1 - ggml_tensor * ln_1_w = nullptr; - ggml_tensor * ln_1_b = nullptr; - - ggml_tensor * ff_up_w = nullptr; - ggml_tensor * ff_up_b = nullptr; - ggml_tensor * ff_gate_w = nullptr; - ggml_tensor * ff_gate_b = nullptr; - ggml_tensor * ff_down_w = nullptr; - ggml_tensor * ff_down_b = nullptr; - - // layernorm 2 - ggml_tensor * ln_2_w = nullptr; - ggml_tensor * ln_2_b = nullptr; - - // layer scale (no bias) - ggml_tensor * ls_1_w = nullptr; - ggml_tensor * ls_2_w = nullptr; - - // qwen3vl deepstack merger - ggml_tensor * deepstack_norm_w = nullptr; - ggml_tensor * deepstack_norm_b = nullptr; - ggml_tensor * deepstack_fc1_w = nullptr; - ggml_tensor * deepstack_fc1_b = nullptr; - ggml_tensor * deepstack_fc2_w = nullptr; - ggml_tensor * deepstack_fc2_b = nullptr; - - bool has_deepstack() const { - return deepstack_fc1_w != nullptr; - } - - // sam rel_pos - ggml_tensor * rel_pos_w = nullptr; - ggml_tensor * rel_pos_h = nullptr; -}; - -struct clip_model { - clip_modality modality = CLIP_MODALITY_VISION; - projector_type proj_type = PROJECTOR_TYPE_MLP; - clip_hparams hparams; - - // embeddings - ggml_tensor * class_embedding = nullptr; - ggml_tensor * patch_embeddings_0 = 
nullptr; - ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL) - ggml_tensor * patch_bias = nullptr; - ggml_tensor * position_embeddings = nullptr; - - ggml_tensor * pre_ln_w = nullptr; - ggml_tensor * pre_ln_b = nullptr; - - std::vector layers; - - int32_t n_deepstack_layers = 0; // used by Qwen3-VL, calculated from clip_layer - - ggml_tensor * post_ln_w; - ggml_tensor * post_ln_b; - - ggml_tensor * fc_w; - ggml_tensor * fc_b; - ggml_tensor * mm_fc_w; - ggml_tensor * mm_fc_b; - - // LLaVA projection - ggml_tensor * mm_input_norm_w = nullptr; - ggml_tensor * mm_input_norm_b = nullptr; - ggml_tensor * mm_0_w = nullptr; - ggml_tensor * mm_0_b = nullptr; - ggml_tensor * mm_2_w = nullptr; - ggml_tensor * mm_2_b = nullptr; - - ggml_tensor * image_newline = nullptr; - ggml_tensor * view_seperator = nullptr; - - // Yi type models with mlp+normalization projection - ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4 - ggml_tensor * mm_1_b = nullptr; - ggml_tensor * mm_3_w = nullptr; - ggml_tensor * mm_3_b = nullptr; - ggml_tensor * mm_4_w = nullptr; - ggml_tensor * mm_4_b = nullptr; - - // GLMV-Edge projection - ggml_tensor * mm_model_adapter_conv_w = nullptr; - ggml_tensor * mm_model_adapter_conv_b = nullptr; - - // MobileVLM projection - ggml_tensor * mm_model_mlp_1_w = nullptr; - ggml_tensor * mm_model_mlp_1_b = nullptr; - ggml_tensor * mm_model_mlp_3_w = nullptr; - ggml_tensor * mm_model_mlp_3_b = nullptr; - ggml_tensor * mm_model_block_1_block_0_0_w = nullptr; - ggml_tensor * mm_model_block_1_block_0_1_w = nullptr; - ggml_tensor * mm_model_block_1_block_0_1_b = nullptr; - ggml_tensor * mm_model_block_1_block_1_fc1_w = nullptr; - ggml_tensor * mm_model_block_1_block_1_fc1_b = nullptr; - ggml_tensor * mm_model_block_1_block_1_fc2_w = nullptr; - ggml_tensor * mm_model_block_1_block_1_fc2_b = nullptr; - ggml_tensor * mm_model_block_1_block_2_0_w = nullptr; - ggml_tensor * mm_model_block_1_block_2_1_w = nullptr; - ggml_tensor * mm_model_block_1_block_2_1_b = nullptr; - ggml_tensor * mm_model_block_2_block_0_0_w = nullptr; - ggml_tensor * mm_model_block_2_block_0_1_w = nullptr; - ggml_tensor * mm_model_block_2_block_0_1_b = nullptr; - ggml_tensor * mm_model_block_2_block_1_fc1_w = nullptr; - ggml_tensor * mm_model_block_2_block_1_fc1_b = nullptr; - ggml_tensor * mm_model_block_2_block_1_fc2_w = nullptr; - ggml_tensor * mm_model_block_2_block_1_fc2_b = nullptr; - ggml_tensor * mm_model_block_2_block_2_0_w = nullptr; - ggml_tensor * mm_model_block_2_block_2_1_w = nullptr; - ggml_tensor * mm_model_block_2_block_2_1_b = nullptr; - - // MobileVLM_V2 projection - ggml_tensor * mm_model_mlp_0_w = nullptr; - ggml_tensor * mm_model_mlp_0_b = nullptr; - ggml_tensor * mm_model_mlp_2_w = nullptr; - ggml_tensor * mm_model_mlp_2_b = nullptr; - ggml_tensor * mm_model_peg_0_w = nullptr; - ggml_tensor * mm_model_peg_0_b = nullptr; - - // MINICPMV projection - ggml_tensor * mm_model_pos_embed_k = nullptr; - ggml_tensor * mm_model_query = nullptr; - ggml_tensor * mm_model_proj = nullptr; - ggml_tensor * mm_model_kv_proj = nullptr; - ggml_tensor * mm_model_attn_q_w = nullptr; - ggml_tensor * mm_model_attn_q_b = nullptr; - ggml_tensor * mm_model_attn_k_w = nullptr; - ggml_tensor * mm_model_attn_k_b = nullptr; - ggml_tensor * mm_model_attn_v_w = nullptr; - ggml_tensor * mm_model_attn_v_b = nullptr; - ggml_tensor * mm_model_attn_o_w = nullptr; - ggml_tensor * mm_model_attn_o_b = nullptr; - ggml_tensor * mm_model_ln_q_w = nullptr; - 
ggml_tensor * mm_model_ln_q_b = nullptr; - ggml_tensor * mm_model_ln_kv_w = nullptr; - ggml_tensor * mm_model_ln_kv_b = nullptr; - ggml_tensor * mm_model_ln_post_w = nullptr; - ggml_tensor * mm_model_ln_post_b = nullptr; - - // gemma3 - ggml_tensor * mm_input_proj_w = nullptr; - ggml_tensor * mm_soft_emb_norm_w = nullptr; - - // pixtral - ggml_tensor * token_embd_img_break = nullptr; - ggml_tensor * mm_patch_merger_w = nullptr; - - // ultravox / whisper encoder - ggml_tensor * conv1d_1_w = nullptr; - ggml_tensor * conv1d_1_b = nullptr; - ggml_tensor * conv1d_2_w = nullptr; - ggml_tensor * conv1d_2_b = nullptr; - ggml_tensor * mm_norm_pre_w = nullptr; - ggml_tensor * mm_norm_mid_w = nullptr; - - // cogvlm - ggml_tensor * mm_post_fc_norm_w = nullptr; - ggml_tensor * mm_post_fc_norm_b = nullptr; - ggml_tensor * mm_h_to_4h_w = nullptr; - ggml_tensor * mm_gate_w = nullptr; - ggml_tensor * mm_4h_to_h_w = nullptr; - ggml_tensor * mm_boi = nullptr; - ggml_tensor * mm_eoi = nullptr; - - // deepseek ocr sam - ggml_tensor * patch_embed_proj_w = nullptr; - ggml_tensor * patch_embed_proj_b = nullptr; - ggml_tensor * pos_embed = nullptr; - - bool audio_has_avgpool() const { - return proj_type == PROJECTOR_TYPE_QWEN2A - || proj_type == PROJECTOR_TYPE_VOXTRAL; - } - - bool audio_has_stack_frames() const { - return proj_type == PROJECTOR_TYPE_ULTRAVOX - || proj_type == PROJECTOR_TYPE_VOXTRAL; - } - ggml_tensor * neck_0_w; - ggml_tensor * neck_1_w; - ggml_tensor * neck_1_b; - ggml_tensor * neck_2_w; - ggml_tensor * neck_3_w; - ggml_tensor * neck_3_b; - ggml_tensor * net_2; - ggml_tensor * net_3; - - int32_t n_sam_layers = 12; // used by deepseek-ocr sam encoder - - std::vector sam_layers; - -}; - struct clip_ctx { clip_model model; @@ -539,2431 +220,618 @@ struct clip_ctx { } }; -struct clip_graph { - clip_ctx * ctx; - const clip_model & model; - const clip_hparams & hparams; +// +// clip_graph +// - // we only support single image per batch - const clip_image_f32 & img; +clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) : + model(ctx->model), + hparams(model.hparams), + proj_type(ctx->proj_type()), + img(img), + patch_size(hparams.patch_size), + n_patches_x(img.nx / patch_size), + n_patches_y(img.ny / patch_size), + n_patches(n_patches_x * n_patches_y), + n_embd(hparams.n_embd), + n_head(hparams.n_head), + d_head(n_embd / n_head), + n_layer(hparams.n_layer), + n_mmproj_embd(clip_n_mmproj_embd(ctx)), + eps(hparams.eps), + kq_scale(1.0f / sqrtf((float)d_head)), + flash_attn_type(ctx->flash_attn_type), + debug_graph(ctx->debug_graph), + debug_print_tensors(ctx->debug_print_tensors) { + struct ggml_init_params params = { + /*.mem_size =*/ ctx->buf_compute_meta.size(), + /*.mem_buffer =*/ ctx->buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + ctx0_ptr.reset(ggml_init(params)); + ctx0 = ctx0_ptr.get(); + gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false); +} - const int patch_size; - const int n_patches_x; - const int n_patches_y; - const int n_patches; - const int n_embd; - const int n_head; - const int d_head; - const int n_layer; - const float eps; - const float kq_scale; - - ggml_context_ptr ctx0_ptr; - ggml_context * ctx0; - ggml_cgraph * gf; - - clip_graph(clip_ctx * ctx, const clip_image_f32 & img) : - ctx(ctx), - model(ctx->model), - hparams(model.hparams), - img(img), - patch_size(hparams.patch_size), - n_patches_x(img.nx / patch_size), - n_patches_y(img.ny / patch_size), - n_patches(n_patches_x * n_patches_y), - n_embd(hparams.n_embd), - n_head(hparams.n_head), - 
d_head(n_embd / n_head), - n_layer(hparams.n_layer), - eps(hparams.eps), - kq_scale(1.0f / sqrtf((float)d_head)) { - struct ggml_init_params params = { - /*.mem_size =*/ ctx->buf_compute_meta.size(), - /*.mem_buffer =*/ ctx->buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; - ctx0_ptr.reset(ggml_init(params)); - ctx0 = ctx0_ptr.get(); - gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false); - } - - ggml_cgraph * build_siglip() { - ggml_tensor * inp = build_inp(); - - ggml_tensor * learned_pos_embd = model.position_embeddings; - if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) { - learned_pos_embd = resize_position_embeddings(); - } - - ggml_tensor * cur = build_vit( - inp, n_patches, - NORM_TYPE_NORMAL, - hparams.ffn_op, - learned_pos_embd, - nullptr); - - if (ctx->proj_type() == PROJECTOR_TYPE_GEMMA3) { - const int batch_size = 1; - GGML_ASSERT(n_patches_x == n_patches_y); - const int patches_per_image = n_patches_x; - const int kernel_size = hparams.n_merge; - - cur = ggml_transpose(ctx0, cur); - cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size); - - // doing a pool2d to reduce the number of output tokens - cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0); - cur = ggml_reshape_3d(ctx0, cur, cur->ne[0] * cur->ne[0], n_embd, batch_size); - cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - - // apply norm before projection - cur = ggml_rms_norm(ctx0, cur, eps); - cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w); - - // apply projection - cur = ggml_mul_mat(ctx0, - ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)), - cur); - - } else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) { - // pixel_shuffle - // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578 - const int scale_factor = model.hparams.n_merge; - cur = build_patch_merge_permute(cur, scale_factor); - cur = ggml_mul_mat(ctx0, model.fc_w, cur); - - } else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) { - // pixel unshuffle block - const int scale_factor = model.hparams.n_merge; - cur = build_patch_merge_permute(cur, scale_factor); - - // projection - cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm - cur = ggml_mul(ctx0, cur, model.mm_input_norm_w); - cur = ggml_add(ctx0, cur, model.mm_input_norm_b); - - cur = build_ffn(cur, - model.mm_1_w, model.mm_1_b, - nullptr, nullptr, - model.mm_2_w, model.mm_2_b, - FFN_GELU, - -1); - - } else if (ctx->proj_type() == PROJECTOR_TYPE_JANUS_PRO) { - cur = build_ffn(cur, - model.mm_0_w, model.mm_0_b, - nullptr, nullptr, - model.mm_1_w, model.mm_1_b, - hparams.ffn_op, - -1); - - } else { - GGML_ABORT("SigLIP: Unsupported projector type"); - } - - // build the graph +void clip_graph::cb(ggml_tensor * cur0, const char * name, int il) const { + if (debug_graph) { + ggml_tensor * cur = ggml_cpy(ctx0, cur0, ggml_dup_tensor(ctx0, cur0)); + std::string cur_name = il >= 0 ? 
std::string(name) + "_" + std::to_string(il) : name; + ggml_set_name(cur, cur_name.c_str()); + ggml_set_output(cur); ggml_build_forward_expand(gf, cur); + debug_print_tensors.push_back(cur); + } +} - return gf; +// siglip2 naflex +ggml_tensor * clip_graph::resize_position_embeddings() { + ggml_tensor * pos_embd = model.position_embeddings; + const int height = img.ny / patch_size; + const int width = img.nx / patch_size; + const uint32_t mode = GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS; + const int n_per_side = (int)std::sqrt(pos_embd->ne[1]); + + GGML_ASSERT(pos_embd); + + if (height == n_per_side && width == n_per_side) { + return pos_embd; } - ggml_cgraph * build_deepseek_ocr() { - //patch embedding - ggml_tensor * inp_raw = build_inp_raw(); - ggml_tensor * sam_out = build_sam(inp_raw); - ggml_tensor * clip_out = build_dsocr_clip(sam_out); - - int clip_n_patches = sam_out->ne[0] * sam_out->ne[1]; - - sam_out = ggml_cont(ctx0, ggml_permute(ctx0, sam_out, 1, 2, 0, 3)); - sam_out = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0], clip_n_patches); - clip_out = ggml_view_2d(ctx0, clip_out, n_embd, clip_n_patches, clip_out->nb[1], clip_out->nb[1]); - - ggml_tensor * cur; - cur = ggml_concat(ctx0, clip_out, sam_out, 0); - cur = ggml_reshape_2d(ctx0, cur, 2 * n_embd, clip_n_patches); - cur = ggml_cont(ctx0, cur); - cur = ggml_mul_mat(ctx0, model.fc_w, cur); - cur = ggml_add(ctx0, cur, model.fc_b); - - const auto h = static_cast(std::sqrt(static_cast(cur->ne[1]))); - const auto w = h; - const auto n_dim = cur->ne[0]; - - ggml_tensor * imgnl; - ggml_tensor * vs; - - imgnl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, h, 1); - vs = ggml_reshape_2d(ctx0, model.view_seperator, n_dim, 1); // (n_dim, 1) - cur = ggml_reshape_3d(ctx0, cur, n_dim, w, h); - cur = ggml_reshape_2d(ctx0, ggml_concat(ctx0, cur, imgnl, 1), n_dim, (w + 1) * h); - cur = ggml_concat(ctx0, cur, vs, 1); // (n_dim, h*(w+1) + 1) - - cb(cur, "dsocr_output", -1); - - ggml_build_forward_expand(gf, cur); - return gf; - } - - ggml_cgraph * build_pixtral() { - const int n_merge = hparams.n_merge; - - // 2D input positions - ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); - ggml_set_name(pos_h, "pos_h"); - ggml_set_input(pos_h); - - ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); - ggml_set_name(pos_w, "pos_w"); - ggml_set_input(pos_w); - - auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { - return build_rope_2d(ctx0, cur, pos_h, pos_w, hparams.rope_theta, true); - }; - - ggml_tensor * inp = build_inp(); - ggml_tensor * cur = build_vit( - inp, n_patches, - NORM_TYPE_RMS, - hparams.ffn_op, - nullptr, // no learned pos embd - add_pos); - - // mistral small 3.1 patch merger - // ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67 - if (model.mm_patch_merger_w) { - GGML_ASSERT(hparams.n_merge > 0); - - cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w); - - // reshape image tokens to 2D grid - cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y); - cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, n_embd] - cur = ggml_cont(ctx0, cur); - - // torch.nn.functional.unfold is just an im2col under the hood - // we just need a dummy kernel to make it work - ggml_tensor * kernel = ggml_view_3d(ctx0, cur, n_merge, n_merge, cur->ne[2], 0, 0, 0); - cur = ggml_im2col(ctx0, kernel, cur, n_merge, n_merge, 0, 0, 1, 1, true, inp->type); - - // 
project to n_embd - cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]); - cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur); - } - - // LlavaMultiModalProjector (always using GELU activation) - { - cur = build_ffn(cur, - model.mm_1_w, model.mm_1_b, - nullptr, nullptr, - model.mm_2_w, model.mm_2_b, - FFN_GELU, - -1); - } - - // arrangement of the [IMG_BREAK] token - if (model.token_embd_img_break) { - // not efficient, but works - // the trick is to view the embeddings as a 3D tensor with shape [n_embd, n_patches_per_row, n_rows] - // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension - // after the concatenation, we have a tensor with shape [n_embd, n_patches_per_row + 1, n_rows] - - const int p_y = n_merge > 0 ? n_patches_y / n_merge : n_patches_y; - const int p_x = n_merge > 0 ? n_patches_x / n_merge : n_patches_x; - const int p_total = p_x * p_y; - const int n_embd_text = cur->ne[0]; - const int n_tokens_output = p_total + p_y - 1; // one [IMG_BREAK] per row, except the last row - - ggml_tensor * tmp = ggml_reshape_3d(ctx0, cur, n_embd_text, p_x, p_y); - ggml_tensor * tok = ggml_new_tensor_3d(ctx0, tmp->type, n_embd_text, 1, p_y); - tok = ggml_scale(ctx0, tok, 0.0); // clear the tensor - tok = ggml_add(ctx0, tok, model.token_embd_img_break); - tmp = ggml_concat(ctx0, tmp, tok, 1); - cur = ggml_view_2d(ctx0, tmp, - n_embd_text, n_tokens_output, - ggml_row_size(tmp->type, n_embd_text), 0); - } - - // build the graph - ggml_build_forward_expand(gf, cur); - - return gf; - } - - // Qwen2VL and Qwen2.5VL use M-RoPE - ggml_cgraph * build_qwen2vl() { - GGML_ASSERT(model.patch_bias == nullptr); - GGML_ASSERT(model.class_embedding == nullptr); - - const int batch_size = 1; - const bool use_window_attn = hparams.n_wa_pattern > 0; - const int n_wa_pattern = hparams.n_wa_pattern; - const int n_pos = n_patches; - const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position - - norm_type norm_t = ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL - ? 
NORM_TYPE_RMS // qwen 2.5 vl - : NORM_TYPE_NORMAL; // qwen 2 vl - - int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; - - ggml_tensor * inp_raw = build_inp_raw(); - ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - - GGML_ASSERT(img.nx % (patch_size * 2) == 0); - GGML_ASSERT(img.ny % (patch_size * 2) == 0); - - // second conv dimension - { - auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - inp = ggml_add(ctx0, inp, inp_1); - - inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b] - inp = ggml_cont_4d( - ctx0, inp, - n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); - inp = ggml_reshape_4d( - ctx0, inp, - n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); - inp = ggml_permute(ctx0, inp, 0, 2, 1, 3); - inp = ggml_cont_3d( - ctx0, inp, - n_embd, n_patches_x * n_patches_y, batch_size); - } - - ggml_tensor * inpL = inp; - ggml_tensor * window_mask = nullptr; - ggml_tensor * window_idx = nullptr; - ggml_tensor * inv_window_idx = nullptr; - - ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); - ggml_set_name(positions, "positions"); - ggml_set_input(positions); - - // pre-layernorm - if (model.pre_ln_w) { - inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); - } - - if (use_window_attn) { - // handle window attention inputs - inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4); - ggml_set_name(inv_window_idx, "inv_window_idx"); - ggml_set_input(inv_window_idx); - // mask for window attention - window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos); - ggml_set_name(window_mask, "window_mask"); - ggml_set_input(window_mask); - - // if flash attn is used, we need to pad the mask and cast to f16 - if (ctx->flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) { - window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16); - } - - // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size] - GGML_ASSERT(batch_size == 1); - inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4); - inpL = ggml_get_rows(ctx0, inpL, inv_window_idx); - inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size); - } - - // loop over layers - for (int il = 0; il < n_layer; il++) { - const auto & layer = model.layers[il]; - const bool full_attn = use_window_attn ? 
(il + 1) % n_wa_pattern == 0 : true; - - ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states - - // layernorm1 - cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); - cb(cur, "ln1", il); - - // self-attention - { - ggml_tensor * Qcur = ggml_add(ctx0, - ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b); - ggml_tensor * Kcur = ggml_add(ctx0, - ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b); - ggml_tensor * Vcur = ggml_add(ctx0, - ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b); - - Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches); - Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches); - Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - // apply M-RoPE - Qcur = ggml_rope_multi( - ctx0, Qcur, positions, nullptr, - d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); - Kcur = ggml_rope_multi( - ctx0, Kcur, positions, nullptr, - d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); - - cb(Qcur, "Qcur_rope", il); - cb(Kcur, "Kcur_rope", il); - - ggml_tensor * attn_mask = full_attn ? nullptr : window_mask; - - cur = build_attn(layer.o_w, layer.o_b, - Qcur, Kcur, Vcur, attn_mask, kq_scale, il); - cb(cur, "attn_out", il); - } - - // re-add the layer input, e.g., residual - cur = ggml_add(ctx0, cur, inpL); - - inpL = cur; // inpL = residual, cur = hidden_states - - cb(cur, "ffn_inp", il); - - // layernorm2 - cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); - cb(cur, "ffn_inp_normed", il); - - // ffn - cur = build_ffn(cur, - layer.ff_up_w, layer.ff_up_b, - layer.ff_gate_w, layer.ff_gate_b, - layer.ff_down_w, layer.ff_down_b, - hparams.ffn_op, il); - - cb(cur, "ffn_out", il); - - // residual 2 - cur = ggml_add(ctx0, inpL, cur); - cb(cur, "layer_out", il); - - inpL = cur; - } - - // post-layernorm - if (model.post_ln_w) { - inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer); - } - - // multimodal projection - ggml_tensor * embeddings = inpL; - embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size); - embeddings = build_ffn(embeddings, - model.mm_0_w, model.mm_0_b, - nullptr, nullptr, - model.mm_1_w, model.mm_1_b, - FFN_GELU, - -1); - - if (use_window_attn) { - window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4); - ggml_set_name(window_idx, "window_idx"); - ggml_set_input(window_idx); - - // embeddings shape: [n_embd, n_patches_x * n_patches_y, batch_size] - GGML_ASSERT(batch_size == 1); - embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4); - embeddings = ggml_get_rows(ctx0, embeddings, window_idx); - embeddings = ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4, batch_size); - } - - // build the graph - ggml_build_forward_expand(gf, embeddings); - - return gf; - } - - // Qwen3VL - ggml_cgraph * build_qwen3vl() { - GGML_ASSERT(model.patch_bias != nullptr); - GGML_ASSERT(model.position_embeddings != nullptr); - GGML_ASSERT(model.class_embedding == nullptr); - - const int batch_size = 1; - const int n_pos = n_patches; - const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position - - norm_type norm_t = NORM_TYPE_NORMAL; - - int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; - - ggml_tensor * inp_raw = build_inp_raw(); - ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, 
patch_size, 0, 0, 1, 1); - - GGML_ASSERT(img.nx % (patch_size * 2) == 0); - GGML_ASSERT(img.ny % (patch_size * 2) == 0); - - // second conv dimension - { - auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - inp = ggml_add(ctx0, inp, inp_1); - - inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b] - inp = ggml_cont_4d( - ctx0, inp, - n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); - inp = ggml_reshape_4d( - ctx0, inp, - n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); - inp = ggml_permute(ctx0, inp, 0, 2, 1, 3); - inp = ggml_cont_3d( - ctx0, inp, - n_embd, n_patches_x * n_patches_y, batch_size); - } - - // add patch bias - if (model.patch_bias != nullptr) { - inp = ggml_add(ctx0, inp, model.patch_bias); - cb(inp, "patch_bias", -1); - } - - // calculate absolute position embedding and apply - ggml_tensor * learned_pos_embd = resize_position_embeddings(); - learned_pos_embd = ggml_cont_4d( - ctx0, learned_pos_embd, - n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); - learned_pos_embd = ggml_reshape_4d( - ctx0, learned_pos_embd, - n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); - learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3); - learned_pos_embd = ggml_cont_3d( - ctx0, learned_pos_embd, - n_embd, n_patches_x * n_patches_y, batch_size); + pos_embd = ggml_reshape_3d(ctx0, pos_embd, n_embd, n_per_side, n_per_side); // -> (n_embd, n_per_side, n_per_side) + pos_embd = ggml_permute(ctx0, pos_embd, 2, 0, 1, 3); // -> (n_per_side, n_per_side, n_embd) + pos_embd = ggml_interpolate(ctx0, pos_embd, width, height, n_embd, 1, mode); // -> (width, height, n_embd) + pos_embd = ggml_permute(ctx0, pos_embd, 1, 2, 0, 3); // -> (n_embd, width, height) + pos_embd = ggml_cont_2d(ctx0, pos_embd, n_embd, width * height); // -> (n_embd, width * height) + + return pos_embd; +} + +// build vision transformer (ViT) cgraph +// this function should cover most of the models +// if your model has specific features, you should probably duplicate this function +ggml_tensor * clip_graph::build_vit( + ggml_tensor * inp, + int64_t n_pos, + norm_type norm_t, + ffn_op_type ffn_t, + ggml_tensor * learned_pos_embd, + std::function add_pos + ) { + if (learned_pos_embd) { inp = ggml_add(ctx0, inp, learned_pos_embd); - cb(inp, "inp_pos_emb", -1); + cb(inp, "pos_embed", -1); + } - ggml_tensor * inpL = inp; + ggml_tensor * inpL = inp; - ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); - ggml_set_name(positions, "positions"); - ggml_set_input(positions); + // pre-layernorm + if (model.pre_ln_w) { + inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); + cb(inpL, "pre_ln", -1); + } - // pre-layernorm - if (model.pre_ln_w) { - inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); - } + // loop over layers + for (int il = 0; il < n_layer; il++) { + auto & layer = model.layers[il]; + ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states - // deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size] - ggml_tensor * deepstack_features = nullptr; - const int merge_factor = hparams.n_merge > 0 ? 
hparams.n_merge * hparams.n_merge : 4; // default 2x2=4 for qwen3vl + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); + cb(cur, "layer_inp_normed", il); - // loop over layers - for (int il = 0; il < n_layer; il++) { - auto & layer = model.layers[il]; - - ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states - - // layernorm1 - cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); - cb(cur, "ln1", il); - - // self-attention - { + // self-attention + { + ggml_tensor * Qcur = nullptr; + ggml_tensor * Kcur = nullptr; + ggml_tensor * Vcur = nullptr; + if (layer.qkv_w != nullptr) { + // fused qkv cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); - cur = ggml_add(ctx0, cur, layer.qkv_b); - - ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, - /* nb1 */ ggml_row_size(cur->type, d_head), - /* nb2 */ cur->nb[1], - /* offset */ 0); - - ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, - /* nb1 */ ggml_row_size(cur->type, d_head), - /* nb2 */ cur->nb[1], - /* offset */ ggml_row_size(cur->type, n_embd)); - - ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, - /* nb1 */ ggml_row_size(cur->type, d_head), - /* nb2 */ cur->nb[1], - /* offset */ ggml_row_size(cur->type, 2 * n_embd)); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - // apply M-RoPE - Qcur = ggml_rope_multi( - ctx0, Qcur, positions, nullptr, - d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); - Kcur = ggml_rope_multi( - ctx0, Kcur, positions, nullptr, - d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); - - cb(Qcur, "Qcur_rope", il); - cb(Kcur, "Kcur_rope", il); - - cur = build_attn(layer.o_w, layer.o_b, - Qcur, Kcur, Vcur, nullptr, kq_scale, il); - cb(cur, "attn_out", il); - } - - // re-add the layer input, e.g., residual - cur = ggml_add(ctx0, cur, inpL); - - inpL = cur; // inpL = residual, cur = hidden_states - - cb(cur, "ffn_inp", il); - - // layernorm2 - cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); - cb(cur, "ffn_inp_normed", il); - - // ffn - cur = build_ffn(cur, - layer.ff_up_w, layer.ff_up_b, - layer.ff_gate_w, layer.ff_gate_b, - layer.ff_down_w, layer.ff_down_b, - hparams.ffn_op, il); - - cb(cur, "ffn_out", il); - - // residual 2 - cur = ggml_add(ctx0, inpL, cur); - cb(cur, "layer_out", il); - - if (layer.has_deepstack()) { - ggml_tensor * feat = ggml_reshape_3d(ctx0, cur, n_embd * merge_factor, n_pos / merge_factor, batch_size); - feat = build_norm(feat, layer.deepstack_norm_w, layer.deepstack_norm_b, norm_t, eps, il); - feat = build_ffn(feat, - layer.deepstack_fc1_w, layer.deepstack_fc1_b, - nullptr, nullptr, - layer.deepstack_fc2_w, layer.deepstack_fc2_b, - ffn_op_type::FFN_GELU, il); - - if(!deepstack_features) { - deepstack_features = feat; - } else { - // concat along the feature dimension - deepstack_features = ggml_concat(ctx0, deepstack_features, feat, 0); + if (layer.qkv_b != nullptr) { + cur = ggml_add(ctx0, cur, layer.qkv_b); } - } - inpL = cur; - } + Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ 0); - // post-layernorm - if (model.post_ln_w) { - inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer); - } + Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ ggml_row_size(cur->type, n_embd)); - // 
multimodal projection - ggml_tensor * embeddings = inpL; - embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size); + Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ ggml_row_size(cur->type, 2 * n_embd)); - embeddings = build_ffn(embeddings, - model.mm_0_w, model.mm_0_b, - nullptr, nullptr, - model.mm_1_w, model.mm_1_b, - ffn_op_type::FFN_GELU, -1); + // TODO: q/k norm requires row size == n_embd, while here it's d_head + // we can add support in the future if needed + GGML_ASSERT(layer.q_norm == nullptr && layer.k_norm == nullptr); - embeddings = ggml_concat(ctx0, embeddings, deepstack_features, 0); // concat along the feature dimension - - // build the graph - ggml_build_forward_expand(gf, embeddings); - - return gf; - } - - ggml_cgraph * build_minicpmv() { - GGML_ASSERT(model.class_embedding == nullptr); - const int n_pos = n_patches; - const int n_embd_proj = clip_n_mmproj_embd(ctx); - - // position embeddings for the projector (not for ViT) - // see: https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/resampler.py#L70 - // base frequency omega - ggml_tensor * omega = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_embd_proj / 4); - ggml_set_name(omega, "omega"); - ggml_set_input(omega); - - // 2D input positions (using float for sinusoidal embeddings) - ggml_tensor * pos_h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos); - ggml_set_name(pos_h, "pos_h"); - ggml_set_input(pos_h); - ggml_tensor * pos_w = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos); - ggml_set_name(pos_w, "pos_w"); - ggml_set_input(pos_w); - - // for selecting learned pos embd, used by ViT - struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); - ggml_set_name(positions, "positions"); - ggml_set_input(positions); - - ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions); - - ggml_tensor * inp = build_inp(); - ggml_tensor * embeddings = build_vit( - inp, n_pos, - NORM_TYPE_NORMAL, - hparams.ffn_op, - learned_pos_embd, - nullptr); - - // resampler projector (it is just another transformer) - - ggml_tensor * q = model.mm_model_query; - ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings); - - // norm - q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1); - v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1); - - // calculate sinusoidal pos embd - ggml_tensor * pos_embed = nullptr; - { - // outer product - ggml_tensor * omega_b = ggml_repeat_4d(ctx0, omega, omega->ne[0], n_pos, 1, 1); // n_pos rows - ggml_tensor * theta_x = ggml_mul(ctx0, omega_b, pos_w); - ggml_tensor * theta_y = ggml_mul(ctx0, omega_b, pos_h); - // sin and cos - ggml_tensor * pos_embd_x = ggml_concat( - ctx0, - ggml_sin(ctx0, theta_x), - ggml_cos(ctx0, theta_x), - 0 // concat on first dim - ); - ggml_tensor * pos_embd_y = ggml_concat( - ctx0, - ggml_sin(ctx0, theta_y), - ggml_cos(ctx0, theta_y), - 0 // concat on first dim - ); - pos_embed = ggml_concat(ctx0, pos_embd_x, pos_embd_y, 0); - } - - // k = v + pos_embed - ggml_tensor * k = ggml_add(ctx0, v, pos_embed); - - // attention - { - const int d_head = 128; - int n_head = n_embd_proj/d_head; - // Use actual config value if available, otherwise fall back to hardcoded values - int num_query = ctx->model.hparams.minicpmv_query_num; - ggml_tensor * Q = ggml_add(ctx0, - ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), - 
model.mm_model_attn_q_b); - ggml_tensor * K = ggml_add(ctx0, - ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), - model.mm_model_attn_k_b); - ggml_tensor * V = ggml_add(ctx0, - ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), - model.mm_model_attn_v_b); - - Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_query); - K = ggml_reshape_3d(ctx0, K, d_head, n_head, n_pos); - V = ggml_reshape_3d(ctx0, V, d_head, n_head, n_pos); - - cb(Q, "resampler_Q", -1); - cb(K, "resampler_K", -1); - cb(V, "resampler_V", -1); - - float resampler_kq_scale = 1.0f/ sqrtf(float(d_head)); - embeddings = build_attn( - model.mm_model_attn_o_w, - model.mm_model_attn_o_b, - Q, K, V, nullptr, resampler_kq_scale, -1); - cb(embeddings, "resampler_attn_out", -1); - } - // layernorm - embeddings = build_norm(embeddings, model.mm_model_ln_post_w, model.mm_model_ln_post_b, NORM_TYPE_NORMAL, eps, -1); - - // projection - embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings); - - // build the graph - ggml_build_forward_expand(gf, embeddings); - - return gf; - } - - ggml_cgraph * build_internvl() { - GGML_ASSERT(model.class_embedding != nullptr); - GGML_ASSERT(model.position_embeddings != nullptr); - - const int n_pos = n_patches + 1; - ggml_tensor * inp = build_inp(); - - // add CLS token - inp = ggml_concat(ctx0, inp, model.class_embedding, 1); - - // The larger models use a different ViT, which uses RMS norm instead of layer norm - // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188 - norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45) - ? NORM_TYPE_RMS // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B) - : NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models) - - ggml_tensor * cur = build_vit( - inp, n_pos, - norm_t, - hparams.ffn_op, - model.position_embeddings, - nullptr); - - // remove CLS token - cur = ggml_view_2d(ctx0, cur, - n_embd, n_patches, - ggml_row_size(cur->type, n_embd), 0); - - // pixel shuffle - { - const int scale_factor = model.hparams.n_merge; - const int bsz = 1; // batch size, always 1 for now since we don't support batching - const int height = n_patches_y; - const int width = n_patches_x; - GGML_ASSERT(scale_factor > 0); - cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz); - cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - cur = ggml_cont_4d(ctx0, cur, - n_embd * scale_factor * scale_factor, - height / scale_factor, - width / scale_factor, - bsz); - cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - // flatten to 2D - cur = ggml_cont_2d(ctx0, cur, - n_embd * scale_factor * scale_factor, - cur->ne[1] * cur->ne[2]); - } - - // projector (always using GELU activation) - { - // projector LayerNorm uses pytorch's default eps = 1e-5 - // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79 - cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1); - cur = build_ffn(cur, - model.mm_1_w, model.mm_1_b, - nullptr, nullptr, - model.mm_3_w, model.mm_3_b, - FFN_GELU, - -1); - } - - // build the graph - ggml_build_forward_expand(gf, cur); - - return gf; - } - - ggml_cgraph * build_llama4() { - GGML_ASSERT(model.class_embedding != nullptr); - GGML_ASSERT(model.position_embeddings != nullptr); - - const int n_pos = n_patches + 1; // +1 for [CLS] - - // 2D input positions - ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); - ggml_set_name(pos_h, "pos_h"); - ggml_set_input(pos_h); - - ggml_tensor * 
pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); - ggml_set_name(pos_w, "pos_w"); - ggml_set_input(pos_w); - - ggml_tensor * inp = build_inp_raw(); - - // Llama4UnfoldConvolution - { - ggml_tensor * kernel = ggml_reshape_4d(ctx0, model.patch_embeddings_0, - patch_size, patch_size, 3, n_embd); - inp = ggml_im2col(ctx0, kernel, inp, patch_size, patch_size, 0, 0, 1, 1, true, inp->type); - inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp); - inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches); - cb(inp, "patch_conv", -1); - } - - // add CLS token - inp = ggml_concat(ctx0, inp, model.class_embedding, 1); - - // build ViT with 2D position embeddings - auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { - // first half is X axis and second half is Y axis - // ref: https://github.com/huggingface/transformers/blob/40a493c7ed4f19f08eadb0639cf26d49bfa5e180/src/transformers/models/llama4/modeling_llama4.py#L1312 - // ref: https://github.com/Blaizzy/mlx-vlm/blob/a57156aa87b33cca6e5ee6cfc14dd4ef8f611be6/mlx_vlm/models/llama4/vision.py#L441 - return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false); - }; - ggml_tensor * cur = build_vit( - inp, n_pos, - NORM_TYPE_NORMAL, - hparams.ffn_op, - model.position_embeddings, - add_pos); - - // remove CLS token - cur = ggml_view_2d(ctx0, cur, - n_embd, n_patches, - ggml_row_size(cur->type, n_embd), 0); - - // pixel shuffle - // based on Llama4VisionPixelShuffleMLP - // https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151 - { - const int scale_factor = model.hparams.n_merge; - const int bsz = 1; // batch size, always 1 for now since we don't support batching - GGML_ASSERT(scale_factor > 0); - GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images - cur = ggml_reshape_4d(ctx0, cur, - n_embd * scale_factor, - n_patches_x / scale_factor, - n_patches_y, - bsz); - cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - cur = ggml_cont_4d(ctx0, cur, - n_embd * scale_factor * scale_factor, - n_patches_x / scale_factor, - n_patches_y / scale_factor, - bsz); - //cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - // flatten to 2D - cur = ggml_cont_2d(ctx0, cur, - n_embd * scale_factor * scale_factor, - n_patches / scale_factor / scale_factor); - cb(cur, "pixel_shuffle", -1); - } - - // based on Llama4VisionMLP2 (always uses GELU activation, no bias) - { - cur = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, cur); - cur = ggml_gelu(ctx0, cur); - cur = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, cur); - cur = ggml_gelu(ctx0, cur); - cb(cur, "adapter_mlp", -1); - } - - // Llama4MultiModalProjector - cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur); - cb(cur, "projected", -1); - - // build the graph - ggml_build_forward_expand(gf, cur); - - return gf; - } - - ggml_cgraph * build_kimivl() { - // 2D input positions - ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); - ggml_set_name(pos_h, "pos_h"); - ggml_set_input(pos_h); - - ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); - ggml_set_name(pos_w, "pos_w"); - ggml_set_input(pos_w); - - ggml_tensor * learned_pos_embd = resize_position_embeddings(); - - // build ViT with 2D position embeddings - auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { - // first half is X axis and second half is Y axis - return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false); - }; - - ggml_tensor * inp = build_inp(); - ggml_tensor * cur = build_vit( 
- inp, n_patches, - NORM_TYPE_NORMAL, - hparams.ffn_op, - learned_pos_embd, - add_pos); - - cb(cur, "vit_out", -1); - - { - // patch_merger - const int scale_factor = model.hparams.n_merge; - cur = build_patch_merge_permute(cur, scale_factor); - - // projection norm - int proj_inp_dim = cur->ne[0]; - cur = ggml_view_2d(ctx0, cur, - n_embd, cur->ne[1] * scale_factor * scale_factor, - ggml_row_size(cur->type, n_embd), 0); - cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm - cur = ggml_mul(ctx0, cur, model.mm_input_norm_w); - cur = ggml_add(ctx0, cur, model.mm_input_norm_b); - cur = ggml_view_2d(ctx0, cur, - proj_inp_dim, cur->ne[1] / scale_factor / scale_factor, - ggml_row_size(cur->type, proj_inp_dim), 0); - cb(cur, "proj_inp_normed", -1); - - // projection mlp - cur = build_ffn(cur, - model.mm_1_w, model.mm_1_b, - nullptr, nullptr, - model.mm_2_w, model.mm_2_b, - FFN_GELU, - -1); - cb(cur, "proj_out", -1); - } - - // build the graph - ggml_build_forward_expand(gf, cur); - - return gf; - } - - // this graph is used by llava, granite and glm - // due to having embedding_stack (used by granite), we cannot reuse build_vit - ggml_cgraph * build_llava() { - const int batch_size = 1; - const int n_pos = n_patches + (model.class_embedding ? 1 : 0); - - GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported"); - - // Calculate the deepest feature layer based on hparams and projector type - int max_feature_layer = n_layer; - { - // Get the index of the second to last layer; this is the default for models that have a llava projector - int il_last = hparams.n_layer - 1; - int deepest_feature_layer = -1; - - if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV || ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE) { - il_last += 1; - } - - // If we set explicit vision feature layers, only go up to the deepest one - // NOTE: only used by granite-vision models for now - for (const auto & feature_layer : hparams.vision_feature_layer) { - if (feature_layer > deepest_feature_layer) { - deepest_feature_layer = feature_layer; - } - } - max_feature_layer = deepest_feature_layer < 0 ? il_last : deepest_feature_layer; - } - - ggml_tensor * inp = build_inp(); - - // concat class_embeddings and patch_embeddings - if (model.class_embedding) { - inp = ggml_concat(ctx0, inp, model.class_embedding, 1); - } - - ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); - ggml_set_name(positions, "positions"); - ggml_set_input(positions); - - inp = ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions)); - - ggml_tensor * inpL = inp; - - // pre-layernorm - if (model.pre_ln_w) { - inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1); - cb(inpL, "pre_ln", -1); - } - - std::vector embedding_stack; - const auto & vision_feature_layer = hparams.vision_feature_layer; - - // loop over layers - for (int il = 0; il < max_feature_layer; il++) { - auto & layer = model.layers[il]; - ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states - - // If this is an embedding feature layer, save the output. - // NOTE: 0 index here refers to the input to the encoder. 
- if (vision_feature_layer.find(il) != vision_feature_layer.end()) { - embedding_stack.push_back(cur); - } - - // layernorm1 - cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); - cb(cur, "layer_inp_normed", il); - - // self-attention - { - ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur); + } else { + // separate q, k, v + Qcur = ggml_mul_mat(ctx0, layer.q_w, cur); if (layer.q_b) { Qcur = ggml_add(ctx0, Qcur, layer.q_b); } - ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur); + Kcur = ggml_mul_mat(ctx0, layer.k_w, cur); if (layer.k_b) { Kcur = ggml_add(ctx0, Kcur, layer.k_b); } - ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur); + Vcur = ggml_mul_mat(ctx0, layer.v_w, cur); if (layer.v_b) { Vcur = ggml_add(ctx0, Vcur, layer.v_b); } + if (layer.q_norm) { + Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il); + cb(Qcur, "Qcur_norm", il); + } + + if (layer.k_norm) { + Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il); + cb(Kcur, "Kcur_norm", il); + } + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos); Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos); Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - cur = build_attn(layer.o_w, layer.o_b, - Qcur, Kcur, Vcur, nullptr, kq_scale, il); - cb(cur, "attn_out", il); } - // re-add the layer input, e.g., residual - cur = ggml_add(ctx0, cur, inpL); - - inpL = cur; // inpL = residual, cur = hidden_states - - cb(cur, "ffn_inp", il); - - // layernorm2 - cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); - cb(cur, "ffn_inp_normed", il); - - // ffn - cur = build_ffn(cur, - layer.ff_up_w, layer.ff_up_b, - layer.ff_gate_w, layer.ff_gate_b, - layer.ff_down_w, layer.ff_down_b, - hparams.ffn_op, il); - - cb(cur, "ffn_out", il); - - // residual 2 - cur = ggml_add(ctx0, inpL, cur); - cb(cur, "layer_out", il); - - inpL = cur; - } - - // post-layernorm - if (model.post_ln_w) { - inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1); - } - - ggml_tensor * embeddings = inpL; - - // process vision feature layers (used by granite) - { - // final layer is a vision feature layer - if (vision_feature_layer.find(max_feature_layer) != vision_feature_layer.end()) { - embedding_stack.push_back(inpL); - } - - // If feature layers are explicitly set, stack them (if we have multiple) - if (!embedding_stack.empty()) { - embeddings = embedding_stack[0]; - for (size_t i = 1; i < embedding_stack.size(); i++) { - embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0); - } - } - } - - // llava projector (also used by granite) - if (ctx->model.hparams.has_llava_projector) { - embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); - - ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); - ggml_set_name(patches, "patches"); - ggml_set_input(patches); - - // shape [1, 576, 1024] - // ne is whcn, ne = [1024, 576, 1, 1] - embeddings = ggml_get_rows(ctx0, embeddings, patches); - - // print_tensor_info(embeddings, "embeddings"); - - // llava projector - if (ctx->proj_type() == PROJECTOR_TYPE_MLP) { - embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); - - embeddings = ggml_gelu(ctx0, embeddings); - if (model.mm_2_w) { - embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); - } - } - 
else if (ctx->proj_type() == PROJECTOR_TYPE_MLP_NORM) { - embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); - // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false); - // First LayerNorm - embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w), - model.mm_1_b); - - // GELU activation - embeddings = ggml_gelu(ctx0, embeddings); - - // Second linear layer - embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_3_b); - - // Second LayerNorm - embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w), - model.mm_4_b); - } - else if (ctx->proj_type() == PROJECTOR_TYPE_LDP) { - // MobileVLM projector - int n_patch = 24; - ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings); - mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b); - mlp_1 = ggml_gelu(ctx0, mlp_1); - ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1); - mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b); - // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1] - - // block 1 - ggml_tensor * block_1 = nullptr; - { - // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24] - mlp_3 = ggml_permute(ctx0, mlp_3, 1, 0, 2, 3); - mlp_3 = ggml_cont_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]); - // stride = 1, padding = 1, bias is nullptr - block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1); - - // layer norm - // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); - // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] - block_1 = ggml_norm(ctx0, block_1, eps); - block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b); - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); - - // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] - // hardswish - ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); - - block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); - // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] - // pointwise conv - block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1); - block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b); - block_1 = ggml_relu(ctx0, block_1); - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1); - block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b); - block_1 = ggml_hardsigmoid(ctx0, block_1); - // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1] - block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); - block_1 = ggml_mul(ctx0, block_1_hw, block_1); - - int w = block_1->ne[0], h = block_1->ne[1]; - block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); - - // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1); - block_1 = 
ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); - - // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] - block_1 = ggml_norm(ctx0, block_1, eps); - block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b); - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); - // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] - // residual - block_1 = ggml_add(ctx0, mlp_3, block_1); - } - - // block_2 - { - // stride = 2 - block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1); - - // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] - // layer norm - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); - // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] - block_1 = ggml_norm(ctx0, block_1, eps); - block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b); - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); - // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] - // hardswish - ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); - - // not sure the parameters is right for globalAvgPooling - block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); - // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] - // pointwise conv - block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1); - block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b); - block_1 = ggml_relu(ctx0, block_1); - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1); - block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b); - block_1 = ggml_hardsigmoid(ctx0, block_1); - - // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] - block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); - block_1 = ggml_mul(ctx0, block_1_hw, block_1); - - int w = block_1->ne[0], h = block_1->ne[1]; - block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); - // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1); - block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); - - - // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] - block_1 = ggml_norm(ctx0, block_1, eps); - block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b); - block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]); - // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1] - } - embeddings = block_1; - } - else if (ctx->proj_type() == PROJECTOR_TYPE_LDPV2) - { - int n_patch = 24; - ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); - mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b); - mlp_0 = ggml_gelu(ctx0, mlp_0); - ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0); - mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b); - // mlp_2 ne = [2048, 576, 1, 1] - // // AVG Pool Layer 2*2, 
strides = 2 - mlp_2 = ggml_permute(ctx0, mlp_2, 1, 0, 2, 3); - // mlp_2 ne = [576, 2048, 1, 1] - mlp_2 = ggml_cont_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]); - // mlp_2 ne [24, 24, 2048, 1] - mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0); - // weight ne = [3, 3, 2048, 1] - ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1); - peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3)); - peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b); - mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3)); - peg_0 = ggml_add(ctx0, peg_0, mlp_2); - peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]); - embeddings = peg_0; - } - else { - GGML_ABORT("fatal error"); - } - } - - // glm projector - else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE) { - size_t gridsz = (size_t)sqrt(embeddings->ne[1]); - embeddings = ggml_permute(ctx0,embeddings,1,0,2,3); - embeddings = ggml_cont_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]); - embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1); - embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size); - embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3)); - embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b); - // GLU - { - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); - embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b); - embeddings = ggml_gelu_inplace(ctx0, embeddings); - ggml_tensor * x = embeddings; - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings); - x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x); - embeddings = ggml_swiglu_split(ctx0, embeddings, x); - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings); - } - // arrangement of BOI/EOI token embeddings - // note: these embeddings are not present in text model, hence we cannot process them as text tokens - // see: https://huggingface.co/THUDM/glm-edge-v-2b/blob/main/siglip.py#L53 - { - embeddings = ggml_concat(ctx0, model.mm_boi, embeddings, 1); // BOI - embeddings = ggml_concat(ctx0, embeddings, model.mm_eoi, 1); // EOI - } - } - - else { - GGML_ABORT("llava: unknown projector type"); - } - - // build the graph - ggml_build_forward_expand(gf, embeddings); - - return gf; - } - // whisper encoder with custom projector - ggml_cgraph * build_whisper_enc() { - const int n_frames = img.nx; - const int n_pos = n_frames / 2; - GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos); - - ggml_tensor * inp = build_inp_raw(1); - - // conv1d block - { - // convolution + gelu - ggml_tensor * cur = ggml_conv_1d_ph(ctx0, model.conv1d_1_w, inp, 1, 1); - cur = ggml_add(ctx0, cur, model.conv1d_1_b); - - cur = ggml_gelu_erf(ctx0, cur); - - cur = ggml_conv_1d_ph(ctx0, model.conv1d_2_w, cur, 2, 1); - cur = ggml_add(ctx0, cur, model.conv1d_2_b); - - cur = ggml_gelu_erf(ctx0, cur); - // transpose - inp = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - cb(inp, "after_conv1d", -1); - } - - // sanity check (only check one layer, but it should be the same for all) - GGML_ASSERT(model.layers[0].ln_1_w && model.layers[0].ln_1_b); - GGML_ASSERT(model.layers[0].ln_2_w && model.layers[0].ln_2_b); - GGML_ASSERT(model.layers[0].q_b); - GGML_ASSERT(model.layers[0].v_b); - GGML_ASSERT(!model.layers[0].k_b); 
// no bias for k - GGML_ASSERT(model.post_ln_w && model.post_ln_b); - - ggml_tensor * pos_embd_selected = ggml_view_2d( - ctx0, model.position_embeddings, - model.position_embeddings->ne[0], n_pos, - model.position_embeddings->nb[1], 0 - ); - ggml_tensor * cur = build_vit( - inp, n_pos, - NORM_TYPE_NORMAL, - hparams.ffn_op, - pos_embd_selected, - nullptr); - - cb(cur, "after_transformer", -1); - - if (model.audio_has_stack_frames()) { - // StackAudioFrames - // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py - int64_t stride = n_embd * hparams.proj_stack_factor; - int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride); - int64_t pad = padded_len - ggml_nelements(cur); - if (pad > 0) { - cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0); - cur = ggml_pad(ctx0, cur, pad, 0, 0, 0); - } - cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride, - ggml_row_size(cur->type, stride), 0); - cb(cur, "after_stacked", -1); - } - - if (ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX) { - // UltravoxProjector - // pre-norm - cur = ggml_rms_norm(ctx0, cur, 1e-6); - cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w); - - // ffn in - cur = ggml_mul_mat(ctx0, model.mm_1_w, cur); - - // swiglu - // see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half - cur = ggml_swiglu_swapped(ctx0, cur); - - // mid-norm - cur = ggml_rms_norm(ctx0, cur, 1e-6); - cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w); - - // ffn out - cur = ggml_mul_mat(ctx0, model.mm_2_w, cur); - - } else if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2A) { - // projector - cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur); - cur = ggml_add(ctx0, cur, model.mm_fc_b); - - } else if (ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL) { - // projector - cur = build_ffn(cur, - model.mm_1_w, model.mm_1_b, - nullptr, nullptr, - model.mm_2_w, model.mm_2_b, - FFN_GELU_ERF, - -1); - - } else { - GGML_ABORT("%s: unknown projector type", __func__); - } - - cb(cur, "projected", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - // cogvlm vision encoder - ggml_cgraph * build_cogvlm() { - GGML_ASSERT(model.class_embedding != nullptr); - GGML_ASSERT(model.position_embeddings != nullptr); - - const int n_pos = n_patches + 1; // +1 for [CLS] - - // build input and concatenate class embedding - ggml_tensor * inp = build_inp(); - inp = ggml_concat(ctx0, inp, model.class_embedding, 1); - - inp = ggml_add(ctx0, inp, model.position_embeddings); - cb(inp, "inp_pos", -1); - - ggml_tensor * inpL = inp; - - for (int il = 0; il < n_layer; il++) { - auto & layer = model.layers[il]; - ggml_tensor * cur = inpL; - - cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); - - cur = ggml_add(ctx0, cur, layer.qkv_b); - - ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float), - cur->nb[1], 0); - ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float), - cur->nb[1], n_embd * sizeof(float)); - ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float), - cur->nb[1], 2 * n_embd * sizeof(float)); - cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); + if (add_pos) { + Qcur = add_pos(Qcur, layer); + Kcur = add_pos(Kcur, layer); + cb(Qcur, "Qcur_pos", il); + cb(Kcur, "Kcur_pos", il); + } + cur = build_attn(layer.o_w, layer.o_b, Qcur, Kcur, Vcur, nullptr, kq_scale, il); cb(cur, "attn_out", il); - - cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); - cb(cur, "attn_post_norm", il); - - 
cur = ggml_add(ctx0, cur, inpL); - inpL = cur; - - cur = build_ffn(cur, - layer.ff_up_w, layer.ff_up_b, - layer.ff_gate_w, layer.ff_gate_b, - layer.ff_down_w, layer.ff_down_b, - hparams.ffn_op, il); - - cb(cur, "ffn_out", il); - - cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); - cb(cur, "ffn_post_norm", il); - - cur = ggml_add(ctx0, cur, inpL); - cb(cur, "layer_out", il); - inpL = cur; - } - // remove CLS token (like build_llama4 does) - ggml_tensor * cur = ggml_view_2d(ctx0, inpL, - n_embd, n_patches, - ggml_row_size(inpL->type, n_embd), 0); + if (layer.ls_1_w) { + cur = ggml_mul(ctx0, cur, layer.ls_1_w); + cb(cur, "attn_out_scaled", il); + } - // Multiply with mm_model_proj - cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur); + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, inpL); - // Apply layernorm, weight, bias - cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1); + inpL = cur; // inpL = residual, cur = hidden_states - // Apply GELU - cur = ggml_gelu_inplace(ctx0, cur); + cb(cur, "ffn_inp", il); - // Branch 1: multiply with mm_h_to_4h_w - ggml_tensor * h_to_4h = ggml_mul_mat(ctx0, model.mm_h_to_4h_w, cur); + // layernorm2 + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); + cb(cur, "ffn_inp_normed", il); - // Branch 2: multiply with mm_gate_w - ggml_tensor * gate = ggml_mul_mat(ctx0, model.mm_gate_w, cur); + // ffn + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + ffn_t, il); - // Apply silu - gate = ggml_swiglu_split(ctx0, gate, h_to_4h); + cb(cur, "ffn_out", il); - // Apply mm_4h_to_h_w - cur = ggml_mul_mat(ctx0, model.mm_4h_to_h_w, gate); + if (layer.ls_2_w) { + cur = ggml_mul(ctx0, cur, layer.ls_2_w); + cb(cur, "ffn_out_scaled", il); + } - // Concatenate with boi and eoi - cur = ggml_concat(ctx0, model.mm_boi, cur, 1); - cur = ggml_concat(ctx0, cur, model.mm_eoi, 1); + // residual 2 + cur = ggml_add(ctx0, inpL, cur); + cb(cur, "layer_out", il); - // build the graph - ggml_build_forward_expand(gf, cur); - - return gf; + inpL = cur; } -private: - // - // utility functions - // - - void cb(ggml_tensor * cur0, const char * name, int il) const { - if (ctx->debug_graph) { - ggml_tensor * cur = ggml_cpy(ctx0, cur0, ggml_dup_tensor(ctx0, cur0)); - std::string cur_name = il >= 0 ? 
std::string(name) + "_" + std::to_string(il) : name; - ggml_set_name(cur, cur_name.c_str()); - ggml_set_output(cur); - ggml_build_forward_expand(gf, cur); - ctx->debug_print_tensors.push_back(cur); - } + if (model.audio_has_avgpool()) { + ggml_tensor * cur = inpL; + cur = ggml_transpose(ctx0, cur); + cur = ggml_cont(ctx0, cur); + cur = ggml_pool_1d(ctx0, cur, GGML_OP_POOL_AVG, 2, 2, 0); + cur = ggml_transpose(ctx0, cur); + cur = ggml_cont(ctx0, cur); + inpL = cur; } - // siglip2 naflex - ggml_tensor * resize_position_embeddings() { - ggml_tensor * pos_embd = model.position_embeddings; - const int height = img.ny / patch_size; - const int width = img.nx / patch_size; - const uint32_t mode = GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS; - const int n_per_side = (int)std::sqrt(pos_embd->ne[1]); - - GGML_ASSERT(pos_embd); - - if (height == n_per_side && width == n_per_side) { - return pos_embd; - } - - pos_embd = ggml_reshape_3d(ctx0, pos_embd, n_embd, n_per_side, n_per_side); // -> (n_embd, n_per_side, n_per_side) - pos_embd = ggml_permute(ctx0, pos_embd, 2, 0, 1, 3); // -> (n_per_side, n_per_side, n_embd) - pos_embd = ggml_interpolate(ctx0, pos_embd, width, height, n_embd, 1, mode); // -> (width, height, n_embd) - pos_embd = ggml_permute(ctx0, pos_embd, 1, 2, 0, 3); // -> (n_embd, width, height) - pos_embd = ggml_cont_2d(ctx0, pos_embd, n_embd, width * height); // -> (n_embd, width * height) - - return pos_embd; + // post-layernorm + if (model.post_ln_w) { + inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, -1); } + return inpL; +} - // build vision transformer (ViT) cgraph - // this function should cover most of the models - // if your model has specific features, you should probably duplicate this function - ggml_tensor * build_vit( - ggml_tensor * inp, - int64_t n_pos, - norm_type norm_t, - ffn_op_type ffn_t, - ggml_tensor * learned_pos_embd, - std::function add_pos - ) { - if (learned_pos_embd) { - inp = ggml_add(ctx0, inp, learned_pos_embd); - cb(inp, "pos_embed", -1); - } - ggml_tensor * inpL = inp; - - // pre-layernorm - if (model.pre_ln_w) { - inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); - cb(inpL, "pre_ln", -1); - } - - // loop over layers - for (int il = 0; il < n_layer; il++) { - auto & layer = model.layers[il]; - ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states - - // layernorm1 - cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); - cb(cur, "layer_inp_normed", il); - - // self-attention - { - ggml_tensor * Qcur = nullptr; - ggml_tensor * Kcur = nullptr; - ggml_tensor * Vcur = nullptr; - if (layer.qkv_w != nullptr) { - // fused qkv - cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); - if (layer.qkv_b != nullptr) { - cur = ggml_add(ctx0, cur, layer.qkv_b); - } - - Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, - /* nb1 */ ggml_row_size(cur->type, d_head), - /* nb2 */ cur->nb[1], - /* offset */ 0); - - Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, - /* nb1 */ ggml_row_size(cur->type, d_head), - /* nb2 */ cur->nb[1], - /* offset */ ggml_row_size(cur->type, n_embd)); - - Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, - /* nb1 */ ggml_row_size(cur->type, d_head), - /* nb2 */ cur->nb[1], - /* offset */ ggml_row_size(cur->type, 2 * n_embd)); - - // TODO: q/k norm requires row size == n_embd, while here it's d_head - // we can add support in the future if needed - GGML_ASSERT(layer.q_norm == nullptr && layer.k_norm == nullptr); - - } else { - // separate q, k, v - Qcur = 
ggml_mul_mat(ctx0, layer.q_w, cur); - if (layer.q_b) { - Qcur = ggml_add(ctx0, Qcur, layer.q_b); - } - - Kcur = ggml_mul_mat(ctx0, layer.k_w, cur); - if (layer.k_b) { - Kcur = ggml_add(ctx0, Kcur, layer.k_b); - } - - Vcur = ggml_mul_mat(ctx0, layer.v_w, cur); - if (layer.v_b) { - Vcur = ggml_add(ctx0, Vcur, layer.v_b); - } - - if (layer.q_norm) { - Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il); - cb(Qcur, "Qcur_norm", il); - } - - if (layer.k_norm) { - Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il); - cb(Kcur, "Kcur_norm", il); - } - - Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos); - Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos); - Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos); - } - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - if (add_pos) { - Qcur = add_pos(Qcur, layer); - Kcur = add_pos(Kcur, layer); - cb(Qcur, "Qcur_pos", il); - cb(Kcur, "Kcur_pos", il); - } - - cur = build_attn(layer.o_w, layer.o_b, - Qcur, Kcur, Vcur, nullptr, kq_scale, il); - cb(cur, "attn_out", il); - } - - if (layer.ls_1_w) { - cur = ggml_mul(ctx0, cur, layer.ls_1_w); - cb(cur, "attn_out_scaled", il); - } - - // re-add the layer input, e.g., residual - cur = ggml_add(ctx0, cur, inpL); - - inpL = cur; // inpL = residual, cur = hidden_states - - cb(cur, "ffn_inp", il); - - // layernorm2 - cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); - cb(cur, "ffn_inp_normed", il); - - // ffn - cur = build_ffn(cur, - layer.ff_up_w, layer.ff_up_b, - layer.ff_gate_w, layer.ff_gate_b, - layer.ff_down_w, layer.ff_down_b, - ffn_t, il); - - cb(cur, "ffn_out", il); - - if (layer.ls_2_w) { - cur = ggml_mul(ctx0, cur, layer.ls_2_w); - cb(cur, "ffn_out_scaled", il); - } - - // residual 2 - cur = ggml_add(ctx0, inpL, cur); - cb(cur, "layer_out", il); - - inpL = cur; - } - - if (ctx->model.audio_has_avgpool()) { - ggml_tensor * cur = inpL; - cur = ggml_transpose(ctx0, cur); - cur = ggml_cont(ctx0, cur); - cur = ggml_pool_1d(ctx0, cur, GGML_OP_POOL_AVG, 2, 2, 0); - cur = ggml_transpose(ctx0, cur); - cur = ggml_cont(ctx0, cur); - inpL = cur; - } - - // post-layernorm - if (model.post_ln_w) { - inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, -1); - } - return inpL; - } - - static ggml_tensor * get_rel_pos( - ggml_context * ctx, - ggml_tensor * rel_pos, // [L, C] - ggml_tensor * indices, // [q_size, k_size] - int q_size, - int k_size - ) { - const int64_t C = rel_pos->ne[0]; // channels - const int64_t L = rel_pos->ne[1]; // length - - GGML_ASSERT(indices != nullptr); - GGML_ASSERT(indices->type == GGML_TYPE_I32); - GGML_ASSERT(indices->ne[0] == k_size); - GGML_ASSERT(indices->ne[1] == q_size); - - const auto max_rel_dist = 2*std::max(q_size, k_size) - 1; - ggml_tensor * cur = rel_pos; - - if (max_rel_dist != L) { - // Linear interpolation - int64_t ne0 = cur->ne[0]; - int64_t ne1 = cur->ne[1]; - int64_t ne2 = cur->ne[2]; - int64_t ne3 = cur->ne[3]; - - cur = ggml_reshape_3d( - ctx, - ggml_cont(ctx, ggml_permute(ctx, cur, 1, 0, 2, 3)), - ne1, 1, ne0*ne2*ne3 - ); - cur = ggml_reshape_4d( - ctx, - ggml_interpolate( - ctx, - cur, - max_rel_dist, 1, ne0*ne2*ne3, 1, - ggml_scale_mode::GGML_SCALE_MODE_BILINEAR - ), - max_rel_dist, ne0, ne2, ne3 - ); - cur = ggml_cont(ctx, ggml_permute(ctx, cur, 1, 0, 2, 3)); - } - - // Flatten indices to 1D for ggml_get_rows - int qk = q_size * k_size; - - cur = ggml_reshape_3d( - ctx, - ggml_get_rows(ctx, cur, ggml_reshape_1d(ctx, indices, qk)), - C, k_size, q_size - ); - - return 
cur; // [C, k_size, q_size] - } - - // Implementation based on approach suggested by Acly - // See: https://github.com/ggml-org/llama.cpp/pull/17383#issuecomment-3554227091 - static ggml_tensor* window_partition(ggml_context* ctx, ggml_tensor* x, int window) { - auto [c, w, h, b] = x->ne; - // same as - // x = ggml_win_part(m, x, window); - // x = ggml_reshape_3d(m, x, c, window * window, x->ne[3]); - - int64_t px = (window - w % window) % window; - int64_t py = (window - h % window) % window; - int64_t npw = (w + px) / window; - int64_t nph = (h + py) / window; - - if (px > 0 || py > 0) { - x = ggml_pad(ctx, x, 0, int(px), int(py), 0); - } - x = ggml_reshape_4d(ctx, x, c * window, npw, window, nph * b); - x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); - x = ggml_reshape_4d(ctx, x, c, window, window, npw * nph * b); - return x; - } - - // Implementation based on approach suggested by Acly - // See: https://github.com/ggml-org/llama.cpp/pull/17383#issuecomment-3554227091 - static ggml_tensor* window_unpartition(ggml_context* m, ggml_tensor* x, int w, int h, int window) { - int64_t c = x->ne[0]; - // same as - // x = ggml_reshape_4d(m, x, c, window, window, x->ne[2]); - // x = ggml_win_unpart(m, x, w, h, window); - - int64_t px = (window - w % window) % window; - int64_t py = (window - h % window) % window; - int64_t npw = (w + px) / window; - int64_t nph = (h + py) / window; - - int64_t b = x->ne[3] / (npw * nph); - x = ggml_reshape_4d(m, x, c * window, window, npw, nph * b); - x = ggml_cont(m, ggml_permute(m, x, 0, 2, 1, 3)); - x = ggml_reshape_4d(m, x, c, w + px, h + py, b); - x = ggml_view_4d(m, x, x->ne[0], w, h, x->ne[3], x->nb[1], x->nb[2], x->nb[3], 0); - x = ggml_cont(m, x); - return x; - } // build the input after conv2d (inp_raw --> patches) // returns tensor with shape [n_embd, n_patches] - ggml_tensor * build_inp() { - ggml_tensor * inp_raw = build_inp_raw(); - ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd); - inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); - if (model.patch_bias) { - inp = ggml_add(ctx0, inp, model.patch_bias); - cb(inp, "patch_bias", -1); - } - return inp; + ggml_tensor * clip_graph::build_inp() { + ggml_tensor * inp_raw = build_inp_raw(); + ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd); + inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); + if (model.patch_bias) { + inp = ggml_add(ctx0, inp, model.patch_bias); + cb(inp, "patch_bias", -1); + } + return inp; +} + +ggml_tensor * clip_graph::build_inp_raw(int channels) { + ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels); + ggml_set_name(inp_raw, "inp_raw"); + ggml_set_input(inp_raw); + return inp_raw; +} + +ggml_tensor * clip_graph::build_norm( + ggml_tensor * cur, + ggml_tensor * mw, + ggml_tensor * mb, + norm_type type, + float norm_eps, + int il) const { + + cur = type == NORM_TYPE_RMS + ? 
ggml_rms_norm(ctx0, cur, norm_eps) + : ggml_norm(ctx0, cur, norm_eps); + + if (mw || mb) { + cb(cur, "norm", il); } - ggml_tensor * build_inp_raw(int channels = 3) { - ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels); - ggml_set_name(inp_raw, "inp_raw"); - ggml_set_input(inp_raw); - return inp_raw; - } - - ggml_tensor * build_norm( - ggml_tensor * cur, - ggml_tensor * mw, - ggml_tensor * mb, - norm_type type, - float norm_eps, - int il) const { - - cur = type == NORM_TYPE_RMS - ? ggml_rms_norm(ctx0, cur, norm_eps) - : ggml_norm(ctx0, cur, norm_eps); - - if (mw || mb) { - cb(cur, "norm", il); - } - - if (mw) { - cur = ggml_mul(ctx0, cur, mw); - if (mb) { - cb(cur, "norm_w", il); - } - } - + if (mw) { + cur = ggml_mul(ctx0, cur, mw); if (mb) { - cur = ggml_add(ctx0, cur, mb); + cb(cur, "norm_w", il); } - - return cur; } - ggml_tensor * build_ffn( - ggml_tensor * cur, - ggml_tensor * up, - ggml_tensor * up_b, - ggml_tensor * gate, - ggml_tensor * gate_b, - ggml_tensor * down, - ggml_tensor * down_b, - ffn_op_type type_op, - int il) const { - - ggml_tensor * tmp = up ? ggml_mul_mat(ctx0, up, cur) : cur; - cb(tmp, "ffn_up", il); - - if (up_b) { - tmp = ggml_add(ctx0, tmp, up_b); - cb(tmp, "ffn_up_b", il); - } - - if (gate) { - cur = ggml_mul_mat(ctx0, gate, cur); - cb(cur, "ffn_gate", il); - - if (gate_b) { - cur = ggml_add(ctx0, cur, gate_b); - cb(cur, "ffn_gate_b", il); - } - } else { - cur = tmp; - } - - // we only support parallel ffn for now - switch (type_op) { - case FFN_SILU: - if (gate) { - cur = ggml_swiglu_split(ctx0, cur, tmp); - cb(cur, "ffn_swiglu", il); - } else { - cur = ggml_silu(ctx0, cur); - cb(cur, "ffn_silu", il); - } break; - case FFN_GELU: - if (gate) { - cur = ggml_geglu_split(ctx0, cur, tmp); - cb(cur, "ffn_geglu", il); - } else { - cur = ggml_gelu(ctx0, cur); - cb(cur, "ffn_gelu", il); - } break; - case FFN_GELU_ERF: - if (gate) { - cur = ggml_geglu_erf_split(ctx0, cur, tmp); - cb(cur, "ffn_geglu_erf", il); - } else { - cur = ggml_gelu_erf(ctx0, cur); - cb(cur, "ffn_gelu_erf", il); - } break; - case FFN_GELU_QUICK: - if (gate) { - cur = ggml_geglu_quick_split(ctx0, cur, tmp); - cb(cur, "ffn_geglu_quick", il); - } else { - cur = ggml_gelu_quick(ctx0, cur); - cb(cur, "ffn_gelu_quick", il); - } break; - } - - if (down) { - cur = ggml_mul_mat(ctx0, down, cur); - } - - if (down_b) { - cb(cur, "ffn_down", il); - } - - if (down_b) { - cur = ggml_add(ctx0, cur, down_b); - } - - return cur; + if (mb) { + cur = ggml_add(ctx0, cur, mb); } - ggml_tensor * build_attn( - ggml_tensor * wo, - ggml_tensor * wo_b, - ggml_tensor * q_cur, - ggml_tensor * k_cur, - ggml_tensor * v_cur, - ggml_tensor * kq_mask, - float kq_scale, - int il) const { - // these nodes are added to the graph together so that they are not reordered - // by doing so, the number of splits in the graph is reduced - ggml_build_forward_expand(gf, q_cur); - ggml_build_forward_expand(gf, k_cur); - ggml_build_forward_expand(gf, v_cur); + return cur; +} - ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); - //cb(q, "q", il); +ggml_tensor * clip_graph::build_ffn( + ggml_tensor * cur, + ggml_tensor * up, + ggml_tensor * up_b, + ggml_tensor * gate, + ggml_tensor * gate_b, + ggml_tensor * down, + ggml_tensor * down_b, + ffn_op_type type_op, + int il) const { - ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3); - //cb(k, "k", il); + ggml_tensor * tmp = up ? 
ggml_mul_mat(ctx0, up, cur) : cur; + cb(tmp, "ffn_up", il); - ggml_tensor * cur; + if (up_b) { + tmp = ggml_add(ctx0, tmp, up_b); + cb(tmp, "ffn_up_b", il); + } - if (ctx->flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) { - ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3); + if (gate) { + cur = ggml_mul_mat(ctx0, gate, cur); + cb(cur, "ffn_gate", il); - k = ggml_cast(ctx0, k, GGML_TYPE_F16); - v = ggml_cast(ctx0, v, GGML_TYPE_F16); + if (gate_b) { + cur = ggml_add(ctx0, cur, gate_b); + cb(cur, "ffn_gate_b", il); + } + } else { + cur = tmp; + } - cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, 0.0f, 0.0f); - ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); + // we only support parallel ffn for now + switch (type_op) { + case FFN_SILU: + if (gate) { + cur = ggml_swiglu_split(ctx0, cur, tmp); + cb(cur, "ffn_swiglu", il); + } else { + cur = ggml_silu(ctx0, cur); + cb(cur, "ffn_silu", il); + } break; + case FFN_GELU: + if (gate) { + cur = ggml_geglu_split(ctx0, cur, tmp); + cb(cur, "ffn_geglu", il); + } else { + cur = ggml_gelu(ctx0, cur); + cb(cur, "ffn_gelu", il); + } break; + case FFN_GELU_ERF: + if (gate) { + cur = ggml_geglu_erf_split(ctx0, cur, tmp); + cb(cur, "ffn_geglu_erf", il); + } else { + cur = ggml_gelu_erf(ctx0, cur); + cb(cur, "ffn_gelu_erf", il); + } break; + case FFN_GELU_QUICK: + if (gate) { + cur = ggml_geglu_quick_split(ctx0, cur, tmp); + cb(cur, "ffn_geglu_quick", il); + } else { + cur = ggml_gelu_quick(ctx0, cur); + cb(cur, "ffn_gelu_quick", il); + } break; + } - cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*cur->ne[1], cur->ne[2]*cur->ne[3]); + if (down) { + cur = ggml_mul_mat(ctx0, down, cur); + } - } else { - ggml_tensor * v = ggml_permute(ctx0, v_cur, 1, 2, 0, 3); - v = ggml_cont(ctx0, v); + if (down_b) { + cb(cur, "ffn_down", il); + } + + if (down_b) { + cur = ggml_add(ctx0, cur, down_b); + } + + return cur; +} + +ggml_tensor * clip_graph::build_attn( + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_mask, + float kq_scale, + int il) const { + // these nodes are added to the graph together so that they are not reordered + // by doing so, the number of splits in the graph is reduced + ggml_build_forward_expand(gf, q_cur); + ggml_build_forward_expand(gf, k_cur); + ggml_build_forward_expand(gf, v_cur); + + ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); + //cb(q, "q", il); + + ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3); + //cb(k, "k", il); + + ggml_tensor * cur; + + if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) { + ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3); + + k = ggml_cast(ctx0, k, GGML_TYPE_F16); + v = ggml_cast(ctx0, v, GGML_TYPE_F16); + + cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, 0.0f, 0.0f); + ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); + + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*cur->ne[1], cur->ne[2]*cur->ne[3]); + + } else { + ggml_tensor * v = ggml_permute(ctx0, v_cur, 1, 2, 0, 3); + v = ggml_cont(ctx0, v); ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); // F32 may not needed for vision encoders? 
// ggml_mul_mat_set_prec(kq, GGML_PREC_F32); - kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, 0.0f); + kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, 0.0f); - ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); - cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cur = ggml_cont_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2] * cur->ne[3]); - } - - cb(cur, "kqv_out", il); - - if (wo) { - cur = ggml_mul_mat(ctx0, wo, cur); - } - - if (wo_b) { - cur = ggml_add(ctx0, cur, wo_b); - } - - return cur; + ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + cur = ggml_cont_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2] * cur->ne[3]); } - // implementation of the 2D RoPE without adding a new op in ggml - // this is not efficient (use double the memory), but works on all backends - // TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065 - static ggml_tensor * build_rope_2d( - ggml_context * ctx0, - ggml_tensor * cur, - ggml_tensor * pos_a, // first half - ggml_tensor * pos_b, // second half - const float freq_base, - const bool interleave_freq - ) { - const int64_t n_dim = cur->ne[0]; - const int64_t n_head = cur->ne[1]; - const int64_t n_pos = cur->ne[2]; + cb(cur, "kqv_out", il); - // for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos) - // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3 - // first half of cur will use 1e-0, 1e-2 (even) - // second half of cur will use 1e-1, 1e-3 (odd) - // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even - // ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2) - // then for the second half, we use freq_scale to shift the inv_freq - // ^ why? replace (2i) with (2i+1) in the above equation - const float freq_scale_odd = interleave_freq - ? 
std::pow(freq_base, (float)-2/n_dim) - : 1.0; - - // first half - ggml_tensor * first; - { - first = ggml_view_3d(ctx0, cur, - n_dim/2, n_head, n_pos, - ggml_row_size(cur->type, n_dim), - ggml_row_size(cur->type, n_dim*n_head), - 0); - first = ggml_rope_ext( - ctx0, - first, - pos_a, // positions - nullptr, // freq factors - n_dim/2, // n_dims - 0, 0, freq_base, - 1.0f, 0.0f, 1.0f, 0.0f, 0.0f - ); - } - - // second half - ggml_tensor * second; - { - second = ggml_view_3d(ctx0, cur, - n_dim/2, n_head, n_pos, - ggml_row_size(cur->type, n_dim), - ggml_row_size(cur->type, n_dim*n_head), - n_dim/2 * ggml_element_size(cur)); - second = ggml_rope_ext( - ctx0, - second, - pos_b, // positions - nullptr, // freq factors - n_dim/2, // n_dims - 0, 0, freq_base, - freq_scale_odd, - 0.0f, 1.0f, 0.0f, 0.0f - ); - } - - cur = ggml_concat(ctx0, first, second, 0); - return cur; + if (wo) { + cur = ggml_mul_mat(ctx0, wo, cur); } - // aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL) - // support dynamic resolution - ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor) { - GGML_ASSERT(scale_factor > 1); - - const int n_embd = cur->ne[0]; - int width = img.nx / patch_size; - int height = img.ny / patch_size; - - // pad width and height to factor - const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width; - const int64_t pad_height = CLIP_ALIGN(height, scale_factor) - height; - cur = ggml_reshape_3d(ctx0, cur, n_embd, width, height); - if (pad_width || pad_height) { - cur = ggml_pad(ctx0, cur, 0, pad_width, pad_height, 0); - width += pad_width; - height += pad_height; - } - - // unshuffle h - cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height); - cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - - // unshuffle w - cur = ggml_cont_3d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor); - cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - - cur = ggml_cont_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]); - cb(cur, "pixel_shuffle", -1); - - return cur; + if (wo_b) { + cur = ggml_add(ctx0, cur, wo_b); } - ggml_tensor * build_sam(ggml_tensor * inp_raw) { - const int n_embd = hparams.sam_n_embd; - const int n_layer = hparams.sam_n_layer; - const int n_heads = hparams.sam_n_head; - const int d_heads = n_embd / n_heads; - const int window = hparams.attn_window_size; + return cur; +} - ggml_tensor * inpL; +// implementation of the 2D RoPE without adding a new op in ggml +// this is not efficient (use double the memory), but works on all backends +// TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065 +ggml_tensor * clip_graph::build_rope_2d( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * pos_a, // first half + ggml_tensor * pos_b, // second half + const float freq_base, + const bool interleave_freq +) { + const int64_t n_dim = cur->ne[0]; + const int64_t n_head = cur->ne[1]; + const int64_t n_pos = cur->ne[2]; - inpL = ggml_conv_2d_sk_p0(ctx0, model.patch_embed_proj_w, inp_raw); - inpL = ggml_add(ctx0, inpL, ggml_reshape_3d(ctx0, model.patch_embed_proj_b, 1, 1, n_embd)); - inpL = ggml_cont(ctx0, ggml_permute(ctx0, inpL, 1, 2, 0, 3)); + // for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos) + // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3 + // first 
half of cur will use 1e-0, 1e-2 (even) + // second half of cur will use 1e-1, 1e-3 (odd) + // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even + // ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2) + // then for the second half, we use freq_scale to shift the inv_freq + // ^ why? replace (2i) with (2i+1) in the above equation + const float freq_scale_odd = interleave_freq + ? std::pow(freq_base, (float)-2/n_dim) + : 1.0; - ggml_tensor * rel_pos_indices_local; - ggml_tensor * rel_pos_indices_global; - - rel_pos_indices_local = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, window, window); - rel_pos_indices_global = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, inpL->ne[1], inpL->ne[2]); - ggml_set_name(rel_pos_indices_local, "rel_pos_indices_local"); - ggml_set_name(rel_pos_indices_global, "rel_pos_indices_global"); - ggml_set_input(rel_pos_indices_local); - ggml_set_input(rel_pos_indices_global); - - ggml_tensor * cur; - const auto tgt_size = inpL->ne[1]; - const auto str_size = model.pos_embed->ne[1]; - - if (str_size != tgt_size) { - ggml_tensor * old_pos_embed = nullptr; - old_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, model.pos_embed, 2, 0, 1, 3)); - ggml_tensor * new_pos_embed = ggml_interpolate( - ctx0, - old_pos_embed, - tgt_size, - tgt_size, - n_embd, - 1, - ggml_scale_mode::GGML_SCALE_MODE_BICUBIC - ); - new_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, new_pos_embed, 1, 2, 0, 3)); - cur = ggml_add(ctx0, inpL, new_pos_embed); - } else { - cur = ggml_add(ctx0, inpL, model.pos_embed); - } - - // loop over layers - for (int il = 0; il < n_layer; il++) { - auto & layer = model.sam_layers[il]; - ggml_tensor * shortcut = cur; - - // layernorm1 - cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); - - const int64_t w0 = cur->ne[1]; - const int64_t h0 = cur->ne[2]; - - ggml_tensor * indices; - - if (hparams.is_global_attn(il)) { - indices = rel_pos_indices_global; - } else { - // local attention layer - apply window partition - cur = window_partition(ctx0, cur, window); - indices = rel_pos_indices_local; - } - - const int64_t W = cur->ne[1]; - const int64_t H = cur->ne[2]; - // self-attention - { - const int B = cur->ne[3]; - - cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); - cur = ggml_add(ctx0, cur, layer.qkv_b); - cur = ggml_cont(ctx0, cur); // Ensure tensor is contiguous before reshape - cur = ggml_reshape_4d(ctx0, cur, n_embd, 3, W*H, B); - - ggml_tensor * Q; - ggml_tensor * K; - ggml_tensor * V; - - Q = ggml_view_3d (ctx0, cur, n_embd, W*H, B, cur->nb[2], cur->nb[3], 0*cur->nb[1]); - Q = ggml_reshape_4d(ctx0, ggml_cont(ctx0, Q), d_heads, n_heads, W*H, B); - - K = ggml_view_3d (ctx0, cur, n_embd, W*H, B, cur->nb[2], cur->nb[3], 1*cur->nb[1]); - K = ggml_reshape_4d(ctx0, ggml_cont(ctx0, K), d_heads, n_heads, W*H, B); - - V = ggml_view_3d (ctx0, cur, n_embd, W*H, B, cur->nb[2], cur->nb[3], 2*cur->nb[1]); - V = ggml_reshape_4d(ctx0, ggml_cont(ctx0, V), d_heads, n_heads, W*H, B); - - ggml_tensor * mask; - ggml_tensor * rw; - ggml_tensor * rh; - ggml_tensor * qr; - - rw = get_rel_pos(ctx0, layer.rel_pos_w, indices, W, W); // [W, W, C] - rh = get_rel_pos(ctx0, layer.rel_pos_h, indices, H, H); // [H, H, C] - qr = ggml_permute(ctx0, Q, 0, 2, 1, 3); - qr = ggml_reshape_4d(ctx0, ggml_cont(ctx0, qr), d_heads, W, H, B * n_heads); - - - rw = ggml_mul_mat (ctx0, rw, ggml_cont(ctx0, ggml_permute(ctx0, qr, 0, 2, 1, 3))); // [B*n_heads, W, H, W] - rw = ggml_cont (ctx0, ggml_permute(ctx0, rw, 0, 2, 1, 3)); // [B*n_heads, H, W, W] - 
rw = ggml_reshape_4d(ctx0, rw, W, 1, W*H, n_heads*B); - rw = ggml_repeat_4d (ctx0, rw, W, H, W*H, n_heads*B); - rh = ggml_mul_mat (ctx0, rh, qr); // [B*n_heads, H, W, H] - rh = ggml_reshape_4d(ctx0, rh, 1, H, W*H, n_heads*B); - mask = ggml_add (ctx0, rw, rh); // [B*n_heads, H*W, H, W] - mask = ggml_reshape_4d(ctx0, mask, W*H, W*H, n_heads, B); - mask = ggml_cast (ctx0, mask, GGML_TYPE_F16); - - float scale = 1.0f / sqrtf((float)d_heads); - - cur = build_attn(layer.o_w, layer.o_b, Q, K, V, mask, scale, - il); // [B, H*W, n_embd] - cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), n_embd, W, H, B); - } - - if (hparams.is_global_attn(il) == false) { - // local attention layer - reverse window partition - cur = window_unpartition(ctx0, cur, w0, h0, window); - } - - // re-add the layer input, e.g., residual - cur = ggml_add(ctx0, cur, shortcut); - - ggml_tensor * inpFF = cur; - - // layernorm2 - cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); - - // ffn - cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, - layer.ff_down_b, hparams.ffn_op, il); - - // residual 2 - cur = ggml_add(ctx0, cur, inpFF); - cb(cur, "sam_layer_out", il); - } - - cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); - - cur = ggml_conv_2d(ctx0, model.neck_0_w, cur, 1, 1, 0, 0, 1, 1); - cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3)); - cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, hparams.eps, -1); - cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); - - cur = ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1); - cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3)); - cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, hparams.eps, -1); - cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); - - cur = ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1); - cur = ggml_conv_2d(ctx0, model.net_3, cur, 2, 2, 1, 1, 1, 1); - cb(cur, "sam_output", -1); - - ggml_build_forward_expand(gf, cur); - return cur; + // first half + ggml_tensor * first; + { + first = ggml_view_3d(ctx0, cur, + n_dim/2, n_head, n_pos, + ggml_row_size(cur->type, n_dim), + ggml_row_size(cur->type, n_dim*n_head), + 0); + first = ggml_rope_ext( + ctx0, + first, + pos_a, // positions + nullptr, // freq factors + n_dim/2, // n_dims + 0, 0, freq_base, + 1.0f, 0.0f, 1.0f, 0.0f, 0.0f + ); } - ggml_tensor * build_dsocr_clip(ggml_tensor * patch_embeds) { - ggml_tensor * inp; - - inp = ggml_cpy(ctx0, patch_embeds, ggml_dup_tensor(ctx0, patch_embeds)); - inp = ggml_reshape_2d(ctx0, inp, inp->ne[0]*inp->ne[1], inp->ne[2]); - inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); - - ggml_tensor * new_pos_embd = ggml_cpy(ctx0, model.position_embeddings, ggml_dup_tensor(ctx0, model.position_embeddings)); - - int n_pos = new_pos_embd->ne[1]; // +1 for [CLS] - const auto tgt_size = static_cast(std::sqrt(inp->ne[1])); - const auto src_size = static_cast(std::sqrt(n_pos - 1)); - - if (tgt_size != src_size) { - ggml_tensor * old_pos_embd; - ggml_tensor * cls_tok; - - old_pos_embd = ggml_view_2d( - ctx0, new_pos_embd, - new_pos_embd->ne[0], src_size * src_size, - ggml_row_size(new_pos_embd->type, new_pos_embd->ne[0]), 0 - ); - cls_tok = ggml_view_2d( - ctx0, new_pos_embd, - new_pos_embd->ne[0], 1, - ggml_row_size(new_pos_embd->type, new_pos_embd->ne[0]), src_size * src_size - ); - new_pos_embd = ggml_interpolate(ctx0, - old_pos_embd, - tgt_size, - tgt_size, - new_pos_embd->ne[0], 1, GGML_SCALE_MODE_BICUBIC - ); - 
new_pos_embd = ggml_reshape_3d(ctx0, new_pos_embd, n_embd, tgt_size * tgt_size, 1); - new_pos_embd = ggml_concat(ctx0, new_pos_embd, cls_tok, 1); - n_pos = tgt_size * tgt_size + 1; - } - - // add CLS token - inp = ggml_concat(ctx0, model.class_embedding, inp, 1); - - // for selecting learned pos embd, used by ViT - ggml_tensor * positions = ggml_cast(ctx0, ggml_arange(ctx0, 0, n_pos, 1), GGML_TYPE_I32); - ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, new_pos_embd, positions); - - ggml_tensor * cur = build_vit(inp, n_pos, NORM_TYPE_NORMAL, ffn_op_type::FFN_GELU_QUICK, - learned_pos_embd, nullptr); - - ggml_build_forward_expand(gf, cur); - - return cur; + // second half + ggml_tensor * second; + { + second = ggml_view_3d(ctx0, cur, + n_dim/2, n_head, n_pos, + ggml_row_size(cur->type, n_dim), + ggml_row_size(cur->type, n_dim*n_head), + n_dim/2 * ggml_element_size(cur)); + second = ggml_rope_ext( + ctx0, + second, + pos_b, // positions + nullptr, // freq factors + n_dim/2, // n_dims + 0, 0, freq_base, + freq_scale_odd, + 0.0f, 1.0f, 0.0f, 0.0f + ); } -}; + + cur = ggml_concat(ctx0, first, second, 0); + return cur; +} + +// aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL) +// support dynamic resolution +ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale_factor) { + GGML_ASSERT(scale_factor > 1); + + const int n_embd = cur->ne[0]; + int width = img.nx / patch_size; + int height = img.ny / patch_size; + + // pad width and height to factor + const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width; + const int64_t pad_height = CLIP_ALIGN(height, scale_factor) - height; + cur = ggml_reshape_3d(ctx0, cur, n_embd, width, height); + if (pad_width || pad_height) { + cur = ggml_pad(ctx0, cur, 0, pad_width, pad_height, 0); + width += pad_width; + height += pad_height; + } + + // unshuffle h + cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + + // unshuffle w + cur = ggml_cont_3d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + + cur = ggml_cont_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]); + cb(cur, "pixel_shuffle", -1); + + return cur; +} + + static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) { GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported"); - clip_graph graph(ctx, *imgs.entries[0]); - ggml_cgraph * res; + const clip_image_f32 & img = *imgs.entries[0]; + std::unique_ptr<clip_graph> builder; switch (ctx->proj_type()) { case PROJECTOR_TYPE_GEMMA3: case PROJECTOR_TYPE_IDEFICS3: case PROJECTOR_TYPE_LFM2: + case PROJECTOR_TYPE_JANUS_PRO: { - res = graph.build_siglip(); + builder = std::make_unique<clip_graph_siglip>(ctx, img); } break; case PROJECTOR_TYPE_PIXTRAL: case PROJECTOR_TYPE_LIGHTONOCR: { - res = graph.build_pixtral(); + builder = std::make_unique<clip_graph_pixtral>(ctx, img); } break; case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN25VL: { - res = graph.build_qwen2vl(); + builder = std::make_unique<clip_graph_qwen2vl>(ctx, img); } break; case PROJECTOR_TYPE_QWEN3VL: { - res = graph.build_qwen3vl(); + builder = std::make_unique<clip_graph_qwen3vl>(ctx, img); } break; case PROJECTOR_TYPE_MINICPMV: { - res = graph.build_minicpmv(); + builder = std::make_unique<clip_graph_minicpmv>(ctx, img); } break; case PROJECTOR_TYPE_INTERNVL: { - res = graph.build_internvl(); + builder = std::make_unique<clip_graph_internvl>(ctx, img); } break; case PROJECTOR_TYPE_LLAMA4: { - res = graph.build_llama4(); + builder = 
std::make_unique<clip_graph_llama4>(ctx, img); } break; case PROJECTOR_TYPE_ULTRAVOX: case PROJECTOR_TYPE_VOXTRAL: case PROJECTOR_TYPE_QWEN2A: { - res = graph.build_whisper_enc(); + builder = std::make_unique<clip_graph_whisper_enc>(ctx, img); } break; case PROJECTOR_TYPE_KIMIVL: { - res = graph.build_kimivl(); - } break; - case PROJECTOR_TYPE_JANUS_PRO: - { - res = graph.build_siglip(); + builder = std::make_unique<clip_graph_kimivl>(ctx, img); } break; case PROJECTOR_TYPE_COGVLM: { - res = graph.build_cogvlm(); + builder = std::make_unique<clip_graph_cogvlm>(ctx, img); + } break; + case PROJECTOR_TYPE_MLP: + case PROJECTOR_TYPE_MLP_NORM: + case PROJECTOR_TYPE_LDP: + case PROJECTOR_TYPE_LDPV2: + case PROJECTOR_TYPE_GLM_EDGE: + { + builder = std::make_unique<clip_graph_llava>(ctx, img); } break; case PROJECTOR_TYPE_DEEPSEEKOCR: - { - res = graph.build_deepseek_ocr(); - } break; - default: { - res = graph.build_llava(); + builder = std::make_unique<clip_graph_deepseekocr>(ctx, img); } break; + default: + GGML_ABORT("missing cgraph builder"); } - return res; + + return builder->build(); } +// +// clip_model_loader +// + struct clip_model_loader { ggml_context_ptr ctx_meta; gguf_context_ptr ctx_gguf; diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index e4f6566e15..144a3d7b44 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -7,6 +7,8 @@ // !!! Internal header, to be used by mtmd only !!! +#define MTMD_INTERNAL_HEADER + struct clip_ctx; struct clip_image_size { diff --git a/tools/mtmd/models/cogvlm.cpp b/tools/mtmd/models/cogvlm.cpp new file mode 100644 index 0000000000..d5b739c687 --- /dev/null +++ b/tools/mtmd/models/cogvlm.cpp @@ -0,0 +1,98 @@ +#include "models.h" + +ggml_cgraph * clip_graph_cogvlm::build() { + GGML_ASSERT(model.class_embedding != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + + const int n_pos = n_patches + 1; // +1 for [CLS] + + // build input and concatenate class embedding + ggml_tensor * inp = build_inp(); + inp = ggml_concat(ctx0, inp, model.class_embedding, 1); + + inp = ggml_add(ctx0, inp, model.position_embeddings); + cb(inp, "inp_pos", -1); + + ggml_tensor * inpL = inp; + + for (int il = 0; il < n_layer; il++) { + auto & layer = model.layers[il]; + ggml_tensor * cur = inpL; + + cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); + + cur = ggml_add(ctx0, cur, layer.qkv_b); + + ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float), + cur->nb[1], 0); + ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float), + cur->nb[1], n_embd * sizeof(float)); + ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float), + cur->nb[1], 2 * n_embd * sizeof(float)); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, nullptr, kq_scale, il); + cb(cur, "attn_out", il); + + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); + cb(cur, "attn_post_norm", il); + + cur = ggml_add(ctx0, cur, inpL); + inpL = cur; + + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + cb(cur, "ffn_out", il); + + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); + cb(cur, "ffn_post_norm", il); + + cur = ggml_add(ctx0, cur, inpL); + cb(cur, "layer_out", il); + inpL = cur; + + } + + // remove CLS token (like build_llama4 does) + ggml_tensor * cur = ggml_view_2d(ctx0, inpL, + n_embd, n_patches, + ggml_row_size(inpL->type, n_embd), 0); + + // Multiply with mm_model_proj + cur = 
ggml_mul_mat(ctx0, model.mm_model_proj, cur); + + // Apply layernorm, weight, bias + cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1); + + // Apply GELU + cur = ggml_gelu_inplace(ctx0, cur); + + // Branch 1: multiply with mm_h_to_4h_w + ggml_tensor * h_to_4h = ggml_mul_mat(ctx0, model.mm_h_to_4h_w, cur); + + // Branch 2: multiply with mm_gate_w + ggml_tensor * gate = ggml_mul_mat(ctx0, model.mm_gate_w, cur); + + // Apply silu + gate = ggml_swiglu_split(ctx0, gate, h_to_4h); + + // Apply mm_4h_to_h_w + cur = ggml_mul_mat(ctx0, model.mm_4h_to_h_w, gate); + + // Concatenate with boi and eoi + cur = ggml_concat(ctx0, model.mm_boi, cur, 1); + cur = ggml_concat(ctx0, cur, model.mm_eoi, 1); + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/tools/mtmd/models/deepseekocr.cpp b/tools/mtmd/models/deepseekocr.cpp new file mode 100644 index 0000000000..156b917b9a --- /dev/null +++ b/tools/mtmd/models/deepseekocr.cpp @@ -0,0 +1,325 @@ +#include "models.h" + +// Implementation based on approach suggested by Acly +// See: https://github.com/ggml-org/llama.cpp/pull/17383#issuecomment-3554227091 +static ggml_tensor * window_partition(ggml_context * ctx0, ggml_tensor * x, const int window) { + auto [c, w, h, b] = x->ne; + // same as + // x = ggml_win_part(m, x, window); + // x = ggml_reshape_3d(m, x, c, window * window, x->ne[3]); + + const int64_t px = (window - w % window) % window; + const int64_t py = (window - h % window) % window; + const int64_t npw = (w + px) / window; + const int64_t nph = (h + py) / window; + + ggml_tensor * cur = x; + if (px > 0 || py > 0) { + cur = ggml_pad(ctx0, cur, 0, static_cast(px), static_cast(py), 0); + } + cur = ggml_reshape_4d(ctx0, cur, c * window, npw, window, nph * b); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3)); + cur = ggml_reshape_4d(ctx0, cur, c, window, window, npw * nph * b); + return cur; +} + +// Implementation based on approach suggested by Acly +// See: https://github.com/ggml-org/llama.cpp/pull/17383#issuecomment-3554227091 +static ggml_tensor * window_unpartition(ggml_context * ctx0, + ggml_tensor * x, + const int w, + const int h, + const int window) { + const int64_t c = x->ne[0]; + // same as + // x = ggml_reshape_4d(m, x, c, window, window, x->ne[2]); + // x = ggml_win_unpart(m, x, w, h, window); + + const int64_t px = (window - w % window) % window; + const int64_t py = (window - h % window) % window; + const int64_t npw = (w + px) / window; + const int64_t nph = (h + py) / window; + + const int64_t b = x->ne[3] / (npw * nph); + ggml_tensor * cur = x; + cur = ggml_reshape_4d(ctx0, cur, c * window, window, npw, nph * b); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3)); + cur = ggml_reshape_4d(ctx0, cur, c, w + px, h + py, b); + cur = ggml_view_4d(ctx0, cur, cur->ne[0], w, h, cur->ne[3], cur->nb[1], cur->nb[2], cur->nb[3], 0); + cur = ggml_cont(ctx0, cur); + return cur; +} + +static ggml_tensor * get_rel_pos(ggml_context * ctx0, + ggml_tensor * rel_pos, // [L, C] + ggml_tensor * indices, // [q_size, k_size] + const int q_size, + const int k_size) { + const int64_t C = rel_pos->ne[0]; // channels + const int64_t L = rel_pos->ne[1]; // length + + GGML_ASSERT(indices != nullptr); + GGML_ASSERT(indices->type == GGML_TYPE_I32); + GGML_ASSERT(indices->ne[0] == k_size); + GGML_ASSERT(indices->ne[1] == q_size); + + const auto max_rel_dist = 2 * std::max(q_size, k_size) - 1; + ggml_tensor * cur = rel_pos; + + if (max_rel_dist != L) { + // 
Linear interpolation + const int64_t ne0 = cur->ne[0]; + const int64_t ne1 = cur->ne[1]; + const int64_t ne2 = cur->ne[2]; + const int64_t ne3 = cur->ne[3]; + + cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 0, 2, 3)), ne1, 1, ne0 * ne2 * ne3); + cur = ggml_reshape_4d( + ctx0, ggml_interpolate(ctx0, cur, max_rel_dist, 1, ne0 * ne2 * ne3, 1, GGML_SCALE_MODE_BILINEAR), + max_rel_dist, ne0, ne2, ne3); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 0, 2, 3)); + } + + // Flatten indices to 1D for ggml_get_rows + const int qk = q_size * k_size; + + cur = ggml_reshape_3d(ctx0, ggml_get_rows(ctx0, cur, ggml_reshape_1d(ctx0, indices, qk)), C, k_size, q_size); + + return cur; // [C, k_size, q_size] +} + +ggml_cgraph * clip_graph_deepseekocr::build() { + //patch embedding + ggml_tensor * inp_raw = build_inp_raw(); + //ggml_tensor * sam_out = build_sam(inp_raw); + + ggml_tensor * sam_out; + // Building SAM + { + const int n_embd = hparams.sam_n_embd; + const int n_layer = hparams.sam_n_layer; + const int n_heads = hparams.sam_n_head; + const int d_heads = n_embd / n_heads; + const int window = hparams.attn_window_size; + + ggml_tensor * inpL; + + inpL = ggml_conv_2d_sk_p0(ctx0, model.patch_embed_proj_w, inp_raw); + inpL = ggml_add(ctx0, inpL, ggml_reshape_3d(ctx0, model.patch_embed_proj_b, 1, 1, n_embd)); + inpL = ggml_cont(ctx0, ggml_permute(ctx0, inpL, 1, 2, 0, 3)); + + ggml_tensor * rel_pos_indices_local; + ggml_tensor * rel_pos_indices_global; + + rel_pos_indices_local = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, window, window); + rel_pos_indices_global = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, inpL->ne[1], inpL->ne[2]); + ggml_set_name(rel_pos_indices_local, "rel_pos_indices_local"); + ggml_set_name(rel_pos_indices_global, "rel_pos_indices_global"); + ggml_set_input(rel_pos_indices_local); + ggml_set_input(rel_pos_indices_global); + + ggml_tensor * cur; + const auto tgt_size = inpL->ne[1]; + const auto str_size = model.pos_embed->ne[1]; + + if (str_size != tgt_size) { + ggml_tensor * old_pos_embed = nullptr; + old_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, model.pos_embed, 2, 0, 1, 3)); + ggml_tensor * new_pos_embed = + ggml_interpolate(ctx0, old_pos_embed, tgt_size, tgt_size, n_embd, 1, GGML_SCALE_MODE_BICUBIC); + new_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, new_pos_embed, 1, 2, 0, 3)); + cur = ggml_add(ctx0, inpL, new_pos_embed); + } else { + cur = ggml_add(ctx0, inpL, model.pos_embed); + } + + // loop over layers + for (int il = 0; il < n_layer; il++) { + auto & layer = model.sam_layers[il]; + ggml_tensor * shortcut = cur; + + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); + + const int64_t w0 = cur->ne[1]; + const int64_t h0 = cur->ne[2]; + + ggml_tensor * indices; + + if (hparams.is_global_attn(il)) { + indices = rel_pos_indices_global; + } else { + // local attention layer - apply window partition + cur = window_partition(ctx0, cur, window); + indices = rel_pos_indices_local; + } + + const int64_t W = cur->ne[1]; + const int64_t H = cur->ne[2]; + // self-attention + { + const int B = cur->ne[3]; + + cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); + cur = ggml_add(ctx0, cur, layer.qkv_b); + cur = ggml_cont(ctx0, cur); // Ensure tensor is contiguous before reshape + cur = ggml_reshape_4d(ctx0, cur, n_embd, 3, W * H, B); + + ggml_tensor * Q; + ggml_tensor * K; + ggml_tensor * V; + + Q = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 0 * cur->nb[1]); + Q = ggml_reshape_4d(ctx0, ggml_cont(ctx0, Q), 
d_heads, n_heads, W * H, B); + + K = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 1 * cur->nb[1]); + K = ggml_reshape_4d(ctx0, ggml_cont(ctx0, K), d_heads, n_heads, W * H, B); + + V = ggml_view_3d(ctx0, cur, n_embd, W * H, B, cur->nb[2], cur->nb[3], 2 * cur->nb[1]); + V = ggml_reshape_4d(ctx0, ggml_cont(ctx0, V), d_heads, n_heads, W * H, B); + + ggml_tensor * mask; + ggml_tensor * rw; + ggml_tensor * rh; + ggml_tensor * qr; + + rw = get_rel_pos(ctx0, layer.rel_pos_w, indices, W, W); // [W, W, C] + rh = get_rel_pos(ctx0, layer.rel_pos_h, indices, H, H); // [H, H, C] + qr = ggml_permute(ctx0, Q, 0, 2, 1, 3); + qr = ggml_reshape_4d(ctx0, ggml_cont(ctx0, qr), d_heads, W, H, B * n_heads); + + rw = ggml_mul_mat(ctx0, rw, + ggml_cont(ctx0, ggml_permute(ctx0, qr, 0, 2, 1, 3))); // [B*n_heads, W, H, W] + rw = ggml_cont(ctx0, ggml_permute(ctx0, rw, 0, 2, 1, 3)); // [B*n_heads, H, W, W] + rw = ggml_reshape_4d(ctx0, rw, W, 1, W * H, n_heads * B); + rw = ggml_repeat_4d(ctx0, rw, W, H, W * H, n_heads * B); + rh = ggml_mul_mat(ctx0, rh, qr); // [B*n_heads, H, W, H] + rh = ggml_reshape_4d(ctx0, rh, 1, H, W * H, n_heads * B); + mask = ggml_add(ctx0, rw, rh); // [B*n_heads, H*W, H, W] + mask = ggml_reshape_4d(ctx0, mask, W * H, W * H, n_heads, B); + mask = ggml_cast(ctx0, mask, GGML_TYPE_F16); + + const float scale = 1.0f / sqrtf(static_cast(d_heads)); + + cur = build_attn(layer.o_w, layer.o_b, Q, K, V, mask, scale, + il); // [B, H*W, n_embd] + cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), n_embd, W, H, B); + } + + if (hparams.is_global_attn(il) == false) { + // local attention layer - reverse window partition + cur = window_unpartition(ctx0, cur, w0, h0, window); + } + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, shortcut); + + ggml_tensor * inpFF = cur; + + // layernorm2 + cur = build_norm(inpFF, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); + + // ffn + cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + // residual 2 + cur = ggml_add(ctx0, cur, inpFF); + cb(cur, "sam_layer_out", il); + } + + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); + + cur = ggml_conv_2d(ctx0, model.neck_0_w, cur, 1, 1, 0, 0, 1, 1); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3)); + cur = build_norm(cur, model.neck_1_w, model.neck_1_b, NORM_TYPE_NORMAL, hparams.eps, -1); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); + + cur = ggml_conv_2d(ctx0, model.neck_2_w, cur, 1, 1, 1, 1, 1, 1); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3)); + cur = build_norm(cur, model.neck_3_w, model.neck_3_b, NORM_TYPE_NORMAL, hparams.eps, -1); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); + + cur = ggml_conv_2d(ctx0, model.net_2, cur, 2, 2, 1, 1, 1, 1); + cur = ggml_conv_2d(ctx0, model.net_3, cur, 2, 2, 1, 1, 1, 1); + cb(cur, "sam_output", -1); + + ggml_build_forward_expand(gf, cur); + sam_out = cur; + } + //ggml_tensor * clip_out = build_dsocr_clip(sam_out); + ggml_tensor * clip_out; + // Building DS-OCR CLIP + { + ggml_tensor * inp; + + inp = ggml_cpy(ctx0, sam_out, ggml_dup_tensor(ctx0, sam_out)); + inp = ggml_reshape_2d(ctx0, inp, inp->ne[0] * inp->ne[1], inp->ne[2]); + inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); + + ggml_tensor * new_pos_embd = + ggml_cpy(ctx0, model.position_embeddings, ggml_dup_tensor(ctx0, model.position_embeddings)); + + int n_pos = new_pos_embd->ne[1]; // +1 for [CLS] + const auto tgt_size = 
static_cast(std::sqrt(inp->ne[1])); + const auto src_size = static_cast(std::sqrt(n_pos - 1)); + + if (tgt_size != src_size) { + ggml_tensor * old_pos_embd; + ggml_tensor * cls_tok; + + old_pos_embd = ggml_view_2d(ctx0, new_pos_embd, new_pos_embd->ne[0], src_size * src_size, + ggml_row_size(new_pos_embd->type, new_pos_embd->ne[0]), 0); + cls_tok = ggml_view_2d(ctx0, new_pos_embd, new_pos_embd->ne[0], 1, + ggml_row_size(new_pos_embd->type, new_pos_embd->ne[0]), src_size * src_size); + new_pos_embd = ggml_interpolate(ctx0, old_pos_embd, tgt_size, tgt_size, new_pos_embd->ne[0], 1, + GGML_SCALE_MODE_BICUBIC); + new_pos_embd = ggml_reshape_3d(ctx0, new_pos_embd, n_embd, tgt_size * tgt_size, 1); + new_pos_embd = ggml_concat(ctx0, new_pos_embd, cls_tok, 1); + n_pos = tgt_size * tgt_size + 1; + } + + // add CLS token + inp = ggml_concat(ctx0, model.class_embedding, inp, 1); + + // for selecting learned pos embd, used by ViT + ggml_tensor * positions = ggml_cast(ctx0, ggml_arange(ctx0, 0, n_pos, 1), GGML_TYPE_I32); + ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, new_pos_embd, positions); + + ggml_tensor * cur = build_vit(inp, n_pos, NORM_TYPE_NORMAL, FFN_GELU_QUICK, learned_pos_embd, nullptr); + + ggml_build_forward_expand(gf, cur); + clip_out = cur; + } + + const int clip_n_patches = sam_out->ne[0] * sam_out->ne[1]; + + sam_out = ggml_cont(ctx0, ggml_permute(ctx0, sam_out, 1, 2, 0, 3)); + sam_out = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0], clip_n_patches); + clip_out = ggml_view_2d(ctx0, clip_out, n_embd, clip_n_patches, clip_out->nb[1], clip_out->nb[1]); + + ggml_tensor * cur; + cur = ggml_concat(ctx0, clip_out, sam_out, 0); + cur = ggml_reshape_2d(ctx0, cur, 2 * n_embd, clip_n_patches); + cur = ggml_cont(ctx0, cur); + cur = ggml_mul_mat(ctx0, model.fc_w, cur); + cur = ggml_add(ctx0, cur, model.fc_b); + + const auto h = static_cast(std::sqrt(static_cast(cur->ne[1]))); + const auto w = h; + const auto n_dim = cur->ne[0]; + + ggml_tensor * imgnl; + ggml_tensor * vs; + + imgnl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, h, 1); + vs = ggml_reshape_2d(ctx0, model.view_seperator, n_dim, 1); // (n_dim, 1) + cur = ggml_reshape_3d(ctx0, cur, n_dim, w, h); + cur = ggml_reshape_2d(ctx0, ggml_concat(ctx0, cur, imgnl, 1), n_dim, (w + 1) * h); + cur = ggml_concat(ctx0, cur, vs, 1); // (n_dim, h*(w+1) + 1) + + cb(cur, "dsocr_output", -1); + + ggml_build_forward_expand(gf, cur); + return gf; +} diff --git a/tools/mtmd/models/internvl.cpp b/tools/mtmd/models/internvl.cpp new file mode 100644 index 0000000000..9aded3b97c --- /dev/null +++ b/tools/mtmd/models/internvl.cpp @@ -0,0 +1,69 @@ +#include "models.h" + +ggml_cgraph * clip_graph_internvl::build() { + GGML_ASSERT(model.class_embedding != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + + const int n_pos = n_patches + 1; + ggml_tensor * inp = build_inp(); + + // add CLS token + inp = ggml_concat(ctx0, inp, model.class_embedding, 1); + + // The larger models use a different ViT, which uses RMS norm instead of layer norm + // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188 + norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45) + ? 
NORM_TYPE_RMS // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B) + : NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models) + + ggml_tensor * cur = build_vit( + inp, n_pos, + norm_t, + hparams.ffn_op, + model.position_embeddings, + nullptr); + + // remove CLS token + cur = ggml_view_2d(ctx0, cur, + n_embd, n_patches, + ggml_row_size(cur->type, n_embd), 0); + + // pixel shuffle + { + const int scale_factor = model.hparams.n_merge; + const int bsz = 1; // batch size, always 1 for now since we don't support batching + const int height = n_patches_y; + const int width = n_patches_x; + GGML_ASSERT(scale_factor > 0); + cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + cur = ggml_cont_4d(ctx0, cur, + n_embd * scale_factor * scale_factor, + height / scale_factor, + width / scale_factor, + bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + // flatten to 2D + cur = ggml_cont_2d(ctx0, cur, + n_embd * scale_factor * scale_factor, + cur->ne[1] * cur->ne[2]); + } + + // projector (always using GELU activation) + { + // projector LayerNorm uses pytorch's default eps = 1e-5 + // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79 + cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1); + cur = build_ffn(cur, + model.mm_1_w, model.mm_1_b, + nullptr, nullptr, + model.mm_3_w, model.mm_3_b, + FFN_GELU, + -1); + } + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/tools/mtmd/models/kimivl.cpp b/tools/mtmd/models/kimivl.cpp new file mode 100644 index 0000000000..0a06f5090e --- /dev/null +++ b/tools/mtmd/models/kimivl.cpp @@ -0,0 +1,63 @@ +#include "models.h" + +ggml_cgraph * clip_graph_kimivl::build() { + // 2D input positions + ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_h, "pos_h"); + ggml_set_input(pos_h); + + ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_w, "pos_w"); + ggml_set_input(pos_w); + + ggml_tensor * learned_pos_embd = resize_position_embeddings(); + + // build ViT with 2D position embeddings + auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { + // first half is X axis and second half is Y axis + return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false); + }; + + ggml_tensor * inp = build_inp(); + ggml_tensor * cur = build_vit( + inp, n_patches, + NORM_TYPE_NORMAL, + hparams.ffn_op, + learned_pos_embd, + add_pos); + + cb(cur, "vit_out", -1); + + { + // patch_merger + const int scale_factor = model.hparams.n_merge; + cur = build_patch_merge_permute(cur, scale_factor); + + // projection norm + int proj_inp_dim = cur->ne[0]; + cur = ggml_view_2d(ctx0, cur, + n_embd, cur->ne[1] * scale_factor * scale_factor, + ggml_row_size(cur->type, n_embd), 0); + cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm + cur = ggml_mul(ctx0, cur, model.mm_input_norm_w); + cur = ggml_add(ctx0, cur, model.mm_input_norm_b); + cur = ggml_view_2d(ctx0, cur, + proj_inp_dim, cur->ne[1] / scale_factor / scale_factor, + ggml_row_size(cur->type, proj_inp_dim), 0); + cb(cur, "proj_inp_normed", -1); + + // projection mlp + cur = build_ffn(cur, + model.mm_1_w, model.mm_1_b, + nullptr, nullptr, + model.mm_2_w, model.mm_2_b, + FFN_GELU, + -1); + cb(cur, "proj_out", -1); + } + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff 
--git a/tools/mtmd/models/llama4.cpp b/tools/mtmd/models/llama4.cpp new file mode 100644 index 0000000000..30d1df5bcd --- /dev/null +++ b/tools/mtmd/models/llama4.cpp @@ -0,0 +1,96 @@ +#include "models.h" + +ggml_cgraph * clip_graph_llama4::build() { + GGML_ASSERT(model.class_embedding != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + + const int n_pos = n_patches + 1; // +1 for [CLS] + + // 2D input positions + ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); + ggml_set_name(pos_h, "pos_h"); + ggml_set_input(pos_h); + + ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); + ggml_set_name(pos_w, "pos_w"); + ggml_set_input(pos_w); + + ggml_tensor * inp = build_inp_raw(); + + // Llama4UnfoldConvolution + { + ggml_tensor * kernel = ggml_reshape_4d(ctx0, model.patch_embeddings_0, + patch_size, patch_size, 3, n_embd); + inp = ggml_im2col(ctx0, kernel, inp, patch_size, patch_size, 0, 0, 1, 1, true, inp->type); + inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp); + inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches); + cb(inp, "patch_conv", -1); + } + + // add CLS token + inp = ggml_concat(ctx0, inp, model.class_embedding, 1); + + // build ViT with 2D position embeddings + auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { + // first half is X axis and second half is Y axis + // ref: https://github.com/huggingface/transformers/blob/40a493c7ed4f19f08eadb0639cf26d49bfa5e180/src/transformers/models/llama4/modeling_llama4.py#L1312 + // ref: https://github.com/Blaizzy/mlx-vlm/blob/a57156aa87b33cca6e5ee6cfc14dd4ef8f611be6/mlx_vlm/models/llama4/vision.py#L441 + return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false); + }; + ggml_tensor * cur = build_vit( + inp, n_pos, + NORM_TYPE_NORMAL, + hparams.ffn_op, + model.position_embeddings, + add_pos); + + // remove CLS token + cur = ggml_view_2d(ctx0, cur, + n_embd, n_patches, + ggml_row_size(cur->type, n_embd), 0); + + // pixel shuffle + // based on Llama4VisionPixelShuffleMLP + // https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151 + { + const int scale_factor = model.hparams.n_merge; + const int bsz = 1; // batch size, always 1 for now since we don't support batching + GGML_ASSERT(scale_factor > 0); + GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images + cur = ggml_reshape_4d(ctx0, cur, + n_embd * scale_factor, + n_patches_x / scale_factor, + n_patches_y, + bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + cur = ggml_cont_4d(ctx0, cur, + n_embd * scale_factor * scale_factor, + n_patches_x / scale_factor, + n_patches_y / scale_factor, + bsz); + //cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + // flatten to 2D + cur = ggml_cont_2d(ctx0, cur, + n_embd * scale_factor * scale_factor, + n_patches / scale_factor / scale_factor); + cb(cur, "pixel_shuffle", -1); + } + + // based on Llama4VisionMLP2 (always uses GELU activation, no bias) + { + cur = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, cur); + cur = ggml_gelu(ctx0, cur); + cur = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, cur); + cur = ggml_gelu(ctx0, cur); + cb(cur, "adapter_mlp", -1); + } + + // Llama4MultiModalProjector + cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur); + cb(cur, "projected", -1); + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/tools/mtmd/models/llava.cpp b/tools/mtmd/models/llava.cpp new file mode 100644 index 0000000000..0bfb5f05f6 --- 
/dev/null +++ b/tools/mtmd/models/llava.cpp @@ -0,0 +1,374 @@ +#include "models.h" + +// this graph is used by llava, granite and glm +// due to having embedding_stack (used by granite), we cannot reuse build_vit +ggml_cgraph * clip_graph_llava::build() { + const int batch_size = 1; + const int n_pos = n_patches + (model.class_embedding ? 1 : 0); + + GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported"); + + // Calculate the deepest feature layer based on hparams and projector type + int max_feature_layer = n_layer; + { + // Get the index of the second to last layer; this is the default for models that have a llava projector + int il_last = hparams.n_layer - 1; + int deepest_feature_layer = -1; + + if (proj_type == PROJECTOR_TYPE_MINICPMV || proj_type == PROJECTOR_TYPE_GLM_EDGE) { + il_last += 1; + } + + // If we set explicit vision feature layers, only go up to the deepest one + // NOTE: only used by granite-vision models for now + for (const auto & feature_layer : hparams.vision_feature_layer) { + if (feature_layer > deepest_feature_layer) { + deepest_feature_layer = feature_layer; + } + } + max_feature_layer = deepest_feature_layer < 0 ? il_last : deepest_feature_layer; + } + + ggml_tensor * inp = build_inp(); + + // concat class_embeddings and patch_embeddings + if (model.class_embedding) { + inp = ggml_concat(ctx0, inp, model.class_embedding, 1); + } + + ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + inp = ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions)); + + ggml_tensor * inpL = inp; + + // pre-layernorm + if (model.pre_ln_w) { + inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1); + cb(inpL, "pre_ln", -1); + } + + std::vector embedding_stack; + const auto & vision_feature_layer = hparams.vision_feature_layer; + + // loop over layers + for (int il = 0; il < max_feature_layer; il++) { + auto & layer = model.layers[il]; + ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states + + // If this is an embedding feature layer, save the output. + // NOTE: 0 index here refers to the input to the encoder. 
+ if (vision_feature_layer.find(il) != vision_feature_layer.end()) { + embedding_stack.push_back(cur); + } + + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); + cb(cur, "layer_inp_normed", il); + + // self-attention + { + ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur); + if (layer.q_b) { + Qcur = ggml_add(ctx0, Qcur, layer.q_b); + } + + ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur); + if (layer.k_b) { + Kcur = ggml_add(ctx0, Kcur, layer.k_b); + } + + ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur); + if (layer.v_b) { + Vcur = ggml_add(ctx0, Vcur, layer.v_b); + } + + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos); + Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos); + Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, nullptr, kq_scale, il); + cb(cur, "attn_out", il); + } + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, inpL); + + inpL = cur; // inpL = residual, cur = hidden_states + + cb(cur, "ffn_inp", il); + + // layernorm2 + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); + cb(cur, "ffn_inp_normed", il); + + // ffn + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + cb(cur, "ffn_out", il); + + // residual 2 + cur = ggml_add(ctx0, inpL, cur); + cb(cur, "layer_out", il); + + inpL = cur; + } + + // post-layernorm + if (model.post_ln_w) { + inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1); + } + + ggml_tensor * embeddings = inpL; + + // process vision feature layers (used by granite) + { + // final layer is a vision feature layer + if (vision_feature_layer.find(max_feature_layer) != vision_feature_layer.end()) { + embedding_stack.push_back(inpL); + } + + // If feature layers are explicitly set, stack them (if we have multiple) + if (!embedding_stack.empty()) { + embeddings = embedding_stack[0]; + for (size_t i = 1; i < embedding_stack.size(); i++) { + embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0); + } + } + } + + // llava projector (also used by granite) + if (hparams.has_llava_projector) { + embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); + + ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(patches, "patches"); + ggml_set_input(patches); + + // shape [1, 576, 1024] + // ne is whcn, ne = [1024, 576, 1, 1] + embeddings = ggml_get_rows(ctx0, embeddings, patches); + + // print_tensor_info(embeddings, "embeddings"); + + // llava projector + if (proj_type == PROJECTOR_TYPE_MLP) { + embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); + + embeddings = ggml_gelu(ctx0, embeddings); + if (model.mm_2_w) { + embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); + } + } + else if (proj_type == PROJECTOR_TYPE_MLP_NORM) { + embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); + // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false); + // First LayerNorm + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w), + model.mm_1_b); + + // 
GELU activation + embeddings = ggml_gelu(ctx0, embeddings); + + // Second linear layer + embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_3_b); + + // Second LayerNorm + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w), + model.mm_4_b); + } + else if (proj_type == PROJECTOR_TYPE_LDP) { + // MobileVLM projector + int n_patch = 24; + ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings); + mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b); + mlp_1 = ggml_gelu(ctx0, mlp_1); + ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1); + mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b); + // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1] + + // block 1 + ggml_tensor * block_1 = nullptr; + { + // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24] + mlp_3 = ggml_permute(ctx0, mlp_3, 1, 0, 2, 3); + mlp_3 = ggml_cont_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]); + // stride = 1, padding = 1, bias is nullptr + block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1); + + // layer norm + // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); + // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); + + // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + // hardswish + ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); + + block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); + // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + // pointwise conv + block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b); + block_1 = ggml_relu(ctx0, block_1); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b); + block_1 = ggml_hardsigmoid(ctx0, block_1); + // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1] + block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); + block_1 = ggml_mul(ctx0, block_1_hw, block_1); + + int w = block_1->ne[0], h = block_1->ne[1]; + block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); + + // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1); + block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); + + // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); + // block1 shape = [1, 2048, 24, 24], 
ne = [24, 24, 2048, 1] + // residual + block_1 = ggml_add(ctx0, mlp_3, block_1); + } + + // block_2 + { + // stride = 2 + block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1); + + // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] + // layer norm + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); + // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); + // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] + // hardswish + ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); + + // not sure the parameters is right for globalAvgPooling + block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); + // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + // pointwise conv + block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b); + block_1 = ggml_relu(ctx0, block_1); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b); + block_1 = ggml_hardsigmoid(ctx0, block_1); + + // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); + block_1 = ggml_mul(ctx0, block_1_hw, block_1); + + int w = block_1->ne[0], h = block_1->ne[1]; + block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); + // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1); + block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); + + + // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b); + block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]); + // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1] + } + embeddings = block_1; + } + else if (proj_type == PROJECTOR_TYPE_LDPV2) + { + int n_patch = 24; + ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); + mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b); + mlp_0 = ggml_gelu(ctx0, mlp_0); + ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0); + mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b); + // mlp_2 ne = [2048, 576, 1, 1] + // // AVG Pool Layer 2*2, strides = 2 + mlp_2 = ggml_permute(ctx0, mlp_2, 1, 0, 2, 3); + // mlp_2 ne = [576, 2048, 1, 1] + mlp_2 = ggml_cont_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]); + // mlp_2 ne [24, 24, 2048, 1] + mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0); + // weight ne = [3, 3, 2048, 1] + ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1); + peg_0 = 
ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3)); + peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b); + mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3)); + peg_0 = ggml_add(ctx0, peg_0, mlp_2); + peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]); + embeddings = peg_0; + } + else { + GGML_ABORT("fatal error"); + } + } + + // glm projector + else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) { + size_t gridsz = (size_t)sqrt(embeddings->ne[1]); + embeddings = ggml_permute(ctx0,embeddings,1,0,2,3); + embeddings = ggml_cont_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]); + embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1); + embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size); + embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3)); + embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b); + // GLU + { + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b); + embeddings = ggml_gelu_inplace(ctx0, embeddings); + ggml_tensor * x = embeddings; + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings); + x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x); + embeddings = ggml_swiglu_split(ctx0, embeddings, x); + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings); + } + // arrangement of BOI/EOI token embeddings + // note: these embeddings are not present in text model, hence we cannot process them as text tokens + // see: https://huggingface.co/THUDM/glm-edge-v-2b/blob/main/siglip.py#L53 + { + embeddings = ggml_concat(ctx0, model.mm_boi, embeddings, 1); // BOI + embeddings = ggml_concat(ctx0, embeddings, model.mm_eoi, 1); // EOI + } + } + + else { + GGML_ABORT("llava: unknown projector type"); + } + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; +} diff --git a/tools/mtmd/models/minicpmv.cpp b/tools/mtmd/models/minicpmv.cpp new file mode 100644 index 0000000000..3594ea29fa --- /dev/null +++ b/tools/mtmd/models/minicpmv.cpp @@ -0,0 +1,114 @@ +#include "models.h" + +ggml_cgraph * clip_graph_minicpmv::build() { + GGML_ASSERT(model.class_embedding == nullptr); + const int n_pos = n_patches; + const int n_embd_proj = n_mmproj_embd; + + // position embeddings for the projector (not for ViT) + // see: https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/resampler.py#L70 + // base frequency omega + ggml_tensor * omega = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_embd_proj / 4); + ggml_set_name(omega, "omega"); + ggml_set_input(omega); + + // 2D input positions (using float for sinusoidal embeddings) + ggml_tensor * pos_h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos); + ggml_set_name(pos_h, "pos_h"); + ggml_set_input(pos_h); + ggml_tensor * pos_w = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos); + ggml_set_name(pos_w, "pos_w"); + ggml_set_input(pos_w); + + // for selecting learned pos embd, used by ViT + struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions); + + ggml_tensor * inp = build_inp(); + ggml_tensor * embeddings = build_vit( + inp, n_pos, + NORM_TYPE_NORMAL, + 
hparams.ffn_op, + learned_pos_embd, + nullptr); + + // resampler projector (it is just another transformer) + + ggml_tensor * q = model.mm_model_query; + ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings); + + // norm + q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1); + v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1); + + // calculate sinusoidal pos embd + ggml_tensor * pos_embed = nullptr; + { + // outer product + ggml_tensor * omega_b = ggml_repeat_4d(ctx0, omega, omega->ne[0], n_pos, 1, 1); // n_pos rows + ggml_tensor * theta_x = ggml_mul(ctx0, omega_b, pos_w); + ggml_tensor * theta_y = ggml_mul(ctx0, omega_b, pos_h); + // sin and cos + ggml_tensor * pos_embd_x = ggml_concat( + ctx0, + ggml_sin(ctx0, theta_x), + ggml_cos(ctx0, theta_x), + 0 // concat on first dim + ); + ggml_tensor * pos_embd_y = ggml_concat( + ctx0, + ggml_sin(ctx0, theta_y), + ggml_cos(ctx0, theta_y), + 0 // concat on first dim + ); + pos_embed = ggml_concat(ctx0, pos_embd_x, pos_embd_y, 0); + } + + // k = v + pos_embed + ggml_tensor * k = ggml_add(ctx0, v, pos_embed); + + // attention + { + const int d_head = 128; + int n_head = n_embd_proj/d_head; + // Use actual config value if available, otherwise fall back to hardcoded values + int num_query = hparams.minicpmv_query_num; + ggml_tensor * Q = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), + model.mm_model_attn_q_b); + ggml_tensor * K = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), + model.mm_model_attn_k_b); + ggml_tensor * V = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), + model.mm_model_attn_v_b); + + Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_query); + K = ggml_reshape_3d(ctx0, K, d_head, n_head, n_pos); + V = ggml_reshape_3d(ctx0, V, d_head, n_head, n_pos); + + cb(Q, "resampler_Q", -1); + cb(K, "resampler_K", -1); + cb(V, "resampler_V", -1); + + float resampler_kq_scale = 1.0f/ sqrtf(float(d_head)); + embeddings = build_attn( + model.mm_model_attn_o_w, + model.mm_model_attn_o_b, + Q, K, V, nullptr, resampler_kq_scale, -1); + cb(embeddings, "resampler_attn_out", -1); + } + // layernorm + embeddings = build_norm(embeddings, model.mm_model_ln_post_w, model.mm_model_ln_post_b, NORM_TYPE_NORMAL, eps, -1); + + // projection + embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings); + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; +} diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h new file mode 100644 index 0000000000..bf020516fb --- /dev/null +++ b/tools/mtmd/models/models.h @@ -0,0 +1,63 @@ +#pragma once + +#include "../clip-graph.h" + +struct clip_graph_siglip : clip_graph { + clip_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_pixtral : clip_graph { + clip_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_qwen2vl : clip_graph { + clip_graph_qwen2vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_qwen3vl : clip_graph { + clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_minicpmv : clip_graph { + clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : 
clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_internvl : clip_graph { + clip_graph_internvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_llama4 : clip_graph { + clip_graph_llama4(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_kimivl : clip_graph { + clip_graph_kimivl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_cogvlm : clip_graph { + clip_graph_cogvlm(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_llava : clip_graph { + clip_graph_llava(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_whisper_enc : clip_graph { + clip_graph_whisper_enc(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_deepseekocr : clip_graph { + clip_graph_deepseekocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; diff --git a/tools/mtmd/models/pixtral.cpp b/tools/mtmd/models/pixtral.cpp new file mode 100644 index 0000000000..a849210b53 --- /dev/null +++ b/tools/mtmd/models/pixtral.cpp @@ -0,0 +1,86 @@ +#include "models.h" + +ggml_cgraph * clip_graph_pixtral::build() { + const int n_merge = hparams.n_merge; + + // 2D input positions + ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_h, "pos_h"); + ggml_set_input(pos_h); + + ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_w, "pos_w"); + ggml_set_input(pos_w); + + auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { + return build_rope_2d(ctx0, cur, pos_h, pos_w, hparams.rope_theta, true); + }; + + ggml_tensor * inp = build_inp(); + ggml_tensor * cur = build_vit( + inp, n_patches, + NORM_TYPE_RMS, + hparams.ffn_op, + nullptr, // no learned pos embd + add_pos); + + // mistral small 3.1 patch merger + // ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67 + if (model.mm_patch_merger_w) { + GGML_ASSERT(hparams.n_merge > 0); + + cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w); + + // reshape image tokens to 2D grid + cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y); + cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, n_embd] + cur = ggml_cont(ctx0, cur); + + // torch.nn.functional.unfold is just an im2col under the hood + // we just need a dummy kernel to make it work + ggml_tensor * kernel = ggml_view_3d(ctx0, cur, n_merge, n_merge, cur->ne[2], 0, 0, 0); + cur = ggml_im2col(ctx0, kernel, cur, n_merge, n_merge, 0, 0, 1, 1, true, inp->type); + + // project to n_embd + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]); + cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur); + } + + // LlavaMultiModalProjector (always using GELU activation) + { + cur = build_ffn(cur, + model.mm_1_w, model.mm_1_b, + nullptr, nullptr, + model.mm_2_w, model.mm_2_b, + FFN_GELU, + -1); + } + + // arrangement of the [IMG_BREAK] token + if (model.token_embd_img_break) { + // not efficient, but works + // the trick is to view the embeddings as a 3D tensor with 
shape [n_embd, n_patches_per_row, n_rows] + // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension + // after the concatenation, we have a tensor with shape [n_embd, n_patches_per_row + 1, n_rows] + + const int p_y = n_merge > 0 ? n_patches_y / n_merge : n_patches_y; + const int p_x = n_merge > 0 ? n_patches_x / n_merge : n_patches_x; + const int p_total = p_x * p_y; + const int n_embd_text = cur->ne[0]; + const int n_tokens_output = p_total + p_y - 1; // one [IMG_BREAK] per row, except the last row + + ggml_tensor * tmp = ggml_reshape_3d(ctx0, cur, n_embd_text, p_x, p_y); + ggml_tensor * tok = ggml_new_tensor_3d(ctx0, tmp->type, n_embd_text, 1, p_y); + tok = ggml_scale(ctx0, tok, 0.0); // clear the tensor + tok = ggml_add(ctx0, tok, model.token_embd_img_break); + tmp = ggml_concat(ctx0, tmp, tok, 1); + cur = ggml_view_2d(ctx0, tmp, + n_embd_text, n_tokens_output, + ggml_row_size(tmp->type, n_embd_text), 0); + } + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/tools/mtmd/models/qwen2vl.cpp b/tools/mtmd/models/qwen2vl.cpp new file mode 100644 index 0000000000..85f158bb1c --- /dev/null +++ b/tools/mtmd/models/qwen2vl.cpp @@ -0,0 +1,183 @@ +#include "models.h" + +ggml_cgraph * clip_graph_qwen2vl::build() { + GGML_ASSERT(model.patch_bias == nullptr); + GGML_ASSERT(model.class_embedding == nullptr); + + const int batch_size = 1; + const bool use_window_attn = hparams.n_wa_pattern > 0; + const int n_wa_pattern = hparams.n_wa_pattern; + const int n_pos = n_patches; + const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position + + norm_type norm_t = proj_type == PROJECTOR_TYPE_QWEN25VL + ? NORM_TYPE_RMS // qwen 2.5 vl + : NORM_TYPE_NORMAL; // qwen 2 vl + + int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; + + ggml_tensor * inp_raw = build_inp_raw(); + ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + + GGML_ASSERT(img.nx % (patch_size * 2) == 0); + GGML_ASSERT(img.ny % (patch_size * 2) == 0); + + // second conv dimension + { + auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_add(ctx0, inp, inp_1); + + inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b] + inp = ggml_cont_4d( + ctx0, inp, + n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); + inp = ggml_reshape_4d( + ctx0, inp, + n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); + inp = ggml_permute(ctx0, inp, 0, 2, 1, 3); + inp = ggml_cont_3d( + ctx0, inp, + n_embd, n_patches_x * n_patches_y, batch_size); + } + + ggml_tensor * inpL = inp; + ggml_tensor * window_mask = nullptr; + ggml_tensor * window_idx = nullptr; + ggml_tensor * inv_window_idx = nullptr; + + ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + // pre-layernorm + if (model.pre_ln_w) { + inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); + } + + if (use_window_attn) { + // handle window attention inputs + inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4); + ggml_set_name(inv_window_idx, "inv_window_idx"); + ggml_set_input(inv_window_idx); + // mask for window attention + window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos); + ggml_set_name(window_mask, "window_mask"); + ggml_set_input(window_mask); + + // if flash attn is used, we 
need to pad the mask and cast to f16 + if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) { + window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16); + } + + // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size] + GGML_ASSERT(batch_size == 1); + inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4); + inpL = ggml_get_rows(ctx0, inpL, inv_window_idx); + inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size); + } + + // loop over layers + for (int il = 0; il < n_layer; il++) { + const auto & layer = model.layers[il]; + const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true; + + ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states + + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); + cb(cur, "ln1", il); + + // self-attention + { + ggml_tensor * Qcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b); + ggml_tensor * Kcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b); + ggml_tensor * Vcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b); + + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches); + Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches); + Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + // apply M-RoPE + Qcur = ggml_rope_multi( + ctx0, Qcur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + Kcur = ggml_rope_multi( + ctx0, Kcur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + + cb(Qcur, "Qcur_rope", il); + cb(Kcur, "Kcur_rope", il); + + ggml_tensor * attn_mask = full_attn ? 
nullptr : window_mask; + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, attn_mask, kq_scale, il); + cb(cur, "attn_out", il); + } + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, inpL); + + inpL = cur; // inpL = residual, cur = hidden_states + + cb(cur, "ffn_inp", il); + + // layernorm2 + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); + cb(cur, "ffn_inp_normed", il); + + // ffn + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + cb(cur, "ffn_out", il); + + // residual 2 + cur = ggml_add(ctx0, inpL, cur); + cb(cur, "layer_out", il); + + inpL = cur; + } + + // post-layernorm + if (model.post_ln_w) { + inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer); + } + + // multimodal projection + ggml_tensor * embeddings = inpL; + embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size); + embeddings = build_ffn(embeddings, + model.mm_0_w, model.mm_0_b, + nullptr, nullptr, + model.mm_1_w, model.mm_1_b, + FFN_GELU, + -1); + + if (use_window_attn) { + window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4); + ggml_set_name(window_idx, "window_idx"); + ggml_set_input(window_idx); + + // embeddings shape: [n_embd, n_patches_x * n_patches_y, batch_size] + GGML_ASSERT(batch_size == 1); + embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4); + embeddings = ggml_get_rows(ctx0, embeddings, window_idx); + embeddings = ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4, batch_size); + } + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; +} diff --git a/tools/mtmd/models/qwen3vl.cpp b/tools/mtmd/models/qwen3vl.cpp new file mode 100644 index 0000000000..35a42cb84d --- /dev/null +++ b/tools/mtmd/models/qwen3vl.cpp @@ -0,0 +1,191 @@ +#include "models.h" + +ggml_cgraph * clip_graph_qwen3vl::build() { + GGML_ASSERT(model.patch_bias != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + GGML_ASSERT(model.class_embedding == nullptr); + + const int batch_size = 1; + const int n_pos = n_patches; + const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position + + norm_type norm_t = NORM_TYPE_NORMAL; + + int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; + + ggml_tensor * inp_raw = build_inp_raw(); + ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + + GGML_ASSERT(img.nx % (patch_size * 2) == 0); + GGML_ASSERT(img.ny % (patch_size * 2) == 0); + + // second conv dimension + { + auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_add(ctx0, inp, inp_1); + + inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b] + inp = ggml_cont_4d( + ctx0, inp, + n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); + inp = ggml_reshape_4d( + ctx0, inp, + n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); + inp = ggml_permute(ctx0, inp, 0, 2, 1, 3); + inp = ggml_cont_3d( + ctx0, inp, + n_embd, n_patches_x * n_patches_y, batch_size); + } + + // add patch bias + if (model.patch_bias != nullptr) { + inp = ggml_add(ctx0, inp, model.patch_bias); + cb(inp, "patch_bias", -1); + } + + // calculate absolute position embedding and apply + ggml_tensor * learned_pos_embd = resize_position_embeddings(); + 
learned_pos_embd = ggml_cont_4d( + ctx0, learned_pos_embd, + n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); + learned_pos_embd = ggml_reshape_4d( + ctx0, learned_pos_embd, + n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); + learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3); + learned_pos_embd = ggml_cont_3d( + ctx0, learned_pos_embd, + n_embd, n_patches_x * n_patches_y, batch_size); + inp = ggml_add(ctx0, inp, learned_pos_embd); + cb(inp, "inp_pos_emb", -1); + + ggml_tensor * inpL = inp; + + ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + // pre-layernorm + if (model.pre_ln_w) { + inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); + } + + // deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size] + ggml_tensor * deepstack_features = nullptr; + const int merge_factor = hparams.n_merge > 0 ? hparams.n_merge * hparams.n_merge : 4; // default 2x2=4 for qwen3vl + + // loop over layers + for (int il = 0; il < n_layer; il++) { + auto & layer = model.layers[il]; + + ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states + + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); + cb(cur, "ln1", il); + + // self-attention + { + cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); + cur = ggml_add(ctx0, cur, layer.qkv_b); + + ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ 0); + + ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ ggml_row_size(cur->type, n_embd)); + + ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ ggml_row_size(cur->type, 2 * n_embd)); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + // apply M-RoPE + Qcur = ggml_rope_multi( + ctx0, Qcur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + Kcur = ggml_rope_multi( + ctx0, Kcur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + + cb(Qcur, "Qcur_rope", il); + cb(Kcur, "Kcur_rope", il); + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, nullptr, kq_scale, il); + cb(cur, "attn_out", il); + } + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, inpL); + + inpL = cur; // inpL = residual, cur = hidden_states + + cb(cur, "ffn_inp", il); + + // layernorm2 + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); + cb(cur, "ffn_inp_normed", il); + + // ffn + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + cb(cur, "ffn_out", il); + + // residual 2 + cur = ggml_add(ctx0, inpL, cur); + cb(cur, "layer_out", il); + + if (layer.has_deepstack()) { + ggml_tensor * feat = ggml_reshape_3d(ctx0, cur, n_embd * merge_factor, n_pos / merge_factor, batch_size); + feat = build_norm(feat, layer.deepstack_norm_w, layer.deepstack_norm_b, norm_t, eps, il); + feat = build_ffn(feat, + layer.deepstack_fc1_w, layer.deepstack_fc1_b, + nullptr, nullptr, + layer.deepstack_fc2_w, layer.deepstack_fc2_b, + 
ffn_op_type::FFN_GELU, il); + + if(!deepstack_features) { + deepstack_features = feat; + } else { + // concat along the feature dimension + deepstack_features = ggml_concat(ctx0, deepstack_features, feat, 0); + } + } + + inpL = cur; + } + + // post-layernorm + if (model.post_ln_w) { + inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer); + } + + // multimodal projection + ggml_tensor * embeddings = inpL; + embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size); + + embeddings = build_ffn(embeddings, + model.mm_0_w, model.mm_0_b, + nullptr, nullptr, + model.mm_1_w, model.mm_1_b, + ffn_op_type::FFN_GELU, -1); + + embeddings = ggml_concat(ctx0, embeddings, deepstack_features, 0); // concat along the feature dimension + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; +} diff --git a/tools/mtmd/models/siglip.cpp b/tools/mtmd/models/siglip.cpp new file mode 100644 index 0000000000..191694235d --- /dev/null +++ b/tools/mtmd/models/siglip.cpp @@ -0,0 +1,81 @@ +#include "models.h" + +ggml_cgraph * clip_graph_siglip::build() { + ggml_tensor * inp = build_inp(); + + ggml_tensor * learned_pos_embd = model.position_embeddings; + if (proj_type == PROJECTOR_TYPE_LFM2) { + learned_pos_embd = resize_position_embeddings(); + } + + ggml_tensor * cur = build_vit( + inp, n_patches, + NORM_TYPE_NORMAL, + hparams.ffn_op, + learned_pos_embd, + nullptr); + + if (proj_type == PROJECTOR_TYPE_GEMMA3) { + const int batch_size = 1; + GGML_ASSERT(n_patches_x == n_patches_y); + const int patches_per_image = n_patches_x; + const int kernel_size = hparams.n_merge; + + cur = ggml_transpose(ctx0, cur); + cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size); + + // doing a pool2d to reduce the number of output tokens + cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0); + cur = ggml_reshape_3d(ctx0, cur, cur->ne[0] * cur->ne[0], n_embd, batch_size); + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + // apply norm before projection + cur = ggml_rms_norm(ctx0, cur, eps); + cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w); + + // apply projection + cur = ggml_mul_mat(ctx0, + ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)), + cur); + + } else if (proj_type == PROJECTOR_TYPE_IDEFICS3) { + // pixel_shuffle + // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578 + const int scale_factor = model.hparams.n_merge; + cur = build_patch_merge_permute(cur, scale_factor); + cur = ggml_mul_mat(ctx0, model.fc_w, cur); + + } else if (proj_type == PROJECTOR_TYPE_LFM2) { + // pixel unshuffle block + const int scale_factor = model.hparams.n_merge; + cur = build_patch_merge_permute(cur, scale_factor); + + // projection + cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm + cur = ggml_mul(ctx0, cur, model.mm_input_norm_w); + cur = ggml_add(ctx0, cur, model.mm_input_norm_b); + + cur = build_ffn(cur, + model.mm_1_w, model.mm_1_b, + nullptr, nullptr, + model.mm_2_w, model.mm_2_b, + FFN_GELU, + -1); + + } else if (proj_type == PROJECTOR_TYPE_JANUS_PRO) { + cur = build_ffn(cur, + model.mm_0_w, model.mm_0_b, + nullptr, nullptr, + model.mm_1_w, model.mm_1_b, + hparams.ffn_op, + -1); + + } else { + GGML_ABORT("SigLIP: Unsupported projector type"); + } + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git 
a/tools/mtmd/models/whisper-enc.cpp b/tools/mtmd/models/whisper-enc.cpp new file mode 100644 index 0000000000..07d378b095 --- /dev/null +++ b/tools/mtmd/models/whisper-enc.cpp @@ -0,0 +1,107 @@ +#include "models.h" + +ggml_cgraph * clip_graph_whisper_enc::build() { + const int n_frames = img.nx; + const int n_pos = n_frames / 2; + GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos); + + ggml_tensor * inp = build_inp_raw(1); + + // conv1d block + { + // convolution + gelu + ggml_tensor * cur = ggml_conv_1d_ph(ctx0, model.conv1d_1_w, inp, 1, 1); + cur = ggml_add(ctx0, cur, model.conv1d_1_b); + + cur = ggml_gelu_erf(ctx0, cur); + + cur = ggml_conv_1d_ph(ctx0, model.conv1d_2_w, cur, 2, 1); + cur = ggml_add(ctx0, cur, model.conv1d_2_b); + + cur = ggml_gelu_erf(ctx0, cur); + // transpose + inp = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + cb(inp, "after_conv1d", -1); + } + + // sanity check (only check one layer, but it should be the same for all) + GGML_ASSERT(model.layers[0].ln_1_w && model.layers[0].ln_1_b); + GGML_ASSERT(model.layers[0].ln_2_w && model.layers[0].ln_2_b); + GGML_ASSERT(model.layers[0].q_b); + GGML_ASSERT(model.layers[0].v_b); + GGML_ASSERT(!model.layers[0].k_b); // no bias for k + GGML_ASSERT(model.post_ln_w && model.post_ln_b); + + ggml_tensor * pos_embd_selected = ggml_view_2d( + ctx0, model.position_embeddings, + model.position_embeddings->ne[0], n_pos, + model.position_embeddings->nb[1], 0 + ); + ggml_tensor * cur = build_vit( + inp, n_pos, + NORM_TYPE_NORMAL, + hparams.ffn_op, + pos_embd_selected, + nullptr); + + cb(cur, "after_transformer", -1); + + if (model.audio_has_stack_frames()) { + // StackAudioFrames + // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py + int64_t stride = n_embd * hparams.proj_stack_factor; + int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride); + int64_t pad = padded_len - ggml_nelements(cur); + if (pad > 0) { + cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0); + cur = ggml_pad(ctx0, cur, pad, 0, 0, 0); + } + cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride, + ggml_row_size(cur->type, stride), 0); + cb(cur, "after_stacked", -1); + } + + if (proj_type == PROJECTOR_TYPE_ULTRAVOX) { + // UltravoxProjector + // pre-norm + cur = ggml_rms_norm(ctx0, cur, 1e-6); + cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w); + + // ffn in + cur = ggml_mul_mat(ctx0, model.mm_1_w, cur); + + // swiglu + // see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half + cur = ggml_swiglu_swapped(ctx0, cur); + + // mid-norm + cur = ggml_rms_norm(ctx0, cur, 1e-6); + cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w); + + // ffn out + cur = ggml_mul_mat(ctx0, model.mm_2_w, cur); + + } else if (proj_type == PROJECTOR_TYPE_QWEN2A) { + // projector + cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur); + cur = ggml_add(ctx0, cur, model.mm_fc_b); + + } else if (proj_type == PROJECTOR_TYPE_VOXTRAL) { + // projector + cur = build_ffn(cur, + model.mm_1_w, model.mm_1_b, + nullptr, nullptr, + model.mm_2_w, model.mm_2_b, + FFN_GELU_ERF, + -1); + + } else { + GGML_ABORT("%s: unknown projector type", __func__); + } + + cb(cur, "projected", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/tools/mtmd/mtmd-audio.h b/tools/mtmd/mtmd-audio.h index b7b940affb..0e552347a0 100644 --- a/tools/mtmd/mtmd-audio.h +++ b/tools/mtmd/mtmd-audio.h @@ -6,6 +6,8 @@ #include #include +#define MTMD_INTERNAL_HEADER + #define WHISPER_ASSERT GGML_ASSERT #define WHISPER_SAMPLE_RATE 16000 diff --git 
a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp index f0891bba30..902a4b456d 100644 --- a/tools/mtmd/mtmd-helper.cpp +++ b/tools/mtmd/mtmd-helper.cpp @@ -32,6 +32,10 @@ #define STB_IMAGE_IMPLEMENTATION #include "stb/stb_image.h" +#ifdef MTMD_INTERNAL_HEADER +#error "mtmd-helper is a public library outside of mtmd. it must not include internal headers" +#endif + // // internal logging functions // diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index b3df24c299..9f7e861e92 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -22,6 +22,11 @@ * Issues related to API usage may receive lower priority support. * * For the usage, see an example in mtmd-cli.cpp + * + * For contributors: + * - Make sure the C API is aligned with the libllama C API (as in llama.h) + * - Do not include model name (e.g., qwen, gemma) in the API, use generic terms instead + * - Keep the API minimal, do not expose internal details unless necessary */ #ifdef LLAMA_SHARED diff --git a/tools/mtmd/tests.sh b/tools/mtmd/tests.sh index 20a9359bed..80db07837a 100755 --- a/tools/mtmd/tests.sh +++ b/tools/mtmd/tests.sh @@ -107,6 +107,7 @@ if [ "$RUN_BIG_TESTS" = true ]; then add_test_vision "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M" # add_test_vision "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M" # does not work on my mac M3 Ultra # add_test_vision "ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:Q4_K_M" # not always working + add_test_vision "sabafallah/DeepSeek-OCR-GGUF:q8_0" -p "Free OCR." --chat-template deepseek-ocr add_test_audio "ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF:Q4_K_M" add_test_audio "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M" diff --git a/tools/server/README.md b/tools/server/README.md index d6b9b87dcf..073bcd2ccd 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -54,9 +54,8 @@ For the ful list of features, please refer to [server's changelog](https://githu | `--swa-full` | use full-size SWA cache (default: false)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
(env: LLAMA_ARG_SWA_FULL) | | `--kv-unified, -kvu` | use single unified KV buffer for the KV cache of all sequences (default: false)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)
(env: LLAMA_ARG_KV_UNIFIED) | | `-fa, --flash-attn [on\|off\|auto]` | set Flash Attention use ('on', 'off', or 'auto', default: 'auto')
(env: LLAMA_ARG_FLASH_ATTN) | -| `--no-perf` | disable internal libllama performance timings (default: false)
(env: LLAMA_ARG_NO_PERF) | -| `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) | -| `--no-escape` | do not process escape sequences | +| `--perf, --no-perf` | whether to enable internal libllama performance timings (default: false)
(env: LLAMA_ARG_PERF) | +| `-e, --escape, --no-escape` | whether to process escape sequences (\n, \r, \t, \', \", \\) (default: true) | | `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model
(env: LLAMA_ARG_ROPE_SCALING_TYPE) | | `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N
(env: LLAMA_ARG_ROPE_SCALE) | | `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)
(env: LLAMA_ARG_ROPE_FREQ_BASE) | @@ -66,15 +65,15 @@ For the ful list of features, please refer to [server's changelog](https://githu | `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.0)
(env: LLAMA_ARG_YARN_ATTN_FACTOR) | | `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.0)
(env: LLAMA_ARG_YARN_BETA_SLOW) | | `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)
(env: LLAMA_ARG_YARN_BETA_FAST) | -| `-nkvo, --no-kv-offload` | disable KV offload
(env: LLAMA_ARG_NO_KV_OFFLOAD) | -| `-nr, --no-repack` | disable weight repacking
(env: LLAMA_ARG_NO_REPACK) | -| `--no-host` | bypass host buffer allowing extra buffers to be used
(env: LLAMA_ARG_NO_HOST) | +| `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)
(env: LLAMA_ARG_KV_OFFLOAD) | +| `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)
(env: LLAMA_ARG_REPACK) | +| `--no-host` | bypass host buffer allowing extra buffers to be used
(env: LLAMA_ARG_HOST) | | `-ctk, --cache-type-k TYPE` | KV cache data type for K
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_K) | | `-ctv, --cache-type-v TYPE` | KV cache data type for V
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | | `-np, --parallel N` | number of parallel sequences to decode (default: 1)
(env: LLAMA_ARG_N_PARALLEL) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | -| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock)
(env: LLAMA_ARG_NO_MMAP) | +| `--mmap, --no-mmap` | whether to memory-map the model (if disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggml-org/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | | `-dev, --device ` | comma-separated list of devices to use for offloading (none = don't offload)
use --list-devices to see a list of available devices
(env: LLAMA_ARG_DEVICE) | | `--list-devices` | print list of available devices and exit | @@ -87,7 +86,7 @@ For the ful list of features, please refer to [server's changelog](https://githu | `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)
(env: LLAMA_ARG_MAIN_GPU) | | `--check-tensors` | check model tensor data for invalid values (default: false) | | `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.
types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false | -| `--no-op-offload` | disable offloading host tensor operations to device (default: false) | +| `--op-offload, --no-op-offload` | whether to offload host tensor operations to device (default: true) | | `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) | | `--lora-scaled FNAME SCALE` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) | | `--control-vector FNAME` | add a control vector
note: this argument can be repeated to add multiple control vectors | @@ -157,19 +156,18 @@ For the ful list of features, please refer to [server's changelog](https://githu | -------- | ----------- | | `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | | `--cache-ram, -cram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)
(env: LLAMA_ARG_CACHE_RAM) | -| `--no-context-shift` | disables context shift on infinite text generation (default: enabled)
(env: LLAMA_ARG_NO_CONTEXT_SHIFT) | -| `--context-shift` | enables context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_CONTEXT_SHIFT) | +| `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_CONTEXT_SHIFT) | | `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode
| | `-sp, --special` | special tokens output enabled (default: false) | -| `--no-warmup` | skip warming up the model with an empty run | +| `--warmup, --no-warmup` | whether to perform warmup with an empty run (default: enabled) | | `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) | | `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified
(env: LLAMA_ARG_POOLING) | -| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)
(env: LLAMA_ARG_CONT_BATCHING) | -| `-nocb, --no-cont-batching` | disable continuous batching
(env: LLAMA_ARG_NO_CONT_BATCHING) | -| `--mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md
note: if -hf is used, this argument can be omitted
(env: LLAMA_ARG_MMPROJ) | -| `--mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md
(env: LLAMA_ARG_MMPROJ_URL) | -| `--no-mmproj` | explicitly disable multimodal projector, useful when using -hf
(env: LLAMA_ARG_NO_MMPROJ) | -| `--no-mmproj-offload` | do not offload multimodal projector to GPU
(env: LLAMA_ARG_NO_MMPROJ_OFFLOAD) | +| `-cb, --cont-batching, -nocb, --no-cont-batching` | whether to enable continuous batching (a.k.a dynamic batching) (default: enabled)
(env: LLAMA_ARG_CONT_BATCHING) | +| `-mm, --mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md
note: if -hf is used, this argument can be omitted
(env: LLAMA_ARG_MMPROJ) | +| `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md
(env: LLAMA_ARG_MMPROJ_URL) | +| `--mmproj-auto, --no-mmproj, --no-mmproj-auto` | whether to use a multimodal projector file (if available), useful when using -hf (default: enabled)
(env: LLAMA_ARG_MMPROJ_AUTO) | +| `--mmproj-offload, --no-mmproj-offload` | whether to enable GPU offloading for multimodal projector (default: enabled)
(env: LLAMA_ARG_MMPROJ_OFFLOAD) | | `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)
(env: LLAMA_ARG_IMAGE_MIN_TOKENS) | | `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)
(env: LLAMA_ARG_IMAGE_MAX_TOKENS) | | `--override-tensor-draft, -otd =,...` | override tensor buffer type for draft model | @@ -180,7 +178,7 @@ For the ful list of features, please refer to [server's changelog](https://githu | `--port PORT` | port to listen (default: 8080)
(env: LLAMA_ARG_PORT) | | `--path PATH` | path to serve static files from (default: )
(env: LLAMA_ARG_STATIC_PATH) | | `--api-prefix PREFIX` | prefix path the server serves from, without the trailing slash (default: )
(env: LLAMA_ARG_API_PREFIX) | -| `--no-webui` | Disable the Web UI (default: enabled)
(env: LLAMA_ARG_NO_WEBUI) | +| `--webui, --no-webui` | whether to enable the Web UI (default: enabled)
(env: LLAMA_ARG_WEBUI) | | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
(env: LLAMA_ARG_EMBEDDINGS) | | `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)
(env: LLAMA_ARG_RERANKING) | | `--api-key KEY` | API key to use for authentication (default: none)
(env: LLAMA_API_KEY) | @@ -193,20 +191,19 @@ For the ful list of features, please refer to [server's changelog](https://githu | `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)
[(card)](https://ggml.ai/f0.png)
(env: LLAMA_ARG_CACHE_REUSE) | | `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)
(env: LLAMA_ARG_ENDPOINT_METRICS) | | `--props` | enable changing global properties via POST /props (default: disabled)
(env: LLAMA_ARG_ENDPOINT_PROPS) | -| `--slots` | enable slots monitoring endpoint (default: enabled)
(env: LLAMA_ARG_ENDPOINT_SLOTS) | -| `--no-slots` | disables slots monitoring endpoint
(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) | +| `--slots, --no-slots` | expose slots monitoring endpoint (default: enabled)
(env: LLAMA_ARG_ENDPOINT_SLOTS) | | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) | +| `--media-path PATH` | directory for loading local media files; files can be accessed via file:// URLs using relative paths (default: disabled) | | `--models-dir PATH` | directory containing models for the router server (default: disabled)
(env: LLAMA_ARG_MODELS_DIR) | +| `--models-preset PATH` | path to INI file containing model presets for the router server (default: disabled)
(env: LLAMA_ARG_MODELS_PRESET) | | `--models-max N` | for router server, maximum number of models to load simultaneously (default: 4, 0 = unlimited)
(env: LLAMA_ARG_MODELS_MAX) | -| `--models-allow-extra-args` | for router server, allow extra arguments for models; important: some arguments can allow users to access local file system, use with caution (default: disabled)
(env: LLAMA_ARG_MODELS_ALLOW_EXTRA_ARGS) | -| `--no-models-autoload` | disables automatic loading of models (default: enabled)
(env: LLAMA_ARG_NO_MODELS_AUTOLOAD) | -| `--jinja` | use jinja template for chat (default: enabled)

(env: LLAMA_ARG_JINJA) | -| `--no-jinja` | disable jinja template for chat (default: enabled)

(env: LLAMA_ARG_NO_JINJA) | +| `--models-autoload, --no-models-autoload` | for router server, whether to automatically load models (default: enabled)
(env: LLAMA_ARG_MODELS_AUTOLOAD) | +| `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)
(env: LLAMA_ARG_JINJA) | | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `` tags in `message.content` while also populating `message.reasoning_content`
(default: auto)
(env: LLAMA_ARG_THINK) | | `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | -| `--no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)
when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled

(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) | +| `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)
when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled

(env: LLAMA_ARG_PREFILL_ASSISTANT) | | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled)
| | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) | | `-td, --threads-draft N` | number of threads to use during generation (default: same as --threads) | @@ -236,6 +233,11 @@ For the ful list of features, please refer to [server's changelog](https://githu Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var. +For boolean options like `--mmap` or `--kv-offload`, the environment variable is handled as shown in this example: +- `LLAMA_ARG_MMAP=true` means enabled, other accepted values are: `1`, `on`, `enabled` +- `LLAMA_ARG_MMAP=false` means disabled, other accepted values are: `0`, `off`, `disabled` +- If `LLAMA_ARG_NO_MMAP` is present (no matter the value), it means disabling mmap + Example usage of docker compose with environment variables: ```yml diff --git a/tools/server/public/index.html.gz b/tools/server/public/index.html.gz index 2db04e9522..3fd631b77a 100644 Binary files a/tools/server/public/index.html.gz and b/tools/server/public/index.html.gz differ diff --git a/tools/server/webui/package-lock.json b/tools/server/webui/package-lock.json index 9c1c2499cf..4f37b308b1 100644 --- a/tools/server/webui/package-lock.json +++ b/tools/server/webui/package-lock.json @@ -41,7 +41,7 @@ "@tailwindcss/vite": "^4.0.0", "@types/node": "^22", "@vitest/browser": "^3.2.3", - "bits-ui": "^2.8.11", + "bits-ui": "^2.14.4", "clsx": "^2.1.1", "dexie": "^4.0.11", "eslint": "^9.18.0", @@ -3343,17 +3343,17 @@ } }, "node_modules/bits-ui": { - "version": "2.8.11", - "resolved": "https://registry.npmjs.org/bits-ui/-/bits-ui-2.8.11.tgz", - "integrity": "sha512-lKN9rAk69my6j7H1D4B87r8LrHuEtfEsf1xCixBj9yViql2BdI3f04HyyyT7T1GOCpgb9+8b0B+nm3LN81Konw==", + "version": "2.14.4", + "resolved": "https://registry.npmjs.org/bits-ui/-/bits-ui-2.14.4.tgz", + "integrity": "sha512-W6kenhnbd/YVvur+DKkaVJ6GldE53eLewur5AhUCqslYQ0vjZr8eWlOfwZnMiPB+PF5HMVqf61vXBvmyrAmPWg==", "dev": true, "license": "MIT", "dependencies": { "@floating-ui/core": "^1.7.1", "@floating-ui/dom": "^1.7.1", "esm-env": "^1.1.2", - "runed": "^0.29.1", - "svelte-toolbelt": "^0.9.3", + "runed": "^0.35.1", + "svelte-toolbelt": "^0.10.6", "tabbable": "^6.2.0" }, "engines": { @@ -3368,9 +3368,9 @@ } }, "node_modules/bits-ui/node_modules/runed": { - "version": "0.29.2", - "resolved": "https://registry.npmjs.org/runed/-/runed-0.29.2.tgz", - "integrity": "sha512-0cq6cA6sYGZwl/FvVqjx9YN+1xEBu9sDDyuWdDW1yWX7JF2wmvmVKfH+hVCZs+csW+P3ARH92MjI3H9QTagOQA==", + "version": "0.35.1", + "resolved": "https://registry.npmjs.org/runed/-/runed-0.35.1.tgz", + "integrity": "sha512-2F4Q/FZzbeJTFdIS/PuOoPRSm92sA2LhzTnv6FXhCoENb3huf5+fDuNOg1LNvGOouy3u/225qxmuJvcV3IZK5Q==", "dev": true, "funding": [ "https://github.com/sponsors/huntabyte", @@ -3378,23 +3378,31 @@ ], "license": "MIT", "dependencies": { - "esm-env": "^1.0.0" + "dequal": "^2.0.3", + "esm-env": "^1.0.0", + "lz-string": "^1.5.0" }, "peerDependencies": { + "@sveltejs/kit": "^2.21.0", "svelte": "^5.7.0" + }, + "peerDependenciesMeta": { + "@sveltejs/kit": { + "optional": true + } } }, "node_modules/bits-ui/node_modules/svelte-toolbelt": { - "version": "0.9.3", - "resolved": "https://registry.npmjs.org/svelte-toolbelt/-/svelte-toolbelt-0.9.3.tgz", - "integrity": "sha512-HCSWxCtVmv+c6g1ACb8LTwHVbDqLKJvHpo6J8TaqwUme2hj9ATJCpjCPNISR1OCq2Q4U1KT41if9ON0isINQZw==", + "version": "0.10.6", + "resolved": 
"https://registry.npmjs.org/svelte-toolbelt/-/svelte-toolbelt-0.10.6.tgz", + "integrity": "sha512-YWuX+RE+CnWYx09yseAe4ZVMM7e7GRFZM6OYWpBKOb++s+SQ8RBIMMe+Bs/CznBMc0QPLjr+vDBxTAkozXsFXQ==", "dev": true, "funding": [ "https://github.com/sponsors/huntabyte" ], "dependencies": { "clsx": "^2.1.1", - "runed": "^0.29.0", + "runed": "^0.35.1", "style-to-object": "^1.0.8" }, "engines": { diff --git a/tools/server/webui/package.json b/tools/server/webui/package.json index 987a7239ed..c20ab3cfde 100644 --- a/tools/server/webui/package.json +++ b/tools/server/webui/package.json @@ -43,7 +43,7 @@ "@tailwindcss/vite": "^4.0.0", "@types/node": "^22", "@vitest/browser": "^3.2.3", - "bits-ui": "^2.8.11", + "bits-ui": "^2.14.4", "clsx": "^2.1.1", "dexie": "^4.0.11", "eslint": "^9.18.0", diff --git a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte b/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte index 7f8e38286d..78cc1c47da 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte +++ b/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte @@ -331,6 +331,7 @@ class="{INPUT_CLASSES} border-radius-bottom-none mx-auto max-w-[48rem] overflow-hidden rounded-3xl backdrop-blur-md {disabled ? 'cursor-not-allowed opacity-60' : ''} {className}" + data-slot="chat-form" > - import { Input } from '$lib/components/ui/input'; - import { Search } from '@lucide/svelte'; + import { SearchInput } from '$lib/components/app'; interface Props { value?: string; @@ -15,19 +14,6 @@ onInput, class: className }: Props = $props(); - - function handleInput(event: Event) { - const target = event.target as HTMLInputElement; - - value = target.value; - onInput?.(target.value); - } -
- - - -
+ diff --git a/tools/server/webui/src/lib/components/app/index.ts b/tools/server/webui/src/lib/components/app/index.ts index 87b24598b7..8631d4fb3b 100644 --- a/tools/server/webui/src/lib/components/app/index.ts +++ b/tools/server/webui/src/lib/components/app/index.ts @@ -64,6 +64,7 @@ export { default as CopyToClipboardIcon } from './misc/CopyToClipboardIcon.svelt export { default as KeyboardShortcutInfo } from './misc/KeyboardShortcutInfo.svelte'; export { default as MarkdownContent } from './misc/MarkdownContent.svelte'; export { default as RemoveButton } from './misc/RemoveButton.svelte'; +export { default as SearchInput } from './misc/SearchInput.svelte'; export { default as SyntaxHighlightedCode } from './misc/SyntaxHighlightedCode.svelte'; export { default as ModelsSelector } from './models/ModelsSelector.svelte'; diff --git a/tools/server/webui/src/lib/components/app/misc/SearchInput.svelte b/tools/server/webui/src/lib/components/app/misc/SearchInput.svelte new file mode 100644 index 0000000000..15cd6abaa9 --- /dev/null +++ b/tools/server/webui/src/lib/components/app/misc/SearchInput.svelte @@ -0,0 +1,73 @@ + + +
+ + + + + {#if showClearButton} + + {/if} +
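As a rough illustration of what the shared `SearchInput` component encapsulates — a two-way `value` binding, an optional `onInput` callback, and a clear button that only appears while there is text — here is a minimal TypeScript sketch. Only the props (`value`, `placeholder`, `onInput`, `class`) and the input-handler behaviour are taken from the diff; the helper names are hypothetical:

```ts
// Hypothetical sketch of the shared SearchInput logic (illustrative, not the shipped component).
// Props mirror what ChatSidebarSearch passes: value, placeholder, onInput, class.
interface SearchInputProps {
	value?: string;
	placeholder?: string;
	onInput?: (value: string) => void;
	class?: string;
}

// The clear button is only rendered while there is text to clear.
function shouldShowClearButton(value: string | undefined): boolean {
	return (value ?? '').length > 0;
}

// Keep the bound value in sync and notify the parent on every input event.
function handleInput(props: SearchInputProps, event: Event): string {
	const target = event.target as HTMLInputElement;
	props.onInput?.(target.value);
	return target.value; // new value for the `value` binding
}

// Clearing resets the bound value and notifies the parent with an empty string.
function handleClear(props: SearchInputProps): string {
	props.onInput?.('');
	return '';
}
```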
diff --git a/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte b/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte index c4331e92f1..ac0937696d 100644 --- a/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte +++ b/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte @@ -2,8 +2,8 @@ import { onMount, tick } from 'svelte'; import { ChevronDown, EyeOff, Loader2, MicOff, Package, Power } from '@lucide/svelte'; import * as Tooltip from '$lib/components/ui/tooltip'; + import * as Popover from '$lib/components/ui/popover'; import { cn } from '$lib/components/ui/utils'; - import { portalToBody } from '$lib/utils'; import { modelsStore, modelOptions, @@ -17,12 +17,8 @@ import { usedModalities, conversationsStore } from '$lib/stores/conversations.svelte'; import { ServerModelStatus } from '$lib/enums'; import { isRouterMode } from '$lib/stores/server.svelte'; - import { DialogModelInformation } from '$lib/components/app'; - import { - MENU_MAX_WIDTH, - MENU_OFFSET, - VIEWPORT_GUTTER - } from '$lib/constants/floating-ui-constraints'; + import { DialogModelInformation, SearchInput } from '$lib/components/app'; + import type { ModelOption } from '$lib/types/models'; interface Props { class?: string; @@ -145,185 +141,126 @@ return options.some((option) => option.model === currentModel); }); - let isOpen = $state(false); - let showModelDialog = $state(false); - let container: HTMLDivElement | null = null; - let menuRef = $state(null); - let triggerButton = $state(null); - let menuPosition = $state<{ - top: number; - left: number; - width: number; - placement: 'top' | 'bottom'; - maxHeight: number; - } | null>(null); + let searchTerm = $state(''); + let searchInputRef = $state(null); + let highlightedIndex = $state(-1); - onMount(async () => { - try { - await modelsStore.fetch(); - } catch (error) { - console.error('Unable to load models:', error); - } + let filteredOptions: ModelOption[] = $derived( + (() => { + const term = searchTerm.trim().toLowerCase(); + if (!term) return options; + + return options.filter( + (option) => + option.model.toLowerCase().includes(term) || option.name?.toLowerCase().includes(term) + ); + })() + ); + + // Get indices of compatible options for keyboard navigation + let compatibleIndices = $derived( + filteredOptions + .map((option, index) => (isModelCompatible(option) ? 
index : -1)) + .filter((i) => i !== -1) + ); + + // Reset highlighted index when search term changes + $effect(() => { + void searchTerm; + highlightedIndex = -1; }); - function toggleOpen() { + let isOpen = $state(false); + let showModelDialog = $state(false); + + onMount(() => { + modelsStore.fetch().catch((error) => { + console.error('Unable to load models:', error); + }); + }); + + function handleOpenChange(open: boolean) { if (loading || updating) return; - if (isRouter) { - // Router mode: show dropdown - if (isOpen) { - closeMenu(); - } else { - openMenu(); + if (open) { + isOpen = true; + searchTerm = ''; + highlightedIndex = -1; + + // Focus search input after popover opens + tick().then(() => { + requestAnimationFrame(() => searchInputRef?.focus()); + }); + + if (isRouter) { + modelsStore.fetchRouterModels().then(() => { + modelsStore.fetchModalitiesForLoadedModels(); + }); } } else { - // Single model mode: show dialog - showModelDialog = true; + isOpen = false; + searchTerm = ''; + highlightedIndex = -1; } } - async function openMenu() { + function handleTriggerClick() { if (loading || updating) return; - isOpen = true; - await tick(); - updateMenuPosition(); - requestAnimationFrame(() => updateMenuPosition()); - - if (isRouter) { - modelsStore.fetchRouterModels().then(() => { - modelsStore.fetchModalitiesForLoadedModels(); - }); + if (!isRouter) { + // Single model mode: show dialog instead of popover + showModelDialog = true; } + // For router mode, the Popover handles open/close } export function open() { if (isRouter) { - openMenu(); + handleOpenChange(true); } else { showModelDialog = true; } } function closeMenu() { - if (!isOpen) return; - - isOpen = false; - menuPosition = null; + handleOpenChange(false); } - function handlePointerDown(event: PointerEvent) { - if (!container) return; + function handleSearchKeyDown(event: KeyboardEvent) { + if (event.isComposing) return; - const target = event.target as Node | null; + if (event.key === 'ArrowDown') { + event.preventDefault(); + if (compatibleIndices.length === 0) return; - if (target && !container.contains(target) && !(menuRef && menuRef.contains(target))) { - closeMenu(); - } - } - - function handleKeydown(event: KeyboardEvent) { - if (event.key === 'Escape') { - closeMenu(); - } - } - - function handleResize() { - if (isOpen) { - updateMenuPosition(); - } - } - - function updateMenuPosition() { - if (!isOpen || !triggerButton || !menuRef) return; - - const triggerRect = triggerButton.getBoundingClientRect(); - const viewportWidth = window.innerWidth; - const viewportHeight = window.innerHeight; - - if (viewportWidth === 0 || viewportHeight === 0) return; - - const scrollWidth = menuRef.scrollWidth; - const scrollHeight = menuRef.scrollHeight; - - const availableWidth = Math.max(0, viewportWidth - VIEWPORT_GUTTER * 2); - const constrainedMaxWidth = Math.min(MENU_MAX_WIDTH, availableWidth || MENU_MAX_WIDTH); - const safeMaxWidth = - constrainedMaxWidth > 0 ? 
constrainedMaxWidth : Math.min(MENU_MAX_WIDTH, viewportWidth); - const desiredMinWidth = Math.min(160, safeMaxWidth || 160); - - let width = Math.min( - Math.max(triggerRect.width, scrollWidth, desiredMinWidth), - safeMaxWidth || 320 - ); - - const availableBelow = Math.max( - 0, - viewportHeight - VIEWPORT_GUTTER - triggerRect.bottom - MENU_OFFSET - ); - const availableAbove = Math.max(0, triggerRect.top - VIEWPORT_GUTTER - MENU_OFFSET); - const viewportAllowance = Math.max(0, viewportHeight - VIEWPORT_GUTTER * 2); - const fallbackAllowance = Math.max(1, viewportAllowance > 0 ? viewportAllowance : scrollHeight); - - function computePlacement(placement: 'top' | 'bottom') { - const available = placement === 'bottom' ? availableBelow : availableAbove; - const allowedHeight = - available > 0 ? Math.min(available, fallbackAllowance) : fallbackAllowance; - const maxHeight = Math.min(scrollHeight, allowedHeight); - const height = Math.max(0, maxHeight); - - let top: number; - if (placement === 'bottom') { - const rawTop = triggerRect.bottom + MENU_OFFSET; - const minTop = VIEWPORT_GUTTER; - const maxTop = viewportHeight - VIEWPORT_GUTTER - height; - if (maxTop < minTop) { - top = minTop; - } else { - top = Math.min(Math.max(rawTop, minTop), maxTop); - } + const currentPos = compatibleIndices.indexOf(highlightedIndex); + if (currentPos === -1 || currentPos === compatibleIndices.length - 1) { + highlightedIndex = compatibleIndices[0]; } else { - const rawTop = triggerRect.top - MENU_OFFSET - height; - const minTop = VIEWPORT_GUTTER; - const maxTop = viewportHeight - VIEWPORT_GUTTER - height; - if (maxTop < minTop) { - top = minTop; - } else { - top = Math.max(Math.min(rawTop, maxTop), minTop); + highlightedIndex = compatibleIndices[currentPos + 1]; + } + } else if (event.key === 'ArrowUp') { + event.preventDefault(); + if (compatibleIndices.length === 0) return; + + const currentPos = compatibleIndices.indexOf(highlightedIndex); + if (currentPos === -1 || currentPos === 0) { + highlightedIndex = compatibleIndices[compatibleIndices.length - 1]; + } else { + highlightedIndex = compatibleIndices[currentPos - 1]; + } + } else if (event.key === 'Enter') { + event.preventDefault(); + if (highlightedIndex >= 0 && highlightedIndex < filteredOptions.length) { + const option = filteredOptions[highlightedIndex]; + if (isModelCompatible(option)) { + handleSelect(option.id); } - } - - return { placement, top, height, maxHeight }; - } - - const belowMetrics = computePlacement('bottom'); - const aboveMetrics = computePlacement('top'); - - let metrics = belowMetrics; - if (scrollHeight > belowMetrics.maxHeight && aboveMetrics.maxHeight > belowMetrics.maxHeight) { - metrics = aboveMetrics; - } - - let left = triggerRect.right - width; - const maxLeft = viewportWidth - VIEWPORT_GUTTER - width; - if (maxLeft < VIEWPORT_GUTTER) { - left = VIEWPORT_GUTTER; - } else { - if (left > maxLeft) { - left = maxLeft; - } - if (left < VIEWPORT_GUTTER) { - left = VIEWPORT_GUTTER; + } else if (compatibleIndices.length > 0) { + // No selection - highlight first compatible option + highlightedIndex = compatibleIndices[0]; } } - - menuPosition = { - top: Math.round(metrics.top), - left: Math.round(left), - width: Math.round(width), - placement: metrics.placement, - maxHeight: Math.round(metrics.maxHeight) - }; } async function handleSelect(modelId: string) { @@ -356,6 +293,14 @@ if (shouldCloseMenu) { closeMenu(); + + // Focus the chat textarea after model selection + requestAnimationFrame(() => { + const textarea = 
document.querySelector( + '[data-slot="chat-form"] textarea' + ); + textarea?.focus(); + }); } } @@ -404,10 +349,7 @@ } - - - -
+
{#if loading && options.length === 0 && isRouter}
@@ -418,9 +360,8 @@ {:else} {@const selectedOption = getDisplayOption()} -
- + - {#if isOpen && isRouter} -
+ +
0 - ? `${menuPosition.maxHeight}px` - : undefined} + class="order-1 shrink-0 border-b p-4 group-data-[side=top]/popover-content:order-2 group-data-[side=top]/popover-content:border-t group-data-[side=top]/popover-content:border-b-0" + > + +
+
{#if !isCurrentModelInCache() && currentModel}
- {/if} -
+
+ {/if}
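The model selector now delegates positioning to the Popover primitives and drives keyboard navigation purely through `compatibleIndices`. A minimal sketch of that wrap-around cycling, factored out as a pure helper (the function name is an assumption; the behaviour mirrors `handleSearchKeyDown` in the diff above):

```ts
// Sketch of the ArrowDown/ArrowUp cycling used by the model selector:
// navigation wraps around and only ever lands on indices of compatible options.
function nextHighlightedIndex(
	compatibleIndices: number[], // indices into filteredOptions that are selectable
	highlightedIndex: number,    // current highlight, -1 when nothing is highlighted
	key: 'ArrowDown' | 'ArrowUp'
): number {
	if (compatibleIndices.length === 0) return -1;

	const pos = compatibleIndices.indexOf(highlightedIndex);

	if (key === 'ArrowDown') {
		// wrap to the first compatible option from the end (or when nothing is highlighted)
		if (pos === -1 || pos === compatibleIndices.length - 1) return compatibleIndices[0];
		return compatibleIndices[pos + 1];
	}

	// ArrowUp: wrap to the last compatible option from the start
	if (pos === -1 || pos === 0) return compatibleIndices[compatibleIndices.length - 1];
	return compatibleIndices[pos - 1];
}

// Example: with compatible indices [0, 2, 5] and nothing highlighted,
// ArrowDown highlights 0, then 2, then 5, then wraps back to 0.
```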
diff --git a/tools/server/webui/src/lib/components/ui/popover/index.ts b/tools/server/webui/src/lib/components/ui/popover/index.ts new file mode 100644 index 0000000000..c5937fb3a0 --- /dev/null +++ b/tools/server/webui/src/lib/components/ui/popover/index.ts @@ -0,0 +1,19 @@ +import Root from './popover.svelte'; +import Close from './popover-close.svelte'; +import Content from './popover-content.svelte'; +import Trigger from './popover-trigger.svelte'; +import Portal from './popover-portal.svelte'; + +export { + Root, + Content, + Trigger, + Close, + Portal, + // + Root as Popover, + Content as PopoverContent, + Trigger as PopoverTrigger, + Close as PopoverClose, + Portal as PopoverPortal +}; diff --git a/tools/server/webui/src/lib/components/ui/popover/popover-close.svelte b/tools/server/webui/src/lib/components/ui/popover/popover-close.svelte new file mode 100644 index 0000000000..dc4dec4b33 --- /dev/null +++ b/tools/server/webui/src/lib/components/ui/popover/popover-close.svelte @@ -0,0 +1,7 @@ + + + diff --git a/tools/server/webui/src/lib/components/ui/popover/popover-content.svelte b/tools/server/webui/src/lib/components/ui/popover/popover-content.svelte new file mode 100644 index 0000000000..2d3513d347 --- /dev/null +++ b/tools/server/webui/src/lib/components/ui/popover/popover-content.svelte @@ -0,0 +1,37 @@ + + + + + diff --git a/tools/server/webui/src/lib/components/ui/popover/popover-portal.svelte b/tools/server/webui/src/lib/components/ui/popover/popover-portal.svelte new file mode 100644 index 0000000000..25efb877b7 --- /dev/null +++ b/tools/server/webui/src/lib/components/ui/popover/popover-portal.svelte @@ -0,0 +1,7 @@ + + + diff --git a/tools/server/webui/src/lib/components/ui/popover/popover-trigger.svelte b/tools/server/webui/src/lib/components/ui/popover/popover-trigger.svelte new file mode 100644 index 0000000000..5ef3d0e932 --- /dev/null +++ b/tools/server/webui/src/lib/components/ui/popover/popover-trigger.svelte @@ -0,0 +1,17 @@ + + + diff --git a/tools/server/webui/src/lib/components/ui/popover/popover.svelte b/tools/server/webui/src/lib/components/ui/popover/popover.svelte new file mode 100644 index 0000000000..f39b867a69 --- /dev/null +++ b/tools/server/webui/src/lib/components/ui/popover/popover.svelte @@ -0,0 +1,7 @@ + + + diff --git a/tools/server/webui/src/lib/constants/floating-ui-constraints.ts b/tools/server/webui/src/lib/constants/floating-ui-constraints.ts index c95d3f1841..003fc77acb 100644 --- a/tools/server/webui/src/lib/constants/floating-ui-constraints.ts +++ b/tools/server/webui/src/lib/constants/floating-ui-constraints.ts @@ -1,3 +1,2 @@ export const VIEWPORT_GUTTER = 8; export const MENU_OFFSET = 6; -export const MENU_MAX_WIDTH = 320; diff --git a/tools/server/webui/src/lib/stores/models.svelte.ts b/tools/server/webui/src/lib/stores/models.svelte.ts index 29416c2fe5..34b26403e4 100644 --- a/tools/server/webui/src/lib/stores/models.svelte.ts +++ b/tools/server/webui/src/lib/stores/models.svelte.ts @@ -295,14 +295,21 @@ class ModelsStore { * Fetch props for a specific model from /props endpoint * Uses caching to avoid redundant requests * + * In ROUTER mode, this will only fetch props if the model is loaded, + * since unloaded models return 400 from /props endpoint. 
+ * * @param modelId - Model identifier to fetch props for - * @returns Props data or null if fetch failed + * @returns Props data or null if fetch failed or model not loaded */ async fetchModelProps(modelId: string): Promise { // Return cached props if available const cached = this.modelPropsCache.get(modelId); if (cached) return cached; + if (serverStore.isRouterMode && !this.isModelLoaded(modelId)) { + return null; + } + // Avoid duplicate fetches if (this.modelPropsFetching.has(modelId)) return null; diff --git a/tools/server/webui/src/lib/utils/latex-protection.test.ts b/tools/server/webui/src/lib/utils/latex-protection.test.ts index 2354f8fa0e..40fe1b0db2 100644 --- a/tools/server/webui/src/lib/utils/latex-protection.test.ts +++ b/tools/server/webui/src/lib/utils/latex-protection.test.ts @@ -303,6 +303,27 @@ $$\n\\pi_n(\\mathbb{S}^3) = \\begin{cases} expect(output).toBe(input); // Code blocks prevent misinterpretation }); + test('preserves backslash parentheses in code blocks (GitHub issue)', () => { + const input = '```python\nfoo = "\\(bar\\)"\n```'; + const output = preprocessLaTeX(input); + + expect(output).toBe(input); // Code blocks should not have LaTeX conversion applied + }); + + test('preserves backslash brackets in code blocks', () => { + const input = '```python\nfoo = "\\[bar\\]"\n```'; + const output = preprocessLaTeX(input); + + expect(output).toBe(input); // Code blocks should not have LaTeX conversion applied + }); + + test('preserves backslash parentheses in inline code', () => { + const input = 'Use `foo = "\\(bar\\)"` in your code.'; + const output = preprocessLaTeX(input); + + expect(output).toBe(input); + }); + test('escape backslash in mchem ce', () => { const input = 'mchem ce:\n$\\ce{2H2(g) + O2(g) -> 2H2O(l)}$'; const output = preprocessLaTeX(input); diff --git a/tools/server/webui/src/lib/utils/latex-protection.ts b/tools/server/webui/src/lib/utils/latex-protection.ts index 7f5cf2cddf..cafa2d4761 100644 --- a/tools/server/webui/src/lib/utils/latex-protection.ts +++ b/tools/server/webui/src/lib/utils/latex-protection.ts @@ -226,19 +226,16 @@ export function preprocessLaTeX(content: string): string { return expr; }); - // Step 5: Restore code blocks - content = content.replace(/<>/g, (_, index) => { - return codeBlocks[parseInt(index)]; - }); - - // Step 6: Apply additional escaping functions (brackets and mhchem) + // Step 5: Apply additional escaping functions (brackets and mhchem) + // This must happen BEFORE restoring code blocks to avoid affecting code content content = escapeBrackets(content); if (doEscapeMhchem && (content.includes('\\ce{') || content.includes('\\pu{'))) { content = escapeMhchem(content); } - // Final pass: Convert \(...\) → $...$, \[...\] → $$...$$ + // Step 6: Convert remaining \(...\) → $...$, \[...\] → $$...$$ + // This must happen BEFORE restoring code blocks to avoid affecting code content content = content // Using the look‑behind pattern `(? 
{ - return `${prefix}$$${content}$$`; + (_, content: string) => { + return `$$${content}$$`; } ); - // Step 7: Restore blockquote markers + // Step 7: Restore code blocks + // This happens AFTER all LaTeX conversions to preserve code content + content = content.replace(/<>/g, (_, index) => { + return codeBlocks[parseInt(index)]; + }); + + // Step 8: Restore blockquote markers if (blockquoteMarkers.size > 0) { const finalLines = content.split('\n'); const restoredLines = finalLines.map((line, index) => { diff --git a/vendor/cpp-httplib/CMakeLists.txt b/vendor/cpp-httplib/CMakeLists.txt index 369502d7ae..e90e8e2d1b 100644 --- a/vendor/cpp-httplib/CMakeLists.txt +++ b/vendor/cpp-httplib/CMakeLists.txt @@ -9,6 +9,10 @@ if (NOT MSVC) endif() target_link_libraries (${TARGET} PRIVATE Threads::Threads) + +if (WIN32 AND NOT MSVC) + target_link_libraries(${TARGET} PUBLIC ws2_32) +endif() target_compile_features(${TARGET} PRIVATE cxx_std_17) target_compile_definitions(${TARGET} PRIVATE