Merge branch 'master' into compilade/mamba2

commit a42f239418
@@ -49,19 +49,23 @@ COPY --from=build /app/full /app
 
 WORKDIR /app
 
-RUN apt-get update \
-    && apt-get install -y \
+RUN apt-get update && \
+    apt-get install -y \
     git \
     python3 \
    python3-pip \
-    && pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
+    python3-venv && \
+    python3 -m venv /opt/venv && \
+    . /opt/venv/bin/activate && \
+    pip install --upgrade pip setuptools wheel && \
+    pip install -r requirements.txt && \
+    apt autoremove -y && \
+    apt clean -y && \
+    rm -rf /tmp/* /var/tmp/* && \
+    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
+    find /var/cache -type f -delete
 
+ENV PATH="/opt/venv/bin:$PATH"
 
 ENTRYPOINT ["/app/tools.sh"]
@@ -693,7 +693,7 @@ jobs:
 - build: 'openblas-x64'
   defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
 - build: 'vulkan-x64'
-  defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
+  defines: '-DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
 - build: 'llvm-arm64'
   defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
 - build: 'llvm-arm64-opencl-adreno'

@@ -778,6 +778,7 @@ jobs:
 cmake -S . -B build ${{ matrix.defines }} `
   -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
 cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
+cp $env:CURL_PATH/bin/libcurl-*.dll build/bin/Release
 
 - name: Add libopenblas.dll
   id: add_libopenblas_dll
@@ -89,6 +89,14 @@ option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
 
+if (NOT DEFINED LLAMA_BUILD_NUMBER)
+    set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
+endif()
+if (NOT DEFINED LLAMA_BUILD_COMMIT)
+    set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
+endif()
+set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
+
 # override ggml options
 set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
 set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})

@@ -155,6 +163,8 @@ if (LLAMA_USE_SYSTEM_GGML)
 endif()
 
 if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
+    set(GGML_BUILD_NUMBER ${LLAMA_BUILD_NUMBER})
+    set(GGML_BUILD_COMMIT ${LLAMA_BUILD_COMMIT})
     add_subdirectory(ggml)
     # ... otherwise assume ggml is added by a parent CMakeLists.txt
 endif()

@@ -204,10 +214,6 @@ endif()
 include(GNUInstallDirs)
 include(CMakePackageConfigHelpers)
 
-set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
-set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
-set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
-
 set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
 set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
 set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
@@ -6,7 +6,7 @@
 [](https://github.com/ggml-org/llama.cpp/releases)
 [](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)
 
-[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
+[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
 
 Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++

@@ -18,7 +18,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 ## Hot topics
 
 - 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
-- **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9)
 - A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated
 - VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
 - Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
@@ -39,7 +39,7 @@ sd=`dirname $0`
 cd $sd/../
 SRC=`pwd`
 
-CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=OFF"
+CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON"
 
 if [ ! -z ${GG_BUILD_METAL} ]; then
     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
@@ -7,8 +7,8 @@ llama_add_compile_flags()
 # Build info header
 #
 
-if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
-    set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
+if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
+    set(GIT_DIR "${PROJECT_SOURCE_DIR}/.git")
 
     # Is git submodule
     if(NOT IS_DIRECTORY "${GIT_DIR}")

@@ -18,36 +18,26 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
         if (SLASH_POS EQUAL 0)
             set(GIT_DIR "${REAL_GIT_DIR}")
         else()
-            set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
+            set(GIT_DIR "${PROJECT_SOURCE_DIR}/${REAL_GIT_DIR}")
         endif()
     endif()
 
     if(EXISTS "${GIT_DIR}/index")
-        set(GIT_INDEX "${GIT_DIR}/index")
+        # For build-info.cpp below
+        set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${GIT_DIR}/index")
     else()
         message(WARNING "Git index not found in git repository.")
-        set(GIT_INDEX "")
     endif()
 else()
     message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
-    set(GIT_INDEX "")
 endif()
 
-# Add a custom command to rebuild build-info.cpp when .git/index changes
-add_custom_command(
-    OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp"
-    COMMENT "Generating build details from Git"
-    COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
-            -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
-            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-            -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME} -DCMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}
-            -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
-    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
-    DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
-    VERBATIM
-)
+set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in")
+set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/build-info.cpp")
+configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
+
 set(TARGET build_info)
-add_library(${TARGET} OBJECT build-info.cpp)
+add_library(${TARGET} OBJECT ${OUTPUT_FILE})
 if (BUILD_SHARED_LIBS)
     set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endif()
@@ -988,10 +988,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         params.tensor_buft_overrides.push_back({nullptr, nullptr});
     }
 
-    if (params.reranking && params.embedding) {
-        throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
-    }
-
     if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
         throw std::runtime_error(string_format(
             "error: the supplied chat template is not supported: %s%s\n",

@@ -2747,9 +2743,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
     add_opt(common_arg(
         {"--reranking", "--rerank"},
-        string_format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"),
+        string_format("enable reranking endpoint on server (default: %s)", "disabled"),
         [](common_params & params) {
-            params.reranking = true;
+            params.embedding = true;
+            params.pooling_type = LLAMA_POOLING_TYPE_RANK;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
     add_opt(common_arg(
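The handler above now expresses reranking purely through the embedding path. A minimal sketch of that configuration, assuming the `common_params` fields and the `LLAMA_POOLING_TYPE_RANK` constant from llama.cpp's common and llama headers; the helper function itself is hypothetical:

```cpp
#include "common.h" // llama.cpp common_params (assumed to be on the include path)

// Hypothetical helper mirroring what --reranking / --rerank does after this
// change: reranking is simply "embeddings with RANK pooling", no extra flag.
static void enable_reranking(common_params & params) {
    params.embedding    = true;
    params.pooling_type = LLAMA_POOLING_TYPE_RANK;
}
```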
@@ -1,4 +1,4 @@
-int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@;
-char const *LLAMA_COMMIT = "@BUILD_COMMIT@";
+int LLAMA_BUILD_NUMBER = @LLAMA_BUILD_NUMBER@;
+char const *LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@";
 char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
 char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
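For context, `configure_file()` replaces each `@VAR@` token in the template above with the value of the corresponding CMake variable. A hedged sketch of what a generated `build-info.cpp` could look like; every value below is an invented placeholder, not real build output:

```cpp
// Illustrative result of configure_file() on build-info.cpp.in; the values
// are made-up placeholders.
int LLAMA_BUILD_NUMBER = 1234;
char const *LLAMA_COMMIT = "deadbeef01";
char const *LLAMA_COMPILER = "clang version 17.0.0";
char const *LLAMA_BUILD_TARGET = "x86_64-pc-linux-gnu";
```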
@@ -49,6 +49,7 @@ bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::
 
     // LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str());
     result_.tool_calls.emplace_back(tool_call);
+
     return true;
 }
 bool common_chat_msg_parser::add_tool_call(const json & tool_call) {

@@ -378,3 +379,7 @@ std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parse
         /* .is_partial = */ found_healing_marker,
     };
 }
+
+void common_chat_msg_parser::clear_tools() {
+    result_.tool_calls.clear();
+}

@@ -115,4 +115,6 @@ class common_chat_msg_parser {
         const std::vector<std::vector<std::string>> & args_paths = {},
         const std::vector<std::vector<std::string>> & content_paths = {}
     );
+
+    void clear_tools();
 };
@@ -1838,7 +1838,7 @@ static common_chat_params common_chat_templates_apply_legacy(
     if (res < 0) {
         // if the custom "tmpl" is not supported, we throw an error
         // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
-        throw std::runtime_error("this custom template is not supported");
+        throw std::runtime_error("this custom template is not supported, try using --jinja");
     }
 
     // if it turns out that our buffer is too small, we resize it

@@ -1921,7 +1921,9 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
     } catch (const common_chat_msg_partial_exception & ex) {
         LOG_DBG("Partial parse: %s\n", ex.what());
         if (!is_partial) {
-            throw std::runtime_error(ex.what());
+            builder.clear_tools();
+            builder.move_to(0);
+            common_chat_parse_content_only(builder);
         }
     }
     auto msg = builder.result();
@@ -1,24 +0,0 @@
-include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
-
-set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
-set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
-
-# Only write the build info if it changed
-if(EXISTS ${OUTPUT_FILE})
-    file(READ ${OUTPUT_FILE} CONTENTS)
-    string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS})
-    set(OLD_COMMIT ${CMAKE_MATCH_1})
-    string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS})
-    set(OLD_COMPILER ${CMAKE_MATCH_1})
-    string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS})
-    set(OLD_TARGET ${CMAKE_MATCH_1})
-    if (
-        NOT OLD_COMMIT STREQUAL BUILD_COMMIT OR
-        NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
-        NOT OLD_TARGET STREQUAL BUILD_TARGET
-    )
-        configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
-    endif()
-else()
-    configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
-endif()
@@ -466,7 +466,7 @@ size_t string_find_partial_stop(const std::string_view & str, const std::string_
 
 std::string regex_escape(const std::string & s) {
     static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
-    return std::regex_replace(s, special_chars, "\\$0");
+    return std::regex_replace(s, special_chars, "\\$&");
 }
 
 std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
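The replacement format is the point of this fix: in `std::regex_replace`, the ECMAScript token `$&` stands for the entire match, while `$0` has no defined meaning. A small self-contained sketch of the corrected escaping behaviour (the function name here is illustrative only):

```cpp
#include <iostream>
#include <regex>
#include <string>

// Same idea as regex_escape above: prefix every regex metacharacter in the
// input with a backslash by re-emitting the whole match ("$&") after "\".
static std::string escape_regex(const std::string & s) {
    static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
    return std::regex_replace(s, special_chars, "\\$&");
}

int main() {
    std::cout << escape_regex("price (usd): 3.50") << "\n"; // price \(usd\): 3\.50
}
```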
@@ -767,6 +767,9 @@ bool fs_validate_filename(const std::string & filename) {
     return true;
 }
 
+#include <iostream>
+
 // returns true if successful, false otherwise
 bool fs_create_directory_with_parents(const std::string & path) {
 #ifdef _WIN32

@@ -784,9 +787,16 @@ bool fs_create_directory_with_parents(const std::string & path) {
     // process path from front to back, procedurally creating directories
     while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
         const std::wstring subpath = wpath.substr(0, pos_slash);
-        const wchar_t * test = subpath.c_str();
 
-        const bool success = CreateDirectoryW(test, NULL);
+        pos_slash += 1;
+
+        // skip the drive letter, in some systems it can return an access denied error
+        if (subpath.length() == 2 && subpath[1] == ':') {
+            continue;
+        }
+
+        const bool success = CreateDirectoryW(subpath.c_str(), NULL);
 
         if (!success) {
             const DWORD error = GetLastError();

@@ -800,8 +810,6 @@ bool fs_create_directory_with_parents(const std::string & path) {
                 return false;
             }
         }
-
-        pos_slash += 1;
     }
 
     return true;
@@ -897,34 +905,6 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
-    if (params.reranking) {
-        bool ok = true;
-
-        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
-            ok = false;
-        }
-
-        bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
-        bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
-
-        if (!has_eos && !has_sep) {
-            LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
-            ok = false;
-        } else if (!has_eos) {
-            LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
-        } else if (!has_sep) {
-            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
-            ok = false;
-        }
-
-        if (!ok) {
-            llama_model_free(model);
-
-            return iparams;
-        }
-    }
-
     auto cparams = common_context_params_to_llama(params);
 
     llama_context * lctx = llama_init_from_model(model, cparams);
@@ -966,6 +946,35 @@ struct common_init_result common_init_from_params(common_params & params) {
         }
     }
 
+    if (llama_pooling_type(lctx) == LLAMA_POOLING_TYPE_RANK) {
+        bool ok = true;
+
+        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
+            ok = false;
+        }
+
+        bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
+        bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
+
+        if (!has_eos && !has_sep) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
+            ok = false;
+        } else if (!has_eos) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
+        } else if (!has_sep) {
+            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
+            ok = false;
+        }
+
+        if (!ok) {
+            llama_free(lctx);
+            llama_model_free(model);
+
+            return iparams;
+        }
+    }
+
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
         llama_adapter_lora_ptr lora;
@@ -1143,11 +1152,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.op_offload = !params.no_op_offload;
     cparams.swa_full = params.swa_full;
 
-    if (params.reranking) {
-        cparams.embeddings = true;
-        cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
-    }
-
     cparams.type_k = params.cache_type_k;
     cparams.type_v = params.cache_type_v;
@@ -355,7 +355,6 @@ struct common_params {
     int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
     std::string embd_sep = "\n"; // separator of embeddings
-    bool reranking = false; // enable reranking support on server
 
     // server params
     int32_t port = 8080; // server listens on this network port
@@ -519,7 +519,7 @@ class TextModel(ModelBase):
     def set_gguf_parameters(self):
         self.gguf_writer.add_block_count(self.block_count)
 
-        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions"], optional=True)) is not None:
+        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length"], optional=True)) is not None:
             self.gguf_writer.add_context_length(n_ctx)
             logger.info(f"gguf: context length = {n_ctx}")

@@ -556,11 +556,8 @@ class TextModel(ModelBase):
             logger.info(f"gguf: experts used count = {n_experts_used}")
 
         if (head_dim := self.hparams.get("head_dim")) is not None:
-            # Workaround for incorrect AutoConfig value for DeepSeekV3 (is set correctly in DeepSeekV2Model class)
-            # https://github.com/huggingface/transformers/blob/19224c3642705c5b6988c9f5f4251f83323d05ae/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py#L210
-            if self.hparams.get("model_type") != "deepseek_v3":
-                self.gguf_writer.add_key_length(head_dim)
-                self.gguf_writer.add_value_length(head_dim)
+            self.gguf_writer.add_key_length(head_dim)
+            self.gguf_writer.add_value_length(head_dim)
 
         self.gguf_writer.add_file_type(self.ftype)
         logger.info(f"gguf: file type = {self.ftype}")
@@ -1901,9 +1898,7 @@ class LlamaModel(TextModel):
         hparams = self.hparams
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
 
-        if "head_dim" in hparams:
-            rope_dim = hparams["head_dim"]
-        else:
+        if (rope_dim := hparams.get("head_dim")) is None:
             rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(rope_dim)

@@ -1985,7 +1980,8 @@ class LlamaModel(TextModel):
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                if (dim := self.hparams.get("head_dim")) is None:
+                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
 
                 factor = rope_scaling.get("factor", 8.0)
@@ -2020,6 +2016,20 @@ class LlamaModel(TextModel):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@ModelBase.register("ArceeForCausalLM")
+class ArceeModel(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.ARCEE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self._try_set_pooling_type()
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+
+
 @ModelBase.register(
     "LlavaForConditionalGeneration", # pixtral
     "Mistral3ForConditionalGeneration", # mistral small 3.1
@@ -2307,9 +2317,7 @@ class DeciModel(TextModel):
         hparams = self.hparams
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
 
-        if "head_dim" in hparams:
-            rope_dim = hparams["head_dim"]
-        else:
+        if (rope_dim := hparams.get("head_dim")) is None:
             rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(rope_dim)

@@ -2349,7 +2357,8 @@ class DeciModel(TextModel):
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                if (dim := self.hparams.get("head_dim")) is None:
+                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
 
                 factor = rope_scaling.get("factor", 8.0)
@@ -3667,9 +3676,7 @@ class InternLM3Model(TextModel):
         hparams = self.hparams
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
 
-        if "head_dim" in hparams:
-            rope_dim = hparams["head_dim"]
-        else:
+        if (rope_dim := hparams.get("head_dim")) is None:
             rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(rope_dim)
@@ -4062,6 +4069,34 @@ class NomicBertModel(BertModel):
             raise ValueError(f"unknown tokenizer: {toktyp}")
 
 
+@ModelBase.register("NeoBERT", "NeoBERTLMHead", "NeoBERTForSequenceClassification")
+class NeoBert(BertModel):
+    model_arch = gguf.MODEL_ARCH.NEO_BERT
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        # NeoBERT uses 2/3 of the intermediate size as feed forward length
+        self.gguf_writer.add_feed_forward_length(int(2 * self.hparams["intermediate_size"] / 3))
+        self.gguf_writer.add_rope_freq_base(10000.0) # default value for NeoBERT
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+
+        f_rms_eps = self.hparams.get("norm_eps", 1e-6) # default value for NeoBERT
+        self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
+        logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
+
+        self.gguf_writer.add_pooling_type(gguf.PoolingType.CLS) # https://huggingface.co/chandar-lab/NeoBERT#how-to-use
+
+    def modify_tensors(self, data_torch, name, bid):
+        if name.startswith("decoder."):
+            return []
+
+        if name.startswith("model."):
+            name = name[6:]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
 class XLMRobertaModel(BertModel):
     model_arch = gguf.MODEL_ARCH.BERT
@@ -5158,9 +5193,7 @@ class DeepseekModel(TextModel):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
-        if "head_dim" in hparams:
-            rope_dim = hparams["head_dim"]
-        else:
+        if (rope_dim := hparams.get("head_dim")) is None:
             rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
 
         self.gguf_writer.add_rope_dimension_count(rope_dim)
@@ -5364,6 +5397,34 @@ class DeepseekV2Model(TextModel):
             raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@ModelBase.register("Dots1ForCausalLM")
+class Dots1Model(Qwen2MoeModel):
+    model_arch = gguf.MODEL_ARCH.DOTS1
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.hparams["num_experts"] = self.hparams["n_routed_experts"]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"])
+        self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"])
+        self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
+        self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
+
+        if self.hparams["scoring_func"] == "noaux_tc":
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+        else:
+            raise ValueError(f"Unsupported scoring_func value: {self.hparams['scoring_func']}")
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+        if "shared_experts" in name:
+            return [(self.map_tensor_name(name), data_torch)]
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("PLMForCausalLM")
 class PLMModel(TextModel):
     model_arch = gguf.MODEL_ARCH.PLM
@@ -6022,7 +6083,8 @@ class ExaoneModel(TextModel):
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                if (dim := self.hparams.get("head_dim")) is None:
+                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
 
                 factor = rope_scaling.get("factor", 8.0)

@@ -6134,7 +6196,8 @@ class BailingMoeModel(TextModel):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
-        rope_dim = hparams.get("head_dim") or hparams["hidden_size"] // hparams["num_attention_heads"]
+        if (rope_dim := hparams.get("head_dim")) is None:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
 
         self.gguf_writer.add_rope_dimension_count(rope_dim)
         rope_scaling = self.hparams.get("rope_scaling") or {}

@@ -6166,7 +6229,8 @@ class BailingMoeModel(TextModel):
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
         n_embd = self.hparams["hidden_size"]
-        head_dim = self.hparams.get("head_dim") or n_embd // n_head
+        if (head_dim := self.hparams.get("head_dim")) is None:
+            head_dim = n_embd // n_head
 
         output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
@@ -0,0 +1,157 @@
+> [!IMPORTANT]
+> This build documentation is specific only to IBM Z & LinuxONE mainframes (s390x). You can find the build documentation for other architectures: [build.md](build.md).
+
+# Build llama.cpp locally (for s390x)
+
+The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](../include/llama.h).
+
+The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server.
+
+**To get the code:**
+
+```bash
+git clone https://github.com/ggml-org/llama.cpp
+cd llama.cpp
+```
+
+## CPU Build with BLAS
+
+Building llama.cpp with BLAS support is highly recommended as it has shown to provide performance improvements.
+
+```bash
+cmake -S . -B build \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DGGML_BLAS=ON \
+    -DGGML_BLAS_VENDOR=OpenBLAS
+
+cmake --build build --config Release -j $(nproc)
+```
+
+**Notes**:
+- For faster repeated compilation, install [ccache](https://ccache.dev/)
+- By default, VXE/VXE2 is enabled. To disable it (not recommended):
+
+```bash
+cmake -S . -B build \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DGGML_BLAS=ON \
+    -DGGML_BLAS_VENDOR=OpenBLAS \
+    -DGGML_VXE=OFF
+
+cmake --build build --config Release -j $(nproc)
+```
+
+- For debug builds:
+
+```bash
+cmake -S . -B build \
+    -DCMAKE_BUILD_TYPE=Debug \
+    -DGGML_BLAS=ON \
+    -DGGML_BLAS_VENDOR=OpenBLAS
+
+cmake --build build --config Debug -j $(nproc)
+```
+
+- For static builds, add `-DBUILD_SHARED_LIBS=OFF`:
+
+```bash
+cmake -S . -B build \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DGGML_BLAS=ON \
+    -DGGML_BLAS_VENDOR=OpenBLAS \
+    -DBUILD_SHARED_LIBS=OFF
+
+cmake --build build --config Release -j $(nproc)
+```
+
+## Getting GGUF Models
+
+All models need to be converted to Big-Endian. You can achieve this in three cases:
+
+1. **Use pre-converted models verified for use on IBM Z & LinuxONE (easiest)**
+
+   You can find popular models pre-converted and verified at [s390x Ready Models](hf.co/collections/taronaeo/s390x-ready-models-672765393af438d0ccb72a08).
+
+   These models and their respective tokenizers are verified to run correctly on IBM Z & LinuxONE.
+
+2. **Convert safetensors model to GGUF Big-Endian directly (recommended)**
+
+   ```bash
+   python3 convert_hf_to_gguf.py \
+       --outfile model-name-be.f16.gguf \
+       --outtype f16 \
+       --bigendian \
+       model-directory/
+   ```
+
+   For example,
+
+   ```bash
+   python3 convert_hf_to_gguf.py \
+       --outfile granite-3.3-2b-instruct-be.f16.gguf \
+       --outtype f16 \
+       --bigendian \
+       granite-3.3-2b-instruct/
+   ```
+
+3. **Convert existing GGUF Little-Endian model to Big-Endian**
+
+   ```bash
+   python3 gguf-py/gguf/scripts/gguf_convert_endian.py model-name.f16.gguf BIG
+   ```
+
+   For example,
+   ```bash
+   python3 gguf-py/gguf/scripts/gguf_convert_endian.py granite-3.3-2b-instruct-le.f16.gguf BIG
+   mv granite-3.3-2b-instruct-le.f16.gguf granite-3.3-2b-instruct-be.f16.gguf
+   ```
+
+**Notes:**
+- The GGUF endian conversion script may not support all data types at the moment and may fail for some models/quantizations. When that happens, please try manually converting the safetensors model to GGUF Big-Endian via Step 2.
+
+## IBM Accelerators
+
+### 1. SIMD Acceleration
+
+Only available in IBM z15 or later system with the `-DGGML_VXE=ON` (turned on by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z14 or EC13. In such systems, the APIs can still run but will use a scalar implementation.
+
+### 2. zDNN Accelerator
+
+*Only available in IBM z16 or later system. No direction at the moment.*
+
+### 3. Spyre Accelerator
+
+*No direction at the moment.*
+
+## Performance Tuning
+
+### 1. Virtualization Setup
+
+It is strongly recommended to use only LPAR (Type-1) virtualization to get the most performance.
+
+Note: Type-2 virtualization is not supported at the moment, while you can get it running, the performance will not be the best.
+
+### 2. IFL (Core) Count
+
+It is recommended to allocate a minimum of 8 shared IFLs assigned to the LPAR. Increasing the IFL count past 8 shared IFLs will only improve Prompt Processing performance but not Token Generation.
+
+Note: IFL count does not equate to vCPU count.
+
+### 3. SMT vs NOSMT (Simultaneous Multithreading)
+
+It is strongly recommended to disable SMT via the kernel boot parameters as it negatively affects performance. Please refer to your Linux distribution's guide on disabling SMT via kernel boot parameters.
+
+### 4. BLAS vs NOBLAS
+
+IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongly recommended to use BLAS.
+
+## Getting Help on IBM Z & LinuxONE
+
+1. **Bugs, Feature Requests**
+
+   Please file an issue in llama.cpp and ensure that the title contains "s390x".
+
+2. **Other Questions**
+
+   Please reach out directly to [aionz@us.ibm.com](mailto:aionz@us.ibm.com).
@@ -11,7 +11,7 @@ Function calling is supported for all models (see https://github.com/ggml-org/ll
 - Llama 3.1 / 3.3 (including builtin tools support - tool names for `wolfram_alpha`, `web_search` / `brave_search`, `code_interpreter`), Llama 3.2
 - Functionary v3.1 / v3.2
 - Hermes 2/3, Qwen 2.5
-- Qwen 2.5 Coder (WIP: https://github.com/ggml-org/llama.cpp/pull/12034)
+- Qwen 2.5 Coder
 - Mistral Nemo
 - Firefunction v2
 - Command R7B
@@ -107,3 +107,7 @@ NOTE: some models may require large context window, for example: `-c 8192`
 (tool_name) -hf ggml-org/Qwen2.5-Omni-3B-GGUF
 (tool_name) -hf ggml-org/Qwen2.5-Omni-7B-GGUF
 ```
+
+## Finding more models:
+
+GGUF models on Huggingface with vision capabilities can be found here: https://huggingface.co/models?pipeline_tag=image-text-to-text&sort=trending&search=gguf
@@ -41,12 +41,11 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
 
     // add input to batch (this increments n_tokens)
     for (int32_t j = 0; j < n_toks; j++) {
-        common_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
+        common_batch_add(batch, inputs[j], j, { 0 }, true);
     }
 
     // clear previous kv_cache values (irrelevant for embeddings)
     llama_memory_clear(llama_get_memory(ctx), true);
-    llama_set_embeddings(ctx, true);
     llama_set_causal_attn(ctx, false);
 
     // run model

@@ -103,7 +102,6 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
     llama_token eos_token = llama_vocab_eos(vocab);
 
     llama_memory_clear(llama_get_memory(ctx), true);
-    llama_set_embeddings(ctx, false);
     llama_set_causal_attn(ctx, true);
 
     llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);

@@ -166,6 +164,8 @@ int main(int argc, char * argv[]) {
     llama_model_params mparams = common_model_params_to_llama(params);
     llama_context_params cparams = common_context_params_to_llama(params);
 
+    cparams.embeddings = true;
+
     llama_backend_init();
 
     llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);

@@ -213,6 +213,8 @@ int main(int argc, char * argv[]) {
         std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[1].c_str(), cosine_sim_q1_d1);
     }
 
+    llama_set_embeddings(ctx, false);
+
     // ### Generation ###
     // GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction
     {
@@ -172,6 +172,7 @@ option(GGML_HIP "ggml: use HIP"
 option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
 option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
 option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
+option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
 option(GGML_VULKAN "ggml: use Vulkan" OFF)
 option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
 option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)

@@ -367,6 +368,8 @@ if (MSVC)
         /wd4005 # Macro redefinition
         /wd4244 # Conversion from one type to another type, possible loss of data
         /wd4267 # Conversion from 'size_t' to a smaller type, possible loss of data
+        /wd4305 # Conversion from 'type1' to 'type2', possible loss of data
+        /wd4566 # Conversion from 'char' to 'wchar_t', possible loss of data
         /wd4996 # Disable POSIX deprecation warnings
         /wd4702 # Unreachable code warnings
     )

@@ -386,4 +389,46 @@ if (MSVC)
     disable_msvc_warnings(ggml-cpu-skylakex)
     disable_msvc_warnings(ggml-cpu-icelake)
     disable_msvc_warnings(ggml-cpu-alderlake)
+
+    if (GGML_BUILD_EXAMPLES)
+        disable_msvc_warnings(common-ggml)
+        disable_msvc_warnings(common)
+
+        disable_msvc_warnings(mnist-common)
+        disable_msvc_warnings(mnist-eval)
+        disable_msvc_warnings(mnist-train)
+
+        disable_msvc_warnings(gpt-2-ctx)
+        disable_msvc_warnings(gpt-2-alloc)
+        disable_msvc_warnings(gpt-2-backend)
+        disable_msvc_warnings(gpt-2-sched)
+        disable_msvc_warnings(gpt-2-quantize)
+        disable_msvc_warnings(gpt-2-batched)
+
+        disable_msvc_warnings(gpt-j)
+        disable_msvc_warnings(gpt-j-quantize)
+
+        disable_msvc_warnings(magika)
+        disable_msvc_warnings(yolov3-tiny)
+        disable_msvc_warnings(sam)
+
+        disable_msvc_warnings(simple-ctx)
+        disable_msvc_warnings(simple-backend)
+    endif()
+
+    if (GGML_BUILD_TESTS)
+        disable_msvc_warnings(test-mul-mat)
+        disable_msvc_warnings(test-arange)
+        disable_msvc_warnings(test-backend-ops)
+        disable_msvc_warnings(test-cont)
+        disable_msvc_warnings(test-conv-transpose)
+        disable_msvc_warnings(test-conv-transpose-1d)
+        disable_msvc_warnings(test-conv1d)
+        disable_msvc_warnings(test-conv2d)
+        disable_msvc_warnings(test-conv2d-dw)
+        disable_msvc_warnings(test-customop)
+        disable_msvc_warnings(test-dup)
+        disable_msvc_warnings(test-opt)
+        disable_msvc_warnings(test-pool)
+    endif ()
 endif()
@@ -36,8 +36,7 @@ function(ggml_get_system_arch)
             (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
              CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
         set(GGML_SYSTEM_ARCH "x86" PARENT_SCOPE)
-    elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR
-            "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
+    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc|power")
         set(GGML_SYSTEM_ARCH "PowerPC" PARENT_SCOPE)
     elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
         set(GGML_SYSTEM_ARCH "loongarch64" PARENT_SCOPE)
@ -270,17 +270,23 @@ endfunction()
|
||||||
function(ggml_add_cpu_backend_variant tag_name)
|
function(ggml_add_cpu_backend_variant tag_name)
|
||||||
set(GGML_CPU_TAG_NAME ${tag_name})
|
set(GGML_CPU_TAG_NAME ${tag_name})
|
||||||
# other: OPENMP LLAMAFILE CPU_HBM
|
# other: OPENMP LLAMAFILE CPU_HBM
|
||||||
foreach (feat NATIVE
|
if (GGML_SYSTEM_ARCH STREQUAL "x86")
|
||||||
SSE42
|
foreach (feat NATIVE
|
||||||
AVX AVX2 BMI2 AVX_VNNI FMA F16C
|
SSE42
|
||||||
AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
|
AVX AVX2 BMI2 AVX_VNNI FMA F16C
|
||||||
AMX_TILE AMX_INT8 AMX_BF16)
|
AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
|
||||||
set(GGML_${feat} OFF)
|
AMX_TILE AMX_INT8 AMX_BF16)
|
||||||
endforeach()
|
set(GGML_${feat} OFF)
|
||||||
|
endforeach()
|
||||||
|
|
||||||
foreach (feat ${ARGN})
|
foreach (feat ${ARGN})
|
||||||
set(GGML_${feat} ON)
|
set(GGML_${feat} ON)
|
||||||
endforeach()
|
endforeach()
|
||||||
|
elseif (GGML_SYSTEM_ARCH STREQUAL "ARM")
|
||||||
|
foreach (feat ${ARGN})
|
||||||
|
set(GGML_INTERNAL_${feat} ON)
|
||||||
|
endforeach()
|
||||||
|
endif()
|
||||||
|
|
||||||
ggml_add_cpu_backend_variant_impl(${tag_name})
|
ggml_add_cpu_backend_variant_impl(${tag_name})
|
||||||
endfunction()
|
endfunction()
|
||||||
|
|
@@ -290,6 +296,8 @@ ggml_add_backend(CPU)
 if (GGML_CPU_ALL_VARIANTS)
     if (NOT GGML_BACKEND_DL)
         message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
+    elseif (GGML_CPU_ARM_ARCH)
+        message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS")
     endif()
     if (GGML_SYSTEM_ARCH STREQUAL "x86")
         ggml_add_cpu_backend_variant(x64)
@@ -303,8 +311,34 @@ if (GGML_CPU_ALL_VARIANTS)
             # MSVC doesn't support AMX
             ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
         endif()
+    elseif(GGML_SYSTEM_ARCH STREQUAL "ARM")
+        if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+            # Many of these features are optional so we build versions with popular
+            # combinations and name the backends based on the version they were
+            # first released with
+            ggml_add_cpu_backend_variant(armv8.0_1)
+            ggml_add_cpu_backend_variant(armv8.2_1 DOTPROD)
+            ggml_add_cpu_backend_variant(armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
+            ggml_add_cpu_backend_variant(armv8.2_3 DOTPROD FP16_VECTOR_ARITHMETIC SVE)
+            ggml_add_cpu_backend_variant(armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8)
+            ggml_add_cpu_backend_variant(armv8.6_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2)
+            ggml_add_cpu_backend_variant(armv9.2_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SME)
+            ggml_add_cpu_backend_variant(armv9.2_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2 SME)
+        elseif (CMAKE_SYSTEM_NAME MATCHES "Android")
+            # Android-specific backends with SoC-compatible feature sets
+            ggml_add_cpu_backend_variant(android_armv8.0_1)
+            ggml_add_cpu_backend_variant(android_armv8.2_1 DOTPROD)
+            ggml_add_cpu_backend_variant(android_armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
+            ggml_add_cpu_backend_variant(android_armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC MATMUL_INT8)
+        elseif (APPLE)
+            ggml_add_cpu_backend_variant(apple_m1 DOTPROD)
+            ggml_add_cpu_backend_variant(apple_m2_m3 DOTPROD MATMUL_INT8)
+            ggml_add_cpu_backend_variant(apple_m4 DOTPROD MATMUL_INT8 NOSVE SME)
+        else()
+            message(FATAL_ERROR "Unsupported ARM target OS: ${CMAKE_SYSTEM_NAME}")
+        endif()
     else()
-        message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported on ${GGML_SYSTEM_ARCH}")
+        message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported with ${GGML_SYSTEM_ARCH} on ${CMAKE_SYSTEM_NAME}")
     endif()
 elseif (GGML_CPU)
     ggml_add_cpu_backend_variant_impl("")
@@ -1,3 +1,17 @@
+function(ggml_add_cpu_backend_features cpu_name arch)
+    # The feature detection code is compiled as a separate target so that
+    # it can be built without the architecture flags
+    # Since multiple variants of the CPU backend may be included in the same
+    # build, using set_source_files_properties() to set the arch flags is not possible
+    set(GGML_CPU_FEATS_NAME ${cpu_name}-feats)
+    add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/arch/${arch}/cpu-feats.cpp)
+    target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
+    target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARGN})
+    target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
+    set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_link_libraries(${cpu_name} PRIVATE ${GGML_CPU_FEATS_NAME})
+endfunction()
+
 function(ggml_add_cpu_backend_variant_impl tag_name)
     if (tag_name)
         set(GGML_CPU_NAME ggml-cpu-${tag_name})
@@ -143,6 +157,49 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         else()
             if (GGML_CPU_ARM_ARCH)
                 list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH})
+            elseif(GGML_CPU_ALL_VARIANTS)
+                # Begin with the lowest baseline
+                set(ARM_MCPU "armv8-a")
+                set(ARCH_TAGS "")
+                set(ARCH_DEFINITIONS "")
+
+                # When a feature is selected, bump the MCPU to the first
+                # version that supported it
+                if (GGML_INTERNAL_DOTPROD)
+                    set(ARM_MCPU "armv8.2-a")
+                    set(ARCH_TAGS "${ARCH_TAGS}+dotprod")
+                    list(APPEND ARCH_DEFINITIONS GGML_USE_DOTPROD)
+                endif()
+                if (GGML_INTERNAL_FP16_VECTOR_ARITHMETIC)
+                    set(ARM_MCPU "armv8.2-a")
+                    set(ARCH_TAGS "${ARCH_TAGS}+fp16")
+                    list(APPEND ARCH_DEFINITIONS GGML_USE_FP16_VECTOR_ARITHMETIC)
+                endif()
+                if (GGML_INTERNAL_SVE)
+                    set(ARM_MCPU "armv8.2-a")
+                    set(ARCH_TAGS "${ARCH_TAGS}+sve")
+                    list(APPEND ARCH_DEFINITIONS GGML_USE_SVE)
+                endif()
+                if (GGML_INTERNAL_MATMUL_INT8)
+                    set(ARM_MCPU "armv8.6-a")
+                    set(ARCH_TAGS "${ARCH_TAGS}+i8mm")
+                    list(APPEND ARCH_DEFINITIONS GGML_USE_MATMUL_INT8)
+                endif()
+                if (GGML_INTERNAL_SVE2)
+                    set(ARM_MCPU "armv8.6-a")
+                    set(ARCH_TAGS "${ARCH_TAGS}+sve2")
+                    list(APPEND ARCH_DEFINITIONS GGML_USE_SVE2)
+                endif()
+                if (GGML_INTERNAL_NOSVE)
+                    set(ARCH_TAGS "${ARCH_TAGS}+nosve")
+                endif()
+                if (GGML_INTERNAL_SME)
+                    set(ARM_MCPU "armv9.2-a")
+                    set(ARCH_TAGS "${ARCH_TAGS}+sme")
+                    list(APPEND ARCH_DEFINITIONS GGML_USE_SME)
+                endif()
+                list(APPEND ARCH_FLAGS "-march=${ARM_MCPU}${ARCH_TAGS}")
+                ggml_add_cpu_backend_features(${GGML_CPU_NAME} arm ${ARCH_DEFINITIONS})
             endif()
         endif()
@@ -306,18 +363,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             # the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
             message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
         endif()
-
-        # The feature detection code is compiled as a separate target so that
-        # it can be built without the architecture flags
-        # Since multiple variants of the CPU backend may be included in the same
-        # build, using set_source_files_properties() to set the arch flags is not possible
-        set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats)
-        add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/arch/x86/cpu-feats.cpp)
-        target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
-        target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS})
-        target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
-        set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-        target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
+        ggml_add_cpu_backend_features(${GGML_CPU_NAME} x86 ${ARCH_DEFINITIONS})
     endif()
 elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC")
     message(STATUS "PowerPC detected")
@ -0,0 +1,184 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
// Rename `_generic` functions if no native implementation is available.
|
||||||
|
// This effectively selects the generic implementation.
|
||||||
|
|
||||||
|
#if defined(GGML_CPU_GENERIC)
|
||||||
|
// quants.c
|
||||||
|
#define quantize_row_q8_0_generic quantize_row_q8_0
|
||||||
|
#define quantize_row_q8_1_generic quantize_row_q8_1
|
||||||
|
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||||
|
#define ggml_vec_dot_q4_0_q8_0_generic ggml_vec_dot_q4_0_q8_0
|
||||||
|
#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
|
||||||
|
#define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0
|
||||||
|
#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
|
||||||
|
#define ggml_vec_dot_q8_0_q8_0_generic ggml_vec_dot_q8_0_q8_0
|
||||||
|
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
||||||
|
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||||
|
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
|
||||||
|
#define ggml_vec_dot_q3_K_q8_K_generic ggml_vec_dot_q3_K_q8_K
|
||||||
|
#define ggml_vec_dot_q4_K_q8_K_generic ggml_vec_dot_q4_K_q8_K
|
||||||
|
#define ggml_vec_dot_q5_K_q8_K_generic ggml_vec_dot_q5_K_q8_K
|
||||||
|
#define ggml_vec_dot_q6_K_q8_K_generic ggml_vec_dot_q6_K_q8_K
|
||||||
|
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
|
||||||
|
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
|
||||||
|
#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
|
||||||
|
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
|
||||||
|
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
|
||||||
|
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
|
||||||
|
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
||||||
|
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
|
||||||
|
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
|
||||||
|
// repack.cpp
|
||||||
|
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||||
|
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||||
|
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||||
|
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||||
|
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||||
|
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||||
|
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||||
|
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||||
|
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||||
|
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||||
|
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||||
|
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
|
||||||
|
// repack.cpp
|
||||||
|
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||||
|
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||||
|
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
|
||||||
|
// repack.cpp
|
||||||
|
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||||
|
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||||
|
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||||
|
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||||
|
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||||
|
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||||
|
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||||
|
#elif defined(__POWERPC__) || defined(__powerpc__)
|
||||||
|
// ref: https://github.com/ggml-org/llama.cpp/pull/14146#issuecomment-2972561679
|
||||||
|
// quants.c
|
||||||
|
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||||
|
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
||||||
|
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||||
|
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
||||||
|
// repack.cpp
|
||||||
|
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||||
|
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||||
|
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||||
|
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||||
|
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||||
|
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||||
|
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||||
|
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||||
|
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||||
|
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||||
|
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||||
|
#elif defined(__loongarch64)
|
||||||
|
// quants.c
|
||||||
|
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||||
|
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
||||||
|
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||||
|
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
||||||
|
// repack.cpp
|
||||||
|
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||||
|
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||||
|
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||||
|
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||||
|
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||||
|
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||||
|
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||||
|
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||||
|
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||||
|
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||||
|
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||||
|
#elif defined(__riscv)
|
||||||
|
// quants.c
|
||||||
|
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||||
|
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
||||||
|
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||||
|
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
|
||||||
|
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
|
||||||
|
#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
|
||||||
|
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
|
||||||
|
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
|
||||||
|
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
|
||||||
|
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
||||||
|
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
|
||||||
|
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
|
||||||
|
// repack.cpp
|
||||||
|
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||||
|
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||||
|
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||||
|
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||||
|
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||||
|
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||||
|
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||||
|
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||||
|
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||||
|
#elif defined(__s390x__)
|
||||||
|
// quants.c
|
||||||
|
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||||
|
#define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0
|
||||||
|
#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
|
||||||
|
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
||||||
|
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||||
|
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
|
||||||
|
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
|
||||||
|
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
|
||||||
|
#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
|
||||||
|
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
|
||||||
|
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
|
||||||
|
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
|
||||||
|
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
||||||
|
// repack.cpp
|
||||||
|
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||||
|
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||||
|
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||||
|
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||||
|
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||||
|
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||||
|
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||||
|
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||||
|
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||||
|
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||||
|
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||||
|
#elif defined(__wasm__)
|
||||||
|
// quants.c
|
||||||
|
#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
|
||||||
|
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
||||||
|
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||||
|
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
|
||||||
|
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
|
||||||
|
#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
|
||||||
|
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
|
||||||
|
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
|
||||||
|
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
|
||||||
|
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
||||||
|
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
|
||||||
|
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
|
||||||
|
// repack.cpp
|
||||||
|
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||||
|
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||||
|
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||||
|
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||||
|
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||||
|
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||||
|
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||||
|
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||||
|
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||||
|
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||||
|
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||||
|
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||||
|
#endif
|
||||||
|
|
@@ -0,0 +1,94 @@
+#include "ggml-backend-impl.h"
+
+#if defined(__aarch64__)
+
+#if defined(__linux__)
+#include <sys/auxv.h>
+#elif defined(__APPLE__)
+#include <sys/sysctl.h>
+#endif
+
+#if !defined(HWCAP2_I8MM)
+#define HWCAP2_I8MM (1 << 13)
+#endif
+
+#if !defined(HWCAP2_SME)
+#define HWCAP2_SME (1 << 23)
+#endif
+
+struct aarch64_features {
+    // has_neon not needed, aarch64 has NEON guaranteed
+    bool has_dotprod = false;
+    bool has_fp16_va = false;
+    bool has_sve = false;
+    bool has_sve2 = false;
+    bool has_i8mm = false;
+    bool has_sme = false;
+
+    aarch64_features() {
+#if defined(__linux__)
+        uint32_t hwcap = getauxval(AT_HWCAP);
+        uint32_t hwcap2 = getauxval(AT_HWCAP2);
+
+        has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
+        has_fp16_va = !!(hwcap & HWCAP_FPHP);
+        has_sve = !!(hwcap & HWCAP_SVE);
+        has_sve2 = !!(hwcap2 & HWCAP2_SVE2);
+        has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
+        has_sme = !!(hwcap2 & HWCAP2_SME);
+#elif defined(__APPLE__)
+        int oldp = 0;
+        size_t size = sizeof(oldp);
+
+        if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) == 0) {
+            has_dotprod = static_cast<bool>(oldp);
+        }
+
+        if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) == 0) {
+            has_i8mm = static_cast<bool>(oldp);
+        }
+
+        if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) == 0) {
+            has_sme = static_cast<bool>(oldp);
+        }
+
+        // Apple apparently does not implement SVE yet
+#endif
+    }
+};
+
+static int ggml_backend_cpu_aarch64_score() {
+    int score = 1;
+    aarch64_features af;
+
+#ifdef GGML_USE_DOTPROD
+    if (!af.has_dotprod) { return 0; }
+    score += 1<<1;
+#endif
+#ifdef GGML_USE_FP16_VECTOR_ARITHMETIC
+    if (!af.has_fp16_va) { return 0; }
+    score += 1<<2;
+#endif
+#ifdef GGML_USE_SVE
+    if (!af.has_sve) { return 0; }
+    score += 1<<3;
+#endif
+#ifdef GGML_USE_MATMUL_INT8
+    if (!af.has_i8mm) { return 0; }
+    score += 1<<4;
+#endif
+#ifdef GGML_USE_SVE2
+    if (!af.has_sve2) { return 0; }
+    score += 1<<5;
+#endif
+#ifdef GGML_USE_SME
+    if (!af.has_sme) { return 0; }
+    score += 1<<6;
+#endif
+
+    return score;
+}
+
+GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_aarch64_score)
+
+#endif // defined(__aarch64__)
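Review note: the scoring function above gives every compiled-in feature its own power-of-two weight and bails out with 0 as soon as a required feature is missing, so among the variants that can actually run, the one with the largest feature set always wins. A rough, self-contained illustration of that ordering (hypothetical helper, not the ggml API):

```cpp
// Sketch only: two feature flags instead of six, same selection logic.
#include <cstdio>

static int variant_score(bool needs_dotprod, bool needs_i8mm, bool cpu_dotprod, bool cpu_i8mm) {
    int s = 1;
    if (needs_dotprod) { if (!cpu_dotprod) return 0; s += 1 << 1; }
    if (needs_i8mm)    { if (!cpu_i8mm)    return 0; s += 1 << 4; }
    return s;
}

int main() {
    // CPU with dotprod but without i8mm (an armv8.2-class core):
    printf("armv8.0 variant: %d\n", variant_score(false, false, true, false)); // 1
    printf("armv8.2 variant: %d\n", variant_score(true,  false, true, false)); // 3 -> selected
    printf("armv8.6 variant: %d\n", variant_score(true,  true,  true, false)); // 0 -> rejected
    return 0;
}
```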
@@ -371,7 +371,7 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
 #define vec_xor(a, b) ((a) ^ (b)) // Vector XOR
 #endif
 
 typedef signed char char8x16_t __attribute__((vector_size(16)));
 typedef unsigned char uchar8x16_t __attribute__((vector_size(16)));
 
 typedef int8_t int8x16_t __attribute__((vector_size(16)));
@@ -382,10 +382,10 @@ typedef uint8_t uint8x16_t __attribute__((vector_size(16)));
 typedef uint16_t uint16x8_t __attribute__((vector_size(16)));
 typedef uint32_t uint32x4_t __attribute__((vector_size(16)));
 
 typedef float float32x4_t __attribute__((vector_size(16)));
-typedef double double64x2_t __attribute((vector_size(16)));
+typedef double double64x2_t __attribute__((vector_size(16)));
 
-typedef signed long long long64x2_t __attribute((vector_size(16)));
+typedef signed long long long64x2_t __attribute__((vector_size(16)));
 typedef unsigned long long ulong64x2_t __attribute__((vector_size(16)));
 
 typedef struct ggml_uint8x16x2_t {
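Review note: the two changed typedefs only normalize the attribute spelling to `__attribute__`, matching the surrounding lines. For reference, a minimal standalone sketch of the GCC/Clang vector-extension typedefs this block relies on (generic example, not tied to the s390x helpers):

```cpp
// Each typedef packs N scalars into a 16-byte vector; arithmetic is element-wise.
#include <cstdint>

typedef int32_t i32x4 __attribute__((vector_size(16))); // 4 x int32 in 16 bytes
typedef float   f32x4 __attribute__((vector_size(16))); // 4 x float in 16 bytes

int main() {
    i32x4 a = {1, 2, 3, 4};
    i32x4 b = {10, 20, 30, 40};
    i32x4 c = a + b;               // lowered to SIMD where the target supports it
    return c[3] == 44 ? 0 : 1;
}
```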
@@ -503,31 +503,9 @@ static __m256 __lasx_xvreplfr2vr_s(const float val) {
 // TODO: move to ggml-threading
 void ggml_barrier(struct ggml_threadpool * tp);
 
+void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value);
+int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value);
+
 #ifdef __cplusplus
 }
 #endif
-
-#define GGML_DO_PRAGMA_(x) _Pragma (#x)
-#define GGML_DO_PRAGMA(x) GGML_DO_PRAGMA_(x)
-#if defined(GGML_CPU_GENERIC) || defined(__HIPCC__)
-// Note for Apple targets:
-// - clang: aliases are not supported on darwin
-// - all native kernels need to be implemented in both x86 and arm files
-// - on iOS, tvOS, and visionOS, if cmake cannot determine the target architecture, all `_generic` names are replaced by defines
-# define GGML_WEAK_ALIAS(name, alias)
-#elif defined(__GNUC__)
-// GCC/Clang on *nix
-# define GGML_WEAK_ALIAS(name, alias) GGML_DO_PRAGMA(weak name = alias) // NOLINT
-#elif defined(_MSC_VER) && defined(_WIN64)
-// MSVC
-// Note: C name mangling varies across different calling conventions
-// see https://learn.microsoft.com/en-us/cpp/build/reference/decorated-names?view=msvc-170
-# define GGML_WEAK_ALIAS(name, alias) GGML_DO_PRAGMA(comment(linker, "/alternatename:" #name "=" #alias))
-#elif defined(_MSC_VER) && defined(WIN32)
-// ref: https://github.com/ggml-org/whisper.cpp/pull/3239#issuecomment-2958224591
-# define GGML_WEAK_ALIAS(name, alias) GGML_DO_PRAGMA(comment(linker, "/alternatename:_" #name "=_" #alias))
-#else
-# error "Unsupported compiler for GGML_WEAK_ALIAS"
-#endif
-
-#define GGML_CPU_NATIVE_IMPL(name) GGML_WEAK_ALIAS(name, name ## _generic)
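Review note: this hunk drops the GGML_WEAK_ALIAS machinery in favour of the rename-by-define fallback introduced by the new arch-fallback.h. A minimal sketch of that pattern, with hypothetical names rather than the actual ggml symbols:

```cpp
// When a port has no native kernel, a per-arch header simply renames the
// generic implementation to the public name, so no weak-symbol or linker
// support is required. HAVE_NATIVE_DOT_F32 is a made-up feature macro.
#if !defined(HAVE_NATIVE_DOT_F32)
#define dot_f32_generic dot_f32   // compile the generic body under the public name
#endif

// generic implementation (always present)
float dot_f32_generic(const float * a, const float * b, int n) {
    float s = 0.0f;
    for (int i = 0; i < n; ++i) {
        s += a[i] * b[i];
    }
    return s;
}

int main() {
    const float a[4] = {1, 2, 3, 4};
    const float b[4] = {1, 1, 1, 1};
    // resolves to the generic body unless a native dot_f32 is provided elsewhere
    return dot_f32(a, b, 4) == 10.0f ? 0 : 1;
}
```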
@@ -559,6 +559,14 @@ void ggml_barrier(struct ggml_threadpool * tp) {
 #endif
 }
 
+void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value) {
+    atomic_store_explicit(&tp->current_chunk, value, memory_order_relaxed);
+}
+
+int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value) {
+    return atomic_fetch_add_explicit(&tp->current_chunk, value, memory_order_relaxed);
+}
+
 #if defined(__gnu_linux__)
 static cpu_set_t ggml_get_numa_affinity(void) {
     cpu_set_t cpuset;
@@ -53,7 +53,6 @@
 #include "ggml-cpu-impl.h"
 #include "ggml-quants.h"
 
-#include <atomic>
 #include <array>
 #include <type_traits>
 
@@ -394,8 +393,6 @@ class tinyBLAS {
 
     template <int RM, int RN, int BM>
     NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {
-        static std::atomic<int64_t> current_chunk;
-
         GGML_ASSERT(m % (RM * BM) == 0);
         const int64_t ytiles = m / (RM * BM);
         const int64_t xtiles = (n + RN -1) / RN;
@@ -410,7 +407,7 @@ class tinyBLAS {
         if (params->ith == 0) {
             GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
             // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
-            std::atomic_store_explicit(&current_chunk, (int64_t)params->nth, std::memory_order_relaxed);
+            ggml_threadpool_chunk_set(params->threadpool, params->nth);
         }
 
         ggml_barrier(params->threadpool);
@@ -439,8 +436,7 @@ class tinyBLAS {
             GGML_ASSERT(jj == jj2);
         }
 
-        // next step.
-        job = std::atomic_fetch_add_explicit(&current_chunk, (int64_t)1, std::memory_order_relaxed);
+        job = ggml_threadpool_chunk_add(params->threadpool, 1);
     }
 
     ggml_barrier(params->threadpool);
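Review note: the sgemm hunks above replace a function-local `static std::atomic` with the counter shared through the threadpool, so every user of the pool draws work chunks from one place. A self-contained sketch of the scheduling pattern involved (plain std::atomic standing in for the ggml threadpool state; all names are illustrative):

```cpp
#include <atomic>
#include <thread>
#include <vector>

static std::atomic<int> current_chunk{0};   // stands in for threadpool->current_chunk

static void worker(int ith, int nchunks, std::vector<int> & done) {
    // each thread starts on its own index, then keeps taking whatever chunk the
    // shared counter hands out next, so faster threads simply process more chunks
    for (int job = ith; job < nchunks; job = current_chunk.fetch_add(1, std::memory_order_relaxed)) {
        done[job] += 1;   // "process" chunk `job`
    }
}

int main() {
    const int nth = 4, nchunks = 64;
    std::vector<int> done(nchunks, 0);

    // the first unprocessed chunk is nth, mirroring ggml_threadpool_chunk_set(tp, nth)
    current_chunk.store(nth, std::memory_order_relaxed);

    std::vector<std::thread> threads;
    for (int ith = 0; ith < nth; ++ith) {
        threads.emplace_back(worker, ith, nchunks, std::ref(done));
    }
    for (auto & t : threads) {
        t.join();
    }

    // every chunk was claimed exactly once
    for (int v : done) {
        if (v != 1) return 1;
    }
    return 0;
}
```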
@ -5,6 +5,8 @@
|
||||||
#include "ggml-quants.h"
|
#include "ggml-quants.h"
|
||||||
#include "quants.h"
|
#include "quants.h"
|
||||||
|
|
||||||
|
#include "arch-fallback.h"
|
||||||
|
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include <float.h>
|
#include <float.h>
|
||||||
|
|
@ -38,12 +40,10 @@ void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
|
||||||
void quantize_row_q8_0_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
|
void quantize_row_q8_0_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
|
||||||
quantize_row_q8_0_ref(x, y, k);
|
quantize_row_q8_0_ref(x, y, k);
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(quantize_row_q8_0)
|
|
||||||
|
|
||||||
void quantize_row_q8_1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
|
void quantize_row_q8_1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
|
||||||
quantize_row_q8_1_ref(x, y, k);
|
quantize_row_q8_1_ref(x, y, k);
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(quantize_row_q8_1)
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// 2-6 bit quantization in super-blocks
|
// 2-6 bit quantization in super-blocks
|
||||||
|
|
@ -104,7 +104,6 @@ void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy,
|
||||||
void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
|
void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
|
||||||
quantize_row_q8_K_ref(x, y, k);
|
quantize_row_q8_K_ref(x, y, k);
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(quantize_row_q8_K)
|
|
||||||
|
|
||||||
//===================================== Dot products =================================
|
//===================================== Dot products =================================
|
||||||
|
|
||||||
|
|
@ -143,7 +142,6 @@ void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
|
||||||
|
|
||||||
*s = sumf;
|
*s = sumf;
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_vec_dot_q4_0_q8_0)
|
|
||||||
|
|
||||||
// TODO: add WASM SIMD
|
// TODO: add WASM SIMD
|
||||||
void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||||
|
|
@ -181,7 +179,6 @@ void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c
|
||||||
|
|
||||||
*s = sumf;
|
*s = sumf;
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_vec_dot_q4_1_q8_1)
|
|
||||||
|
|
||||||
void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||||
const int qk = QK8_0;
|
const int qk = QK8_0;
|
||||||
|
|
@ -225,7 +222,6 @@ void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
|
||||||
|
|
||||||
*s = sumf;
|
*s = sumf;
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_vec_dot_q5_0_q8_0)
|
|
||||||
|
|
||||||
void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||||
const int qk = QK8_1;
|
const int qk = QK8_1;
|
||||||
|
|
@ -269,7 +265,6 @@ void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c
|
||||||
|
|
||||||
*s = sumf;
|
*s = sumf;
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_vec_dot_q5_1_q8_1)
|
|
||||||
|
|
||||||
void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||||
const int qk = QK8_0;
|
const int qk = QK8_0;
|
||||||
|
|
@ -300,7 +295,6 @@ void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
|
||||||
|
|
||||||
*s = sumf;
|
*s = sumf;
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_vec_dot_q8_0_q8_0)
|
|
||||||
|
|
||||||
void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||||
assert(nrc == 1);
|
assert(nrc == 1);
|
||||||
|
|
@ -353,7 +347,6 @@ void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||||
|
|
||||||
*s = sumf;
|
*s = sumf;
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_vec_dot_tq1_0_q8_K)
|
|
||||||
|
|
||||||
void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||||
assert(nrc == 1);
|
assert(nrc == 1);
|
||||||
|
|
@ -386,7 +379,6 @@ void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||||
|
|
||||||
*s = sumf;
|
*s = sumf;
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_vec_dot_tq2_0_q8_K)
|
|
||||||
|
|
||||||
void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||||
assert(nrc == 1);
|
assert(nrc == 1);
|
||||||
|
|
@ -439,7 +431,6 @@ void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
|
||||||
}
|
}
|
||||||
*s = sumf;
|
*s = sumf;
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_vec_dot_q2_K_q8_K)
|
|
||||||
|
|
||||||
void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
|
@ -519,7 +510,6 @@ void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
|
||||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||||
*s = sumf;
|
*s = sumf;
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_vec_dot_q3_K_q8_K)
|
|
||||||
|
|
||||||
void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
|
@ -595,7 +585,6 @@ void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
|
||||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||||
*s = sumf;
|
*s = sumf;
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_vec_dot_q4_K_q8_K)
|
|
||||||
|
|
||||||
void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
|
@ -676,7 +665,6 @@ void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
|
||||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||||
*s = sumf;
|
*s = sumf;
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_vec_dot_q5_K_q8_K)
|
|
||||||
|
|
||||||
void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
|
@ -732,7 +720,6 @@ void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
|
||||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||||
*s = sumf;
|
*s = sumf;
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_vec_dot_q6_K_q8_K)
|
|
||||||
|
|
||||||
void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
|
@ -775,7 +762,6 @@ void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs
|
||||||
}
|
}
|
||||||
*s = 0.125f * sumf;
|
*s = 0.125f * sumf;
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_vec_dot_iq2_xxs_q8_K)
|
|
||||||
|
|
||||||
void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
|
@ -826,7 +812,6 @@ void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||||
}
|
}
|
||||||
*s = 0.125f * sumf;
|
*s = 0.125f * sumf;
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_vec_dot_iq2_xs_q8_K)
|
|
||||||
|
|
||||||
void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
|
@ -879,7 +864,6 @@ void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||||
|
|
||||||
*s = 0.125f * sumf;
|
*s = 0.125f * sumf;
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_vec_dot_iq2_s_q8_K)
|
|
||||||
|
|
||||||
void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
|
@ -924,7 +908,6 @@ void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs
|
||||||
}
|
}
|
||||||
*s = 0.25f * sumf;
|
*s = 0.25f * sumf;
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_vec_dot_iq3_xxs_q8_K)
|
|
||||||
|
|
||||||
void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
|
@ -981,7 +964,6 @@ void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||||
}
|
}
|
||||||
*s = sumf;
|
*s = sumf;
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_vec_dot_iq3_s_q8_K)
|
|
||||||
|
|
||||||
void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
|
@ -1025,7 +1007,6 @@ void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||||
|
|
||||||
*s = sumf;
|
*s = sumf;
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_vec_dot_iq1_s_q8_K)
|
|
||||||
|
|
||||||
void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||||
assert(n % QK_K == 0);
|
assert(n % QK_K == 0);
|
||||||
|
|
@ -1087,7 +1068,6 @@ void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||||
|
|
||||||
*s = sumf;
|
*s = sumf;
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_vec_dot_iq1_m_q8_K)
|
|
||||||
|
|
||||||
void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||||
assert(nrc == 1);
|
assert(nrc == 1);
|
||||||
|
|
@ -1117,7 +1097,6 @@ void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||||
}
|
}
|
||||||
*s = sumf;
|
*s = sumf;
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_vec_dot_iq4_nl_q8_0)
|
|
||||||
|
|
||||||
void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||||
assert(nrc == 1);
|
assert(nrc == 1);
|
||||||
|
|
@ -1164,7 +1143,6 @@ void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||||
}
|
}
|
||||||
*s = sumf;
|
*s = sumf;
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_vec_dot_iq4_xs_q8_K)
|
|
||||||
|
|
||||||
// ============================ 4-bit non-linear quants
|
// ============================ 4-bit non-linear quants
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -84,33 +84,6 @@ void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||||
void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
|
||||||
#if defined(GGML_CPU_GENERIC)
|
|
||||||
#define quantize_row_q8_0_generic quantize_row_q8_0
|
|
||||||
#define quantize_row_q8_1_generic quantize_row_q8_1
|
|
||||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
|
||||||
#define ggml_vec_dot_q4_0_q8_0_generic ggml_vec_dot_q4_0_q8_0
|
|
||||||
#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
|
|
||||||
#define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0
|
|
||||||
#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
|
|
||||||
#define ggml_vec_dot_q8_0_q8_0_generic ggml_vec_dot_q8_0_q8_0
|
|
||||||
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
|
||||||
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
|
||||||
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
|
|
||||||
#define ggml_vec_dot_q3_K_q8_K_generic ggml_vec_dot_q3_K_q8_K
|
|
||||||
#define ggml_vec_dot_q4_K_q8_K_generic ggml_vec_dot_q4_K_q8_K
|
|
||||||
#define ggml_vec_dot_q5_K_q8_K_generic ggml_vec_dot_q5_K_q8_K
|
|
||||||
#define ggml_vec_dot_q6_K_q8_K_generic ggml_vec_dot_q6_K_q8_K
|
|
||||||
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
|
|
||||||
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
|
|
||||||
#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
|
|
||||||
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
|
|
||||||
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
|
|
||||||
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
|
|
||||||
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
|
||||||
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
|
|
||||||
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
|
|
@ -8,6 +8,8 @@
|
||||||
#include "ggml-cpu-impl.h"
|
#include "ggml-cpu-impl.h"
|
||||||
#include "traits.h"
|
#include "traits.h"
|
||||||
|
|
||||||
|
#include "arch-fallback.h"
|
||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
|
@ -83,7 +85,6 @@ void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GG
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_quantize_mat_q8_0_4x4)
|
|
||||||
|
|
||||||
void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
|
void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
|
||||||
assert(QK8_0 == 32);
|
assert(QK8_0 == 32);
|
||||||
|
|
@ -122,7 +123,6 @@ void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GG
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_quantize_mat_q8_0_4x8)
|
|
||||||
|
|
||||||
void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
|
void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
|
||||||
assert(QK_K == 256);
|
assert(QK_K == 256);
|
||||||
|
|
@ -174,7 +174,6 @@ void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GG
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_quantize_mat_q8_K_4x8)
|
|
||||||
|
|
||||||
} // extern "C"
|
} // extern "C"
|
||||||
|
|
||||||
|
|
@ -244,7 +243,6 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||||
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_gemv_q4_0_4x4_q8_0)
|
|
||||||
|
|
||||||
void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||||
const int qk = QK8_0;
|
const int qk = QK8_0;
|
||||||
|
|
@ -289,7 +287,6 @@ void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||||
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_gemv_q4_0_4x8_q8_0)
|
|
||||||
|
|
||||||
void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||||
const int qk = QK8_0;
|
const int qk = QK8_0;
|
||||||
|
|
@ -336,7 +333,6 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_gemv_q4_0_8x8_q8_0)
|
|
||||||
|
|
||||||
void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||||
const int qk = QK_K;
|
const int qk = QK_K;
|
||||||
|
|
@ -415,7 +411,6 @@ void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_gemv_q4_K_8x8_q8_K)
|
|
||||||
|
|
||||||
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||||
const int qk = QK8_0;
|
const int qk = QK8_0;
|
||||||
|
|
@ -462,7 +457,6 @@ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_gemv_iq4_nl_4x4_q8_0)
|
|
||||||
|
|
||||||
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||||
const int qk = QK8_0;
|
const int qk = QK8_0;
|
||||||
|
|
@ -519,7 +513,6 @@ void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_gemm_q4_0_4x4_q8_0)
|
|
||||||
|
|
||||||
void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||||
const int qk = QK8_0;
|
const int qk = QK8_0;
|
||||||
|
|
@ -574,7 +567,6 @@ void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_gemm_q4_0_4x8_q8_0)
|
|
||||||
|
|
||||||
void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||||
const int qk = QK8_0;
|
const int qk = QK8_0;
|
||||||
|
|
@ -629,7 +621,6 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_gemm_q4_0_8x8_q8_0)
|
|
||||||
|
|
||||||
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||||
const int qk = QK_K;
|
const int qk = QK_K;
|
||||||
|
|
@ -719,7 +710,6 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_gemm_q4_K_8x8_q8_K)
|
|
||||||
|
|
||||||
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||||
const int qk = QK8_0;
|
const int qk = QK8_0;
|
||||||
|
|
@ -776,7 +766,6 @@ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
GGML_CPU_NATIVE_IMPL(ggml_gemm_iq4_nl_4x4_q8_0)
|
|
||||||
|
|
||||||
} // extern "C"
|
} // extern "C"
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -64,10 +64,6 @@ static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wro
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Workaround for clang:
|
|
||||||
// clang++ complains: ``error: call to 'ggml_gemm_q4_0_4x4_q8_0' is ambiguous''
|
|
||||||
// repro: https://godbolt.org/z/oKdeWKonM (ICE), https://godbolt.org/z/1szq6P36v (ambiguous call)
|
|
||||||
#if defined(GGML_CPU_CLANG_WORKAROUND) || !(defined(__GNUC__) && defined(__clang__)) || defined(__HIPCC__)
|
|
||||||
void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||||
void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||||
void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||||
|
|
@ -81,7 +77,6 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||||
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
#endif // !defined(__clang__)
|
|
||||||
|
|
||||||
// Native implementations
|
// Native implementations
|
||||||
void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||||
|
|
@ -98,22 +93,6 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||||
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||||
|
|
||||||
#if defined(GGML_CPU_GENERIC)
|
|
||||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
|
||||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
|
||||||
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
|
||||||
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
|
||||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
|
||||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
|
||||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
|
||||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
|
||||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
|
||||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
|
||||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
|
||||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
|
||||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if defined(__cplusplus)
|
#if defined(__cplusplus)
|
||||||
} // extern "C"
|
} // extern "C"
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
|
|
@ -944,10 +944,8 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
|
||||||
for (int i = 0; i < offset; ++i) { \
|
for (int i = 0; i < offset; ++i) { \
|
||||||
x[i] = vec_add(x[i], x[offset + i]); \
|
x[i] = vec_add(x[i], x[offset + i]); \
|
||||||
} \
|
} \
|
||||||
res = vec_extract(x[0], 0) + \
|
float32x4_t tmp = x[0] + vec_reve(x[0]); \
|
||||||
vec_extract(x[0], 1) + \
|
res = tmp[0] + tmp[1]; \
|
||||||
vec_extract(x[0], 2) + \
|
|
||||||
vec_extract(x[0], 3); \
|
|
||||||
}
|
}
|
||||||
|
|
||||||
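// Illustrative sketch (not part of this change): the reverse-and-add reduction above
// works because x + reverse(x) = { x0+x3, x1+x2, x2+x1, x3+x0 }, so the full horizontal
// sum ends up split across the first two lanes. A plain scalar equivalent, with a
// made-up helper name:
static inline float hsum4_example(const float v[4]) {
    const float t0 = v[0] + v[3]; // lane 0 of x + reverse(x)
    const float t1 = v[1] + v[2]; // lane 1 of x + reverse(x)
    return t0 + t1;               // == v[0] + v[1] + v[2] + v[3]
}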
#define GGML_F32_VEC GGML_F32x4
|
#define GGML_F32_VEC GGML_F32x4
|
||||||
|
|
|
||||||
|
|
@ -207,9 +207,9 @@ typedef float2 dfloat2;
|
||||||
#define FP16_MMA_AVAILABLE
|
#define FP16_MMA_AVAILABLE
|
||||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||||
|
|
||||||
#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4))
|
#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
|
||||||
#define FP16_MMA_AVAILABLE
|
#define FP16_MMA_AVAILABLE
|
||||||
#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4))
|
#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
|
||||||
|
|
||||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
|
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
|
||||||
#define NEW_MMA_AVAILABLE
|
#define NEW_MMA_AVAILABLE
|
||||||
|
|
@ -262,11 +262,11 @@ static bool cp_async_available(const int cc) {
|
||||||
}
|
}
|
||||||
|
|
||||||
static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
|
static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
|
||||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
|
||||||
return __AMDGCN_WAVEFRONT_SIZE;
|
return 64;
|
||||||
#else
|
#else
|
||||||
return 32;
|
return 32;
|
||||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
|
||||||
}
|
}
|
||||||
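// Illustrative usage sketch (hypothetical kernel, not from this change): because the
// warp size is a constexpr, lane and warp indices can be derived at compile time
// without querying the device, and loops over warp_size can be fully unrolled.
static __global__ void warp_index_example(int * lane_out, int * warp_out) {
    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
    lane_out[threadIdx.x] = threadIdx.x % warp_size; // 0..31, or 0..63 on GFX8/GFX9
    warp_out[threadIdx.x] = threadIdx.x / warp_size;
}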
|
|
||||||
[[noreturn]]
|
[[noreturn]]
|
||||||
|
|
|
||||||
|
|
@ -2664,7 +2664,9 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
||||||
ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft) || (integrated && ggml_backend_buft_is_cuda_host(node->src[j]->buffer->buft)));
|
ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft) || (integrated && ggml_backend_buft_is_cuda_host(node->src[j]->buffer->buft)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#else
|
||||||
|
GGML_UNUSED(integrated);
|
||||||
|
#endif // NDEBUG
|
||||||
|
|
||||||
bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
|
bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
|
||||||
if (!ok) {
|
if (!ok) {
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,8 @@ __global__ void __launch_bounds__(splitD, 2)
|
||||||
float * __restrict__ dst, const int64_t L) {
|
float * __restrict__ dst, const int64_t L) {
|
||||||
GGML_UNUSED(src1_nb0);
|
GGML_UNUSED(src1_nb0);
|
||||||
GGML_UNUSED(src2_nb0);
|
GGML_UNUSED(src2_nb0);
|
||||||
|
|
||||||
|
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||||
const int bidx = blockIdx.x; // split along B
|
const int bidx = blockIdx.x; // split along B
|
||||||
const int bidy = blockIdx.y; // split along D
|
const int bidy = blockIdx.y; // split along D
|
||||||
const int tid = threadIdx.x;
|
const int tid = threadIdx.x;
|
||||||
|
|
@ -44,16 +46,16 @@ __global__ void __launch_bounds__(splitD, 2)
|
||||||
if (N == 16) {
|
if (N == 16) {
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (size_t i = 0; i < splitD / 4; i += 2) {
|
for (size_t i = 0; i < splitD / 4; i += 2) {
|
||||||
float value = A_block[(wid * warpSize + i) * stride_A + wtid];
|
float value = A_block[(wid * warp_size + i) * stride_A + wtid];
|
||||||
// todo: bank conflict
|
// todo: bank conflict
|
||||||
// I am always confused about how to use the swizzling method to solve
|
// I am always confused about how to use the swizzling method to solve
|
||||||
// bank conflicts. Hoping somebody can tell me.
|
// bank conflicts. Hoping somebody can tell me.
|
||||||
smem_A[(wid * warpSize + i) * stride_sA + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
|
smem_A[(wid * warp_size + i) * stride_sA + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
|
||||||
}
|
}
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (size_t i = 0; i < splitD / 4; i += 2) {
|
for (size_t i = 0; i < splitD / 4; i += 2) {
|
||||||
float value = s0_block[(wid * warpSize + i) * stride_s0 + wtid];
|
float value = s0_block[(wid * warp_size + i) * stride_s0 + wtid];
|
||||||
smem_s0[(wid * warpSize + i) * stride_ss0 + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
|
smem_s0[(wid * warp_size + i) * stride_ss0 + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -113,6 +113,10 @@ if (GGML_HIP_ROCWMMA_FATTN)
|
||||||
add_compile_definitions(GGML_HIP_ROCWMMA_FATTN)
|
add_compile_definitions(GGML_HIP_ROCWMMA_FATTN)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
if (GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 OR ${hip_VERSION} VERSION_GREATER_EQUAL 7.0)
|
||||||
|
add_compile_definitions(GGML_HIP_ROCWMMA_FATTN_GFX12)
|
||||||
|
endif()
|
||||||
|
|
||||||
if (NOT GGML_CUDA_FA)
|
if (NOT GGML_CUDA_FA)
|
||||||
add_compile_definitions(GGML_CUDA_NO_FA)
|
add_compile_definitions(GGML_CUDA_NO_FA)
|
||||||
endif()
|
endif()
|
||||||
|
|
|
||||||
|
|
@ -44,21 +44,22 @@ if (GGML_METAL_EMBED_LIBRARY)
|
||||||
set(METALLIB_SOURCE_EMBED_TMP "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal.tmp")
|
set(METALLIB_SOURCE_EMBED_TMP "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal.tmp")
|
||||||
|
|
||||||
add_custom_command(
|
add_custom_command(
|
||||||
OUTPUT ${METALLIB_EMBED_ASM}
|
OUTPUT "${METALLIB_EMBED_ASM}"
|
||||||
COMMAND echo "Embedding Metal library"
|
COMMAND echo "Embedding Metal library"
|
||||||
COMMAND sed -e '/__embed_ggml-common.h__/r ${METALLIB_COMMON}' -e '/__embed_ggml-common.h__/d' < ${METALLIB_SOURCE} > ${METALLIB_SOURCE_EMBED_TMP}
|
COMMAND sed -e "/__embed_ggml-common.h__/r ${METALLIB_COMMON}" -e "/__embed_ggml-common.h__/d" < "${METALLIB_SOURCE}" > "${METALLIB_SOURCE_EMBED_TMP}"
|
||||||
COMMAND sed -e '/\#include \"ggml-metal-impl.h\"/r ${METALLIB_IMPL}' -e '/\#include \"ggml-metal-impl.h\"/d' < ${METALLIB_SOURCE_EMBED_TMP} > ${METALLIB_SOURCE_EMBED}
|
COMMAND sed -e "/\#include \"ggml-metal-impl.h\"/r ${METALLIB_IMPL}" -e "/\#include \"ggml-metal-impl.h\"/d" < "${METALLIB_SOURCE_EMBED_TMP}" > "${METALLIB_SOURCE_EMBED}"
|
||||||
COMMAND echo ".section __DATA,__ggml_metallib" > ${METALLIB_EMBED_ASM}
|
COMMAND echo ".section __DATA,__ggml_metallib" > "${METALLIB_EMBED_ASM}"
|
||||||
COMMAND echo ".globl _ggml_metallib_start" >> ${METALLIB_EMBED_ASM}
|
COMMAND echo ".globl _ggml_metallib_start" >> "${METALLIB_EMBED_ASM}"
|
||||||
COMMAND echo "_ggml_metallib_start:" >> ${METALLIB_EMBED_ASM}
|
COMMAND echo "_ggml_metallib_start:" >> "${METALLIB_EMBED_ASM}"
|
||||||
COMMAND echo ".incbin \\\"${METALLIB_SOURCE_EMBED}\\\"" >> ${METALLIB_EMBED_ASM}
|
COMMAND echo .incbin "\"${METALLIB_SOURCE_EMBED}\"" >> "${METALLIB_EMBED_ASM}"
|
||||||
COMMAND echo ".globl _ggml_metallib_end" >> ${METALLIB_EMBED_ASM}
|
COMMAND echo ".globl _ggml_metallib_end" >> "${METALLIB_EMBED_ASM}"
|
||||||
COMMAND echo "_ggml_metallib_end:" >> ${METALLIB_EMBED_ASM}
|
COMMAND echo "_ggml_metallib_end:" >> "${METALLIB_EMBED_ASM}"
|
||||||
DEPENDS ../ggml-common.h ggml-metal.metal ggml-metal-impl.h
|
DEPENDS ../ggml-common.h ggml-metal.metal ggml-metal-impl.h
|
||||||
COMMENT "Generate assembly for embedded Metal library"
|
COMMENT "Generate assembly for embedded Metal library"
|
||||||
|
VERBATIM
|
||||||
)
|
)
|
||||||
|
|
||||||
target_sources(ggml-metal PRIVATE ${METALLIB_EMBED_ASM})
|
target_sources(ggml-metal PRIVATE "${METALLIB_EMBED_ASM}")
|
||||||
else()
|
else()
|
||||||
if (GGML_METAL_SHADER_DEBUG)
|
if (GGML_METAL_SHADER_DEBUG)
|
||||||
# custom command to do the following:
|
# custom command to do the following:
|
||||||
|
|
|
||||||
|
|
@ -80,6 +80,7 @@ set(GGML_OPENCL_KERNELS
|
||||||
mul_mv_q4_0_f32_1d_8x_flat
|
mul_mv_q4_0_f32_1d_8x_flat
|
||||||
mul_mv_q4_0_f32_1d_16x_flat
|
mul_mv_q4_0_f32_1d_16x_flat
|
||||||
mul_mv_q6_k
|
mul_mv_q6_k
|
||||||
|
mul_mv_id_q4_0_f32_8x_flat
|
||||||
mul
|
mul
|
||||||
norm
|
norm
|
||||||
relu
|
relu
|
||||||
|
|
|
||||||
|
|
@ -321,6 +321,7 @@ struct ggml_backend_opencl_context {
|
||||||
cl_program program_upscale;
|
cl_program program_upscale;
|
||||||
cl_program program_concat;
|
cl_program program_concat;
|
||||||
cl_program program_tsembd;
|
cl_program program_tsembd;
|
||||||
|
cl_program program_mul_mv_id_q4_0_f32_8x_flat;
|
||||||
|
|
||||||
cl_kernel kernel_add, kernel_add_row;
|
cl_kernel kernel_add, kernel_add_row;
|
||||||
cl_kernel kernel_mul, kernel_mul_row;
|
cl_kernel kernel_mul, kernel_mul_row;
|
||||||
|
|
@ -366,6 +367,7 @@ struct ggml_backend_opencl_context {
|
||||||
cl_kernel kernel_concat_f32_contiguous;
|
cl_kernel kernel_concat_f32_contiguous;
|
||||||
cl_kernel kernel_concat_f32_non_contiguous;
|
cl_kernel kernel_concat_f32_non_contiguous;
|
||||||
cl_kernel kernel_timestep_embedding;
|
cl_kernel kernel_timestep_embedding;
|
||||||
|
cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
|
||||||
|
|
||||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||||
// Transpose kernels
|
// Transpose kernels
|
||||||
|
|
@ -1112,7 +1114,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||||
GGML_LOG_CONT(".");
|
GGML_LOG_CONT(".");
|
||||||
}
|
}
|
||||||
|
|
||||||
// repeat
|
// repeat
|
||||||
{
|
{
|
||||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||||
const std::string kernel_src {
|
const std::string kernel_src {
|
||||||
|
|
@ -1256,6 +1258,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// mul_mv_id_q4_0_f32_8x_flat
|
||||||
|
{
|
||||||
|
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||||
|
const std::string kernel_src {
|
||||||
|
#include "mul_mv_id_q4_0_f32_8x_flat.cl.h"
|
||||||
|
};
|
||||||
|
#else
|
||||||
|
const std::string kernel_src = read_file("mul_mv_id_q4_0_f32_8x_flat.cl");
|
||||||
|
#endif
|
||||||
|
backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat =
|
||||||
|
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||||
|
|
||||||
|
CL_CHECK((backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat, "kernel_mul_mv_id_q4_0_f32_8x_flat", &err), err));
|
||||||
|
GGML_LOG_CONT(".");
|
||||||
|
}
|
||||||
|
|
||||||
// Adreno kernels
|
// Adreno kernels
|
||||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||||
// transpose
|
// transpose
|
||||||
|
|
@ -2178,6 +2196,13 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||||
return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
|
return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
|
case GGML_OP_MUL_MAT_ID:
|
||||||
|
if (op->src[0]->type == GGML_TYPE_Q4_0) {
|
||||||
|
if (op->src[1]->type == GGML_TYPE_F32) {
|
||||||
|
return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
case GGML_OP_RESHAPE:
|
case GGML_OP_RESHAPE:
|
||||||
case GGML_OP_VIEW:
|
case GGML_OP_VIEW:
|
||||||
case GGML_OP_PERMUTE:
|
case GGML_OP_PERMUTE:
|
||||||
|
|
@ -5536,6 +5561,136 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||||
|
GGML_ASSERT(src0);
|
||||||
|
GGML_ASSERT(src0->extra);
|
||||||
|
GGML_ASSERT(src1);
|
||||||
|
GGML_ASSERT(src1->extra);
|
||||||
|
GGML_ASSERT(dst);
|
||||||
|
GGML_ASSERT(dst->extra);
|
||||||
|
|
||||||
|
const ggml_tensor * src2 = dst->src[2];
|
||||||
|
GGML_ASSERT(src2);
|
||||||
|
GGML_ASSERT(src2->extra);
|
||||||
|
|
||||||
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||||
|
cl_command_queue queue = backend_ctx->queue;
|
||||||
|
|
||||||
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
||||||
|
ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
|
||||||
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||||
|
|
||||||
|
cl_ulong offset1 = extra1->offset + src1->view_offs;
|
||||||
|
cl_ulong offset2 = extra2->offset + src2->view_offs;
|
||||||
|
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||||
|
|
||||||
|
#ifdef GGML_OPENCL_SOA_Q
|
||||||
|
ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
const int ne00 = src0->ne[0];
|
||||||
|
const int ne01 = src0->ne[1];
|
||||||
|
const int ne02 = src0->ne[2];
|
||||||
|
const int ne03 = src0->ne[3];
|
||||||
|
|
||||||
|
const cl_ulong nb00 = src0->nb[0];
|
||||||
|
const cl_ulong nb02 = src0->nb[2];
|
||||||
|
|
||||||
|
const int ne10 = src1->ne[0];
|
||||||
|
const int ne11 = src1->ne[1];
|
||||||
|
const int ne12 = src1->ne[2];
|
||||||
|
const int ne13 = src1->ne[3];
|
||||||
|
|
||||||
|
const cl_ulong nb11 = src1->nb[1];
|
||||||
|
const cl_ulong nb12 = src1->nb[2];
|
||||||
|
|
||||||
|
const int ne20 = src2->ne[0];
|
||||||
|
const int ne21 = src2->ne[1];
|
||||||
|
|
||||||
|
const cl_ulong nb21 = src2->nb[1];
|
||||||
|
|
||||||
|
const int ne0 = dst->ne[0];
|
||||||
|
const int ne1 = dst->ne[1];
|
||||||
|
|
||||||
|
const int r2 = ne12/ne02;
|
||||||
|
const int r3 = ne13/ne03;
|
||||||
|
const int dst_rows = ne20*ne21; // ne20 = n_used_experts, ne21 = n_rows
|
||||||
|
|
||||||
|
GGML_ASSERT(ne00 == ne10);
|
||||||
|
|
||||||
|
int sgs = 32; // subgroup size
|
||||||
|
int nsg = 1; // number of subgroups
|
||||||
|
int nrows = 1; // number of rows in src1
|
||||||
|
int ndst = 4; // number of values produced by each subgroup
|
||||||
|
|
||||||
|
cl_kernel kernel;
|
||||||
|
|
||||||
|
// subgroup mat vec
|
||||||
|
switch (src0->type) {
|
||||||
|
case GGML_TYPE_Q4_0: {
|
||||||
|
kernel = backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat;
|
||||||
|
|
||||||
|
if (backend_ctx->gpu_family == INTEL) {
|
||||||
|
sgs = 16;
|
||||||
|
nsg = 1;
|
||||||
|
ndst = 8;
|
||||||
|
} else if (backend_ctx->gpu_family == ADRENO) {
|
||||||
|
sgs = 64;
|
||||||
|
nsg = 1;
|
||||||
|
ndst = 8;
|
||||||
|
} else {
|
||||||
|
GGML_ASSERT(false && "TODO: Unknown GPU");
|
||||||
|
}
|
||||||
|
|
||||||
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q));
|
||||||
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d));
|
||||||
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||||
|
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||||
|
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
|
||||||
|
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
|
||||||
|
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
|
||||||
|
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
|
||||||
|
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
|
||||||
|
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
|
||||||
|
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02));
|
||||||
|
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb00));
|
||||||
|
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
|
||||||
|
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10));
|
||||||
|
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11));
|
||||||
|
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12));
|
||||||
|
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb11));
|
||||||
|
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb12));
|
||||||
|
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne20));
|
||||||
|
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne21));
|
||||||
|
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb21));
|
||||||
|
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne0));
|
||||||
|
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne1));
|
||||||
|
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r2));
|
||||||
|
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &r3));
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
GGML_ASSERT(false && "not implemented");
|
||||||
|
}
|
||||||
|
|
||||||
|
int _ne1 = 1;
|
||||||
|
int ne123 = dst_rows;
|
||||||
|
|
||||||
|
size_t global_work_size[] = {(size_t)(ne01+ndst*nsg-1)/(ndst*nsg)*sgs, (size_t)(_ne1+nrows-1)/nrows*nsg, (size_t)ne123};
|
||||||
|
size_t local_work_size[] = {(size_t)sgs, (size_t)nsg, 1};
|
||||||
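// Worked example with made-up sizes (illustrative only): on the Adreno path
// sgs = 64, nsg = 1, ndst = 8, nrows = 1, so for ne01 = 4096 expert rows and
// dst_rows = ne20*ne21 = 4 used experts * 32 rows = 128:
//   dim0 = ceil(4096 / (8*1)) * 64 = 512 * 64 = 32768   // one 64-wide subgroup per 8 output rows
//   dim1 = ceil(1 / 1) * 1 = 1
//   dim2 = 128                                           // one workgroup per (row, used-expert) pair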
|
|
||||||
|
#ifdef GGML_OPENCL_PROFILING
|
||||||
|
cl_event evt;
|
||||||
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
||||||
|
|
||||||
|
g_profiling_info.emplace_back();
|
||||||
|
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
||||||
|
#else
|
||||||
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||||
GGML_ASSERT(src0);
|
GGML_ASSERT(src0);
|
||||||
GGML_ASSERT(src0->extra);
|
GGML_ASSERT(src0->extra);
|
||||||
|
|
@ -6444,6 +6599,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
||||||
}
|
}
|
||||||
func = ggml_cl_mul_mat;
|
func = ggml_cl_mul_mat;
|
||||||
break;
|
break;
|
||||||
|
case GGML_OP_MUL_MAT_ID:
|
||||||
|
if (!any_on_device) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
func = ggml_cl_mul_mat_id;
|
||||||
|
break;
|
||||||
case GGML_OP_SCALE:
|
case GGML_OP_SCALE:
|
||||||
if (!any_on_device) {
|
if (!any_on_device) {
|
||||||
return false;
|
return false;
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,283 @@
|
||||||
|
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||||
|
|
||||||
|
#ifdef cl_intel_subgroups
|
||||||
|
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
||||||
|
#else
|
||||||
|
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef cl_intel_required_subgroup_size
|
||||||
|
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
||||||
|
#define INTEL_GPU 1
|
||||||
|
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
||||||
|
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
||||||
|
#elif defined(cl_qcom_reqd_sub_group_size)
|
||||||
|
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||||
|
#define ADRENO_GPU 1
|
||||||
|
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||||
|
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define QK4_0 32
|
||||||
|
|
||||||
|
typedef char int8_t;
|
||||||
|
typedef uchar uint8_t;
|
||||||
|
typedef short int16_t;
|
||||||
|
typedef ushort uint16_t;
|
||||||
|
typedef int int32_t;
|
||||||
|
typedef uint uint32_t;
|
||||||
|
|
||||||
|
//------------------------------------------------------------------------------
|
||||||
|
// block_q4_0
|
||||||
|
//------------------------------------------------------------------------------
|
||||||
|
struct block_q4_0
|
||||||
|
{
|
||||||
|
half d;
|
||||||
|
uint8_t qs[QK4_0 / 2];
|
||||||
|
};
|
||||||
|
|
||||||
|
// This function requires the original shuffled weights.
|
||||||
|
// As a reminder, the original weights are shuffled so that (q[0], q[16]) are
|
||||||
|
// packed together in a byte, so are (q[1], q[17]) and so on.
|
||||||
|
inline float block_q_4_0_dot_y_flat(
|
||||||
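// Illustration of the assumed layout (editor's sketch, not part of the kernel): byte k
// of a block, for k in [0,16), packs q[k] in the low nibble and q[k+16] in the high nibble:
//   byte[k] = (q[k] & 0xF) | (q[k+16] << 4)
// Reading two adjacent bytes as a ushort therefore exposes q[k], q[k+1] shifted by 0 and
// 8 bits and q[k+16], q[k+17] shifted by 4 and 12 bits, which is why the dot product
// below masks 0x000F/0x0F00/0x00F0/0xF000 and pre-divides the corresponding y values by
// 1, 256, 16 and 4096 to undo those shifts.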
|
global uchar * x,
|
||||||
|
global half * dh,
|
||||||
|
float sumy,
|
||||||
|
float16 yl,
|
||||||
|
int il
|
||||||
|
) {
|
||||||
|
float d = *dh;
|
||||||
|
global ushort * qs = ((global ushort *)x + il/2);
|
||||||
|
float acc = 0.f;
|
||||||
|
|
||||||
|
acc += yl.s0 * (qs[0] & 0x000F);
|
||||||
|
acc += yl.s1 * (qs[0] & 0x0F00);
|
||||||
|
acc += yl.s8 * (qs[0] & 0x00F0);
|
||||||
|
acc += yl.s9 * (qs[0] & 0xF000);
|
||||||
|
|
||||||
|
acc += yl.s2 * (qs[1] & 0x000F);
|
||||||
|
acc += yl.s3 * (qs[1] & 0x0F00);
|
||||||
|
acc += yl.sa * (qs[1] & 0x00F0);
|
||||||
|
acc += yl.sb * (qs[1] & 0xF000);
|
||||||
|
|
||||||
|
acc += yl.s4 * (qs[2] & 0x000F);
|
||||||
|
acc += yl.s5 * (qs[2] & 0x0F00);
|
||||||
|
acc += yl.sc * (qs[2] & 0x00F0);
|
||||||
|
acc += yl.sd * (qs[2] & 0xF000);
|
||||||
|
|
||||||
|
acc += yl.s6 * (qs[3] & 0x000F);
|
||||||
|
acc += yl.s7 * (qs[3] & 0x0F00);
|
||||||
|
acc += yl.se * (qs[3] & 0x00F0);
|
||||||
|
acc += yl.sf * (qs[3] & 0xF000);
|
||||||
|
|
||||||
|
return d * (sumy * -8.f + acc);
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// This variant outputs 8 values.
|
||||||
|
//
|
||||||
|
#undef N_DST
|
||||||
|
#undef N_SIMDGROUP
|
||||||
|
#undef N_SIMDWIDTH
|
||||||
|
|
||||||
|
#ifdef INTEL_GPU
|
||||||
|
#define N_DST 8 // each SIMD group works on 8 rows
|
||||||
|
#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
|
||||||
|
#define N_SIMDWIDTH 16 // subgroup size
|
||||||
|
#elif defined (ADRENO_GPU)
|
||||||
|
#define N_DST 8
|
||||||
|
#define N_SIMDGROUP 1
|
||||||
|
#define N_SIMDWIDTH 64
|
||||||
|
#endif
|
||||||
|
|
||||||
|
inline void mul_vec_q_n_f32_8x_flat(
|
||||||
|
global char * src0_q,
|
||||||
|
global half * src0_d,
|
||||||
|
global float * src1,
|
||||||
|
global float * dst,
|
||||||
|
int ne00,
|
||||||
|
int ne01,
|
||||||
|
int ne02,
|
||||||
|
int ne10,
|
||||||
|
int ne12,
|
||||||
|
int ne0,
|
||||||
|
int ne1,
|
||||||
|
int r2,
|
||||||
|
int r3
|
||||||
|
) {
|
||||||
|
const ulong nb = ne00/QK4_0;
|
||||||
|
|
||||||
|
int r0 = get_group_id(0);
|
||||||
|
int r1 = get_group_id(1);
|
||||||
|
int im = 0;
|
||||||
|
|
||||||
|
int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
|
||||||
|
|
||||||
|
int i12 = im%ne12;
|
||||||
|
int i13 = im/ne12;
|
||||||
|
|
||||||
|
// The number of scales is the same as the number of blocks.
|
||||||
|
ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
||||||
|
// Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
|
||||||
|
ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
|
||||||
|
|
||||||
|
global uchar * x = (global uchar *) src0_q + offset0_q;
|
||||||
|
global half * d = (global half *) src0_d + offset0_d;
|
||||||
|
global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||||
|
|
||||||
|
float16 yl;
|
||||||
|
float8 sumf = 0.f;
|
||||||
|
|
||||||
|
int ix = get_sub_group_local_id()/2;
|
||||||
|
int il = 8*(get_sub_group_local_id()%2);
|
||||||
|
|
||||||
|
global float * yb = y + ix*QK4_0 + il;
|
||||||
|
|
||||||
|
for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
|
||||||
|
float sumy = 0.f;
|
||||||
|
|
||||||
|
sumy += yb[0];
|
||||||
|
sumy += yb[1];
|
||||||
|
sumy += yb[2];
|
||||||
|
sumy += yb[3];
|
||||||
|
sumy += yb[4];
|
||||||
|
sumy += yb[5];
|
||||||
|
sumy += yb[6];
|
||||||
|
sumy += yb[7];
|
||||||
|
|
||||||
|
sumy += yb[16];
|
||||||
|
sumy += yb[17];
|
||||||
|
sumy += yb[18];
|
||||||
|
sumy += yb[19];
|
||||||
|
sumy += yb[20];
|
||||||
|
sumy += yb[21];
|
||||||
|
sumy += yb[22];
|
||||||
|
sumy += yb[23];
|
||||||
|
|
||||||
|
yl.s0 = yb[0];
|
||||||
|
yl.s1 = yb[1]/256.f;
|
||||||
|
|
||||||
|
yl.s2 = yb[2];
|
||||||
|
yl.s3 = yb[3]/256.f;
|
||||||
|
|
||||||
|
yl.s4 = yb[4];
|
||||||
|
yl.s5 = yb[5]/256.f;
|
||||||
|
|
||||||
|
yl.s6 = yb[6];
|
||||||
|
yl.s7 = yb[7]/256.f;
|
||||||
|
|
||||||
|
yl.s8 = yb[16]/16.f;
|
||||||
|
yl.s9 = yb[17]/4096.f;
|
||||||
|
|
||||||
|
yl.sa = yb[18]/16.f;
|
||||||
|
yl.sb = yb[19]/4096.f;
|
||||||
|
|
||||||
|
yl.sc = yb[20]/16.f;
|
||||||
|
yl.sd = yb[21]/4096.f;
|
||||||
|
|
||||||
|
yl.se = yb[22]/16.f;
|
||||||
|
yl.sf = yb[23]/4096.f;
|
||||||
|
|
||||||
|
sumf.s0 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il);
|
||||||
|
sumf.s1 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
|
||||||
|
sumf.s2 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
|
||||||
|
sumf.s3 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);
|
||||||
|
|
||||||
|
sumf.s4 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 4*nb*QK4_0/2, d + ib + 4*nb, sumy, yl, il);
|
||||||
|
sumf.s5 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 5*nb*QK4_0/2, d + ib + 5*nb, sumy, yl, il);
|
||||||
|
sumf.s6 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 6*nb*QK4_0/2, d + ib + 6*nb, sumy, yl, il);
|
||||||
|
sumf.s7 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 7*nb*QK4_0/2, d + ib + 7*nb, sumy, yl, il);
|
||||||
|
|
||||||
|
yb += QK4_0 * (N_SIMDWIDTH/2);
|
||||||
|
}
|
||||||
|
|
||||||
|
float8 tot = (float8)(
|
||||||
|
sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
|
||||||
|
sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
|
||||||
|
sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
|
||||||
|
sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
|
||||||
|
);
|
||||||
|
|
||||||
|
if (get_sub_group_local_id() == 0) {
|
||||||
|
if (first_row + 0 < ne01) {
|
||||||
|
dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
|
||||||
|
}
|
||||||
|
if (first_row + 1 < ne01) {
|
||||||
|
dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
|
||||||
|
}
|
||||||
|
if (first_row + 2 < ne01) {
|
||||||
|
dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
|
||||||
|
}
|
||||||
|
if (first_row + 3 < ne01) {
|
||||||
|
dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (first_row + 4 < ne01) {
|
||||||
|
dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
|
||||||
|
}
|
||||||
|
if (first_row + 5 < ne01) {
|
||||||
|
dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
|
||||||
|
}
|
||||||
|
if (first_row + 6 < ne01) {
|
||||||
|
dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
|
||||||
|
}
|
||||||
|
if (first_row + 7 < ne01) {
|
||||||
|
dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef INTEL_GPU
|
||||||
|
REQD_SUBGROUP_SIZE_16
|
||||||
|
#elif defined (ADRENO_GPU)
|
||||||
|
REQD_SUBGROUP_SIZE_64
|
||||||
|
#endif
|
||||||
|
kernel void kernel_mul_mv_id_q4_0_f32_8x_flat(
|
||||||
|
global char * src0_q,
|
||||||
|
global half * src0_d,
|
||||||
|
global float * src1,
|
||||||
|
ulong offset1,
|
||||||
|
global char * src2,
|
||||||
|
ulong offset2,
|
||||||
|
global float * dst,
|
||||||
|
ulong offsetd,
|
||||||
|
int ne00,
|
||||||
|
int ne01,
|
||||||
|
int ne02,
|
||||||
|
ulong nb00,
|
||||||
|
ulong nb02,
|
||||||
|
int ne10,
|
||||||
|
int ne11,
|
||||||
|
int ne12,
|
||||||
|
ulong nb11,
|
||||||
|
ulong nb12,
|
||||||
|
int ne20,
|
||||||
|
int ne21,
|
||||||
|
ulong nb21,
|
||||||
|
int ne0,
|
||||||
|
int ne1,
|
||||||
|
int r2,
|
||||||
|
int r3
|
||||||
|
) {
|
||||||
|
src1 = (global float *)((global char *)src1 + offset1);
|
||||||
|
src2 = (global char *)((global char *)src2 + offset2);
|
||||||
|
dst = (global float *)((global char *)dst + offsetd);
|
||||||
|
|
||||||
|
const int iid1 = get_group_id(2)/ne20;
|
||||||
|
const int idx = get_group_id(2)%ne20;
|
||||||
|
|
||||||
|
const int i02 = ((global int *)(src2 + iid1*nb21))[idx];
|
||||||
|
|
||||||
|
const int i11 = idx%ne11;
|
||||||
|
const int i12 = iid1;
|
||||||
|
|
||||||
|
const int i1 = idx;
|
||||||
|
const int i2 = i12;
|
||||||
|
|
||||||
|
global char * src0_q_cur = src0_q + (i02*nb02/nb00)*(QK4_0/2);
|
||||||
|
global half * src0_d_cur = src0_d + (i02*nb02/nb00);
|
||||||
|
global float * src1_cur = (global float *)((global char *) src1 + i11*nb11 + i12*nb12);
|
||||||
|
global float * dst_cur = dst + i1*ne0 + i2*ne1*ne0;
|
||||||
|
|
||||||
|
mul_vec_q_n_f32_8x_flat(src0_q_cur, src0_d_cur, src1_cur, dst_cur, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
|
||||||
|
}
|
||||||
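// Worked example with made-up sizes (illustrative only): with ne20 = 4 used experts per
// row, workgroup get_group_id(2) = 9 decomposes as iid1 = 9/4 = 2 (the src1 row whose
// ids are read) and idx = 9%4 = 1 (which of that row's expert slots), so
// i02 = ids[row 2][slot 1]; the kernel then offsets src0_q/src0_d to that expert's
// Q4_0 data and runs the ordinary 8-rows-per-subgroup mat-vec path.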
|
|
@ -142,7 +142,7 @@ else()
|
||||||
FetchContent_Declare(
|
FetchContent_Declare(
|
||||||
ONEMATH
|
ONEMATH
|
||||||
GIT_REPOSITORY https://github.com/uxlfoundation/oneMath.git
|
GIT_REPOSITORY https://github.com/uxlfoundation/oneMath.git
|
||||||
GIT_TAG c255b1b4c41e2ee3059455c1f96a965d6a62568a
|
GIT_TAG 8efe85f5aaebb37f1d8c503b7af66315feabf142
|
||||||
)
|
)
|
||||||
FetchContent_MakeAvailable(ONEMATH)
|
FetchContent_MakeAvailable(ONEMATH)
|
||||||
# Create alias to match with find_package targets name
|
# Create alias to match with find_package targets name
|
||||||
|
|
|
||||||
|
|
@ -513,9 +513,9 @@ constexpr size_t ceil_div(const size_t m, const size_t n) {
|
||||||
|
|
||||||
bool gpu_has_xmx(sycl::device &dev);
|
bool gpu_has_xmx(sycl::device &dev);
|
||||||
|
|
||||||
template <int N, class T> void debug_print_array(const std::string & prefix, const T array[N]) {
|
template <int N, class T> std::string debug_get_array_str(const std::string & prefix, const T array[N]) {
|
||||||
if (LIKELY(!g_ggml_sycl_debug)) {
|
if (LIKELY(!g_ggml_sycl_debug)) {
|
||||||
return;
|
return "";
|
||||||
}
|
}
|
||||||
std::stringstream ss;
|
std::stringstream ss;
|
||||||
ss << prefix << "=[";
|
ss << prefix << "=[";
|
||||||
|
|
@ -526,29 +526,26 @@ template <int N, class T> void debug_print_array(const std::string & prefix, con
|
||||||
ss << array[N - 1];
|
ss << array[N - 1];
|
||||||
}
|
}
|
||||||
ss << "]";
|
ss << "]";
|
||||||
GGML_SYCL_DEBUG("%s", ss.str().c_str());
|
return ss.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void debug_print_tensor(const std::string & prefix, const ggml_tensor * tensor,
|
inline std::string debug_get_tensor_str(const std::string &prefix,
|
||||||
const std::string & suffix = "") {
|
const ggml_tensor *tensor, const std::string &suffix = "") {
|
||||||
if (LIKELY(!g_ggml_sycl_debug)) {
|
std::stringstream ss;
|
||||||
return;
|
if (LIKELY(!g_ggml_sycl_debug)) { return ss.str(); }
|
||||||
}
|
ss << prefix.c_str() << "=";
|
||||||
GGML_SYCL_DEBUG("%s=", prefix.c_str());
|
|
||||||
if (tensor) {
|
if (tensor) {
|
||||||
GGML_SYCL_DEBUG("'%s':type=%s", tensor->name, ggml_type_name(tensor->type));
|
ss << "'" << tensor->name << "':type=" << ggml_type_name(tensor->type);
|
||||||
debug_print_array<GGML_MAX_DIMS>(";ne", tensor->ne);
|
ss << debug_get_array_str<GGML_MAX_DIMS>(";ne", tensor->ne);
|
||||||
debug_print_array<GGML_MAX_DIMS>(";nb", tensor->nb);
|
ss << debug_get_array_str<GGML_MAX_DIMS>(";nb", tensor->nb);
|
||||||
if (!ggml_is_contiguous(tensor)) {
|
|
||||||
GGML_SYCL_DEBUG(";strided");
|
if (!ggml_is_contiguous(tensor)) { ss << ";strided"; }
|
||||||
}
|
if (ggml_is_permuted(tensor)) { ss << ";permuted"; }
|
||||||
if (ggml_is_permuted(tensor)) {
|
|
||||||
GGML_SYCL_DEBUG(";permuted");
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
GGML_SYCL_DEBUG("nullptr");
|
ss << "nullptr";
|
||||||
}
|
}
|
||||||
GGML_SYCL_DEBUG("%s", suffix.c_str());
|
ss << suffix;
|
||||||
|
return ss.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Use scope_op_debug_print to log operations coming from running a model
|
// Use scope_op_debug_print to log operations coming from running a model
|
||||||
|
|
@ -564,10 +561,10 @@ struct scope_op_debug_print {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
GGML_SYCL_DEBUG("[SYCL][OP] call %s%s:", func.data(), func_suffix.data());
|
GGML_SYCL_DEBUG("[SYCL][OP] call %s%s:", func.data(), func_suffix.data());
|
||||||
debug_print_tensor(" dst", dst);
|
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" dst", dst).c_str());
|
||||||
if (dst) {
|
if (dst) {
|
||||||
for (std::size_t i = 0; i < num_src; ++i) {
|
for (std::size_t i = 0; i < num_src; ++i) {
|
||||||
debug_print_tensor("\tsrc" + std::to_string(i), dst->src[i]);
|
GGML_SYCL_DEBUG("%s", debug_get_tensor_str("\tsrc" + std::to_string(i), dst->src[i]).c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
GGML_SYCL_DEBUG("%s\n", suffix.data());
|
GGML_SYCL_DEBUG("%s\n", suffix.data());
|
||||||
|
|
|
||||||
|
|
@ -723,8 +723,7 @@ static void ggml_cpy_q4_1_q4_1(const char * cx, char * cdst, const int ne, const
|
||||||
|
|
||||||
void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try {
|
void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try {
|
||||||
// Unlike other operators, ggml_sycl_cpy takes 2 distinct tensors instead of a dst ggml_tensor and relies on its src field
|
// Unlike other operators, ggml_sycl_cpy takes 2 distinct tensors instead of a dst ggml_tensor and relies on its src field
|
||||||
scope_op_debug_print scope_dbg_print(__func__, src1, /*num_src=*/0,
|
scope_op_debug_print scope_dbg_print(__func__, src1, /*num_src=*/0, debug_get_tensor_str("\tsrc0", src0));
|
||||||
std::string(" src0 type=") + ggml_type_name(src0->type));
|
|
||||||
const int64_t ne = ggml_nelements(src0);
|
const int64_t ne = ggml_nelements(src0);
|
||||||
GGML_ASSERT(ne == ggml_nelements(src1));
|
GGML_ASSERT(ne == ggml_nelements(src1));
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -65,6 +65,9 @@ public:
|
||||||
|
|
||||||
dnnl::primitive_attr primitive_attr;
|
dnnl::primitive_attr primitive_attr;
|
||||||
primitive_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
|
primitive_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
|
||||||
|
#ifdef GGML_SYCL_F16
|
||||||
|
primitive_attr.set_fpmath_mode(dnnl::fpmath_mode::f16);
|
||||||
|
#endif
|
||||||
|
|
||||||
auto a_mem = dnnl::memory(a_in_md, eng, const_cast<void*>(a));
|
auto a_mem = dnnl::memory(a_in_md, eng, const_cast<void*>(a));
|
||||||
auto b_mem = dnnl::memory(b_in_md, eng, const_cast<void*>(b));
|
auto b_mem = dnnl::memory(b_in_md, eng, const_cast<void*>(b));
|
||||||
|
|
|
||||||
|
|
@ -347,7 +347,7 @@ static enum ggml_status
|
||||||
ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
||||||
ggml_tensor *tensor) try {
|
ggml_tensor *tensor) try {
|
||||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||||
debug_print_tensor(": tensor=", tensor, "\n");
|
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor, "\n").c_str());
|
||||||
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
|
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
|
||||||
|
|
||||||
if (tensor->view_src != NULL) {
|
if (tensor->view_src != NULL) {
|
||||||
|
|
@ -385,7 +385,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||||
const void *data, size_t offset,
|
const void *data, size_t offset,
|
||||||
size_t size) try {
|
size_t size) try {
|
||||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||||
debug_print_tensor(": tensor=", tensor);
|
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
||||||
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
||||||
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
|
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
|
||||||
ggml_sycl_set_device(ctx->device);
|
ggml_sycl_set_device(ctx->device);
|
||||||
|
|
@ -413,7 +413,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||||
void *data, size_t offset,
|
void *data, size_t offset,
|
||||||
size_t size) try {
|
size_t size) try {
|
||||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||||
debug_print_tensor(": tensor=", tensor);
|
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
||||||
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
||||||
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
|
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
|
||||||
|
|
||||||
|
|
@ -444,8 +444,8 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
|
||||||
ggml_tensor *dst) try {
|
ggml_tensor *dst) try {
|
||||||
bool is_cpy_supported = ggml_backend_buffer_is_sycl(src->buffer);
|
bool is_cpy_supported = ggml_backend_buffer_is_sycl(src->buffer);
|
||||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||||
debug_print_tensor(": dst=", dst);
|
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": dst", dst).c_str());
|
||||||
debug_print_tensor(" src=", src);
|
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" src", src).c_str());
|
||||||
GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported);
|
GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported);
|
||||||
if (is_cpy_supported) {
|
if (is_cpy_supported) {
|
||||||
ggml_backend_sycl_buffer_context * src_ctx = (ggml_backend_sycl_buffer_context *)src->buffer->context;
|
ggml_backend_sycl_buffer_context * src_ctx = (ggml_backend_sycl_buffer_context *)src->buffer->context;
|
||||||
|
|
@ -525,7 +525,7 @@ catch (sycl::exception const &exc) {
|
||||||
static void ggml_backend_sycl_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value,
|
static void ggml_backend_sycl_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value,
|
||||||
size_t offset, size_t size) {
|
size_t offset, size_t size) {
|
||||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||||
debug_print_tensor(": tensor=", tensor);
|
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
||||||
GGML_SYCL_DEBUG(" size=%zu offset=%zu value=%u\n", size, offset, value);
|
GGML_SYCL_DEBUG(" size=%zu offset=%zu value=%u\n", size, offset, value);
|
||||||
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context;
|
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context;
|
||||||
SYCL_CHECK(ggml_sycl_set_device(ctx->device));
|
SYCL_CHECK(ggml_sycl_set_device(ctx->device));
|
||||||
|
|
@ -805,7 +805,7 @@ static enum ggml_status
|
||||||
ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
||||||
ggml_tensor *tensor) try {
|
ggml_tensor *tensor) try {
|
||||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||||
debug_print_tensor(": tensor=", tensor, "\n");
|
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor, "\n").c_str());
|
||||||
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
|
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
|
||||||
|
|
||||||
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
|
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
|
||||||
|
|
@ -891,7 +891,7 @@ ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||||
ggml_tensor *tensor, const void *data,
|
ggml_tensor *tensor, const void *data,
|
||||||
size_t offset, size_t size) try {
|
size_t offset, size_t size) try {
|
||||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||||
debug_print_tensor(": tensor=", tensor);
|
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
||||||
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
||||||
// split tensors must always be set in their entirety at once
|
// split tensors must always be set in their entirety at once
|
||||||
GGML_ASSERT(offset == 0);
|
GGML_ASSERT(offset == 0);
|
||||||
|
|
@ -947,7 +947,7 @@ ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||||
const ggml_tensor *tensor, void *data,
|
const ggml_tensor *tensor, void *data,
|
||||||
size_t offset, size_t size) try {
|
size_t offset, size_t size) try {
|
||||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||||
debug_print_tensor(": tensor=", tensor);
|
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
||||||
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
||||||
// split tensors must always be set in their entirety at once
|
// split tensors must always be set in their entirety at once
|
||||||
GGML_ASSERT(offset == 0);
|
GGML_ASSERT(offset == 0);
|
||||||
|
|
@ -2127,21 +2127,18 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
||||||
const sycl::half *src1_ptr = src1->type == GGML_TYPE_F16
|
const sycl::half *src1_ptr = src1->type == GGML_TYPE_F16
|
||||||
? (const sycl::half *)src1->data + src1_padded_row_size
|
? (const sycl::half *)src1->data + src1_padded_row_size
|
||||||
: src1_as_f16.get();
|
: src1_as_f16.get();
|
||||||
ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool(), row_diff * src1_ncols);
|
|
||||||
|
|
||||||
#if GGML_SYCL_DNNL
|
#if GGML_SYCL_DNNL
|
||||||
if (!g_ggml_sycl_disable_dnn) {
|
if (!g_ggml_sycl_disable_dnn) {
|
||||||
DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ptr,
|
DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ptr,
|
||||||
DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
|
DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
|
||||||
dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>(), stream);
|
dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
|
||||||
scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
|
|
||||||
" : converting dst to fp32");
|
|
||||||
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
|
|
||||||
to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
|
ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool(), row_diff * src1_ncols);
|
||||||
|
|
||||||
const sycl::half alpha_f16 = 1.0f;
|
const sycl::half alpha_f16 = 1.0f;
|
||||||
const sycl::half beta_f16 = 0.0f;
|
const sycl::half beta_f16 = 0.0f;
|
||||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
|
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
|
||||||
|
|
@ -3866,7 +3863,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
|
||||||
const void *data, size_t offset,
|
const void *data, size_t offset,
|
||||||
size_t size) try {
|
size_t size) try {
|
||||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||||
debug_print_tensor(": tensor=", tensor);
|
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
||||||
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
||||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||||
|
|
@ -3887,7 +3884,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
|
||||||
void *data, size_t offset,
|
void *data, size_t offset,
|
||||||
size_t size) try {
|
size_t size) try {
|
||||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||||
debug_print_tensor(": tensor=", tensor);
|
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
||||||
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
||||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||||
|
|
@ -3910,8 +3907,8 @@ static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend,
|
||||||
bool is_cpy_supported = dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) &&
|
bool is_cpy_supported = dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) &&
|
||||||
ggml_backend_buffer_is_sycl(src->buffer);
|
ggml_backend_buffer_is_sycl(src->buffer);
|
||||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||||
debug_print_tensor(": dst=", dst);
|
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": dst", dst).c_str());
|
||||||
debug_print_tensor(" src=", src);
|
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" src", src).c_str());
|
||||||
GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported);
|
GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported);
|
||||||
if (is_cpy_supported) {
|
if (is_cpy_supported) {
|
||||||
/*
|
/*
|
||||||
|
|
|
||||||
|
|
@ -49,15 +49,7 @@ if (Vulkan_FOUND)
|
||||||
../../include/ggml-vulkan.h
|
../../include/ggml-vulkan.h
|
||||||
)
|
)
|
||||||
|
|
||||||
set(VULKAN_SHADER_GEN_CMAKE_ARGS
|
set(VULKAN_SHADER_GEN_CMAKE_ARGS "")
|
||||||
-DCMAKE_INSTALL_PREFIX=${CMAKE_BINARY_DIR}
|
|
||||||
-DCMAKE_RUNTIME_OUTPUT_DIRECTORY=${CMAKE_RUNTIME_OUTPUT_DIRECTORY}
|
|
||||||
)
|
|
||||||
|
|
||||||
set(VULKAN_SHADER_GEN_CMAKE_BUILD_ARGS "")
|
|
||||||
if (CMAKE_BUILD_TYPE AND CMAKE_BUILD_TYPE MATCHES "Debug|Release|MinSizeRel|RelWithDebInfo")
|
|
||||||
list(APPEND VULKAN_SHADER_GEN_CMAKE_BUILD_ARGS --config=${CMAKE_BUILD_TYPE})
|
|
||||||
endif()
|
|
||||||
|
|
||||||
# Test all shader extensions
|
# Test all shader extensions
|
||||||
test_shader_extension_support(
|
test_shader_extension_support(
|
||||||
|
|
@ -136,42 +128,45 @@ if (Vulkan_FOUND)
|
||||||
set(HOST_CMAKE_TOOLCHAIN_FILE "")
|
set(HOST_CMAKE_TOOLCHAIN_FILE "")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# Always use ExternalProject_Add approach
|
|
||||||
include(ExternalProject)
|
include(ExternalProject)
|
||||||
|
|
||||||
# Add toolchain file if cross-compiling
|
|
||||||
if (CMAKE_CROSSCOMPILING)
|
if (CMAKE_CROSSCOMPILING)
|
||||||
     list(APPEND VULKAN_SHADER_GEN_CMAKE_ARGS -DCMAKE_TOOLCHAIN_FILE=${HOST_CMAKE_TOOLCHAIN_FILE})
     message(STATUS "vulkan-shaders-gen toolchain file: ${HOST_CMAKE_TOOLCHAIN_FILE}")
 endif()

-# Native build through ExternalProject_Add
 ExternalProject_Add(
     vulkan-shaders-gen
     SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders
-    CMAKE_ARGS ${VULKAN_SHADER_GEN_CMAKE_ARGS}
-    BUILD_COMMAND ${CMAKE_COMMAND} --build . ${VULKAN_SHADER_GEN_CMAKE_BUILD_ARGS}
-    INSTALL_COMMAND ${CMAKE_COMMAND} --install .
-    INSTALL_DIR ${CMAKE_BINARY_DIR}
+    CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${CMAKE_BINARY_DIR}/$<CONFIG>
+               -DCMAKE_INSTALL_BINDIR=.
+               -DCMAKE_BUILD_TYPE=$<CONFIG>
+               ${VULKAN_SHADER_GEN_CMAKE_ARGS}
+
+    BUILD_COMMAND ${CMAKE_COMMAND} --build . --config $<CONFIG>
+
+    # NOTE: When DESTDIR is set using Makefile generators and
+    # "make install" triggers the build step, vulkan-shaders-gen
+    # would be installed into the DESTDIR prefix, so it is unset
+    # to ensure that does not happen.
+
+    INSTALL_COMMAND ${CMAKE_COMMAND} -E env --unset=DESTDIR
+                    ${CMAKE_COMMAND} --install . --config $<CONFIG>
 )
-ExternalProject_Add_StepTargets(vulkan-shaders-gen build install)

 set (_ggml_vk_host_suffix $<IF:$<STREQUAL:${CMAKE_HOST_SYSTEM_NAME},Windows>,.exe,>)
-set (_ggml_vk_genshaders_cmd ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/vulkan-shaders-gen${_ggml_vk_host_suffix})
-set (_ggml_vk_header ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.hpp)
-set (_ggml_vk_source ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.cpp)
-set (_ggml_vk_input_dir ${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders)
-set (_ggml_vk_output_dir ${CMAKE_CURRENT_BINARY_DIR}/vulkan-shaders.spv)
+set (_ggml_vk_genshaders_dir "${CMAKE_BINARY_DIR}/$<CONFIG>")
+set (_ggml_vk_genshaders_cmd "${_ggml_vk_genshaders_dir}/vulkan-shaders-gen${_ggml_vk_host_suffix}")
+set (_ggml_vk_header "${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.hpp")
+set (_ggml_vk_source "${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.cpp")
+set (_ggml_vk_input_dir "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders")
+set (_ggml_vk_output_dir "${CMAKE_CURRENT_BINARY_DIR}/vulkan-shaders.spv")

-file(GLOB _ggml_vk_shader_deps "${_ggml_vk_input_dir}/*.comp")
-set (_ggml_vk_shader_deps ${_ggml_vk_shader_deps} vulkan-shaders-gen)
-
-# Add build and install dependencies for all builds
-set(_ggml_vk_shader_deps ${_ggml_vk_shader_deps} vulkan-shaders-gen-build vulkan-shaders-gen-install)
+file(GLOB _ggml_vk_shader_files CONFIGURE_DEPENDS "${_ggml_vk_input_dir}/*.comp")

 add_custom_command(
     OUTPUT ${_ggml_vk_header}
            ${_ggml_vk_source}

     COMMAND ${_ggml_vk_genshaders_cmd}
         --glslc ${Vulkan_GLSLC_EXECUTABLE}
@@ -181,7 +176,9 @@ if (Vulkan_FOUND)
         --target-cpp ${_ggml_vk_source}
         --no-clean

-    DEPENDS ${_ggml_vk_shader_deps}
+    DEPENDS ${_ggml_vk_shader_files}
+            vulkan-shaders-gen

     COMMENT "Generate vulkan shaders"
 )
@@ -78,7 +78,7 @@ static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; }
 #define VK_VENDOR_ID_INTEL 0x8086
 #define VK_VENDOR_ID_NVIDIA 0x10de

-#define VK_DEVICE_DESCRIPTOR_POOL_SIZE 32
+#define VK_DEVICE_DESCRIPTOR_POOL_SIZE 256

 #define GGML_VK_MAX_NODES 8192

@@ -102,25 +102,11 @@ static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; }

 struct ggml_backend_vk_context;

-struct vk_queue {
-    uint32_t queue_family_index;
-    vk::Queue queue;
-    vk::CommandPool pool;
-    uint32_t cmd_buffer_idx;
-    std::vector<vk::CommandBuffer> cmd_buffers;
-
-    vk::PipelineStageFlags stage_flags;
-
-    bool transfer_only;
-};
+#define MAX_PARAMETER_COUNT 8

 struct vk_pipeline_struct {
     std::string name;
     vk::ShaderModule shader_module;
-    vk::DescriptorSetLayout dsl;
-    std::vector<vk::DescriptorPool> descriptor_pools;
-    std::vector<vk::DescriptorSet> descriptor_sets;
-    uint32_t descriptor_set_idx;
     vk::PipelineLayout layout;
     vk::Pipeline pipeline;
     uint32_t push_constant_size;
@@ -167,6 +153,45 @@ struct ggml_backend_vk_buffer_type_context {
     vk_device device;
 };

+struct vk_queue;
+
+// Stores command pool/buffers. There's an instance of this
+// for each (context,queue) pair and for each (device,queue) pair.
+struct vk_command_pool {
+    void init(vk_device& device, vk_queue *q_);
+    void destroy(vk::Device& device);
+
+    vk::CommandPool pool;
+    uint32_t cmd_buffer_idx;
+    std::vector<vk::CommandBuffer> cmd_buffers;
+
+    vk_queue *q;
+};
+
+// Prevent simultaneous submissions to the same queue.
+// This could be per vk_queue if we stopped having two vk_queue structures
+// sharing the same vk::Queue.
+static std::mutex queue_mutex;
+
+struct vk_queue {
+    uint32_t queue_family_index;
+    vk::Queue queue;
+
+    vk_command_pool cmd_pool;
+
+    vk::PipelineStageFlags stage_flags;
+
+    bool transfer_only;
+
+    // copy everything except the cmd_pool
+    void copyFrom(vk_queue &other) {
+        queue_family_index = other.queue_family_index;
+        queue = other.queue;
+        stage_flags = other.stage_flags;
+        transfer_only = other.transfer_only;
+    }
+};
+
 static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft);
 static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size);
 static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft);
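The vk_command_pool/vk_queue split above is the core of this change: command buffers now live with the pool that hands them out rather than with the queue itself. Below is a small stand-alone sketch, not part of the diff, of the same reuse pattern that vk_command_pool::init, ggml_vk_create_cmd_buffer and the pool reset rely on. FakeQueue and FakeCommandBuffer are invented stand-ins for the real Vulkan handles, so this compiles and runs without a Vulkan device.

// Minimal sketch of the per-(context, queue) command-pool recycling pattern.
// FakeCommandBuffer / FakeQueue stand in for vk::CommandBuffer / vk_queue.
#include <cstdio>
#include <vector>

struct FakeCommandBuffer { int id; };

struct FakeQueue { int queue_family_index; };

struct CommandPool {
    FakeQueue *q = nullptr;                 // queue this pool records for
    std::vector<FakeCommandBuffer> bufs;    // allocated once, reused afterwards
    size_t idx = 0;                         // next reusable buffer
    int next_id = 0;

    void init(FakeQueue *q_) { q = q_; idx = 0; }

    FakeCommandBuffer get() {
        if (idx < bufs.size()) {
            return bufs[idx++];             // reuse an existing command buffer
        }
        bufs.push_back({next_id++});        // otherwise allocate a new one
        idx++;
        return bufs.back();
    }

    void reset() { idx = 0; }               // like resetCommandPool: reuse from the start
};

int main() {
    FakeQueue compute{0};
    CommandPool pool;
    pool.init(&compute);

    for (int submit = 0; submit < 3; ++submit) {
        FakeCommandBuffer cb = pool.get();
        std::printf("submit %d uses command buffer %d\n", submit, cb.id);
    }
    pool.reset();                            // after the fence signals, recycle
    std::printf("after reset, next buffer is %d\n", pool.get().id);
    return 0;
}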
@@ -341,6 +366,8 @@ struct vk_device_struct {
     // set to true to indicate that some shaders need to be compiled after the dryrun
     bool need_compiles {};

+    vk::DescriptorSetLayout dsl;
+
     vk_matmul_pipeline pipeline_matmul_f32 {};
     vk_matmul_pipeline pipeline_matmul_f32_f16 {};
     vk_matmul_pipeline pipeline_matmul_bf16 {};
@@ -458,7 +485,6 @@ struct vk_device_struct {
     vk_pipeline pipeline_flash_attn_split_k_reduce;

     std::unordered_map<std::string, vk_pipeline_ref> pipelines;
-    std::unordered_map<std::string, uint64_t> pipeline_descriptor_set_requirements;

     std::vector<std::tuple<void*, size_t, vk_buffer>> pinned_memory;

@@ -483,10 +509,8 @@ struct vk_device_struct {

         ggml_vk_destroy_buffer(sync_staging);

-        device.destroyCommandPool(compute_queue.pool);
-        if (!single_queue) {
-            device.destroyCommandPool(transfer_queue.pool);
-        }
+        compute_queue.cmd_pool.destroy(device);
+        transfer_queue.cmd_pool.destroy(device);

         for (auto& pipeline : pipelines) {
             if (pipeline.second.expired()) {
@@ -498,10 +522,26 @@ struct vk_device_struct {
         }
         pipelines.clear();

+        device.destroyDescriptorSetLayout(dsl);
+
         device.destroy();
     }
 };

+void vk_command_pool::init(vk_device& device, vk_queue *q_) {
+    cmd_buffer_idx = 0;
+    q = q_;
+
+    vk::CommandPoolCreateInfo command_pool_create_info(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), q->queue_family_index);
+    pool = device->device.createCommandPool(command_pool_create_info);
+}
+
+void vk_command_pool::destroy(vk::Device& device) {
+    device.destroyCommandPool(pool);
+    pool = nullptr;
+    cmd_buffers.clear();
+}
+
 struct vk_buffer_struct {
     vk::Buffer buffer = VK_NULL_HANDLE;
     vk::DeviceMemory device_memory = VK_NULL_HANDLE;
@@ -819,7 +859,7 @@ struct vk_context_struct {
     std::vector<vk_staging_memcpy> in_memcpys;
     std::vector<vk_staging_memcpy> out_memcpys;

-    vk_queue * q;
+    vk_command_pool * p {};
 };
 typedef std::shared_ptr<vk_context_struct> vk_context;
 typedef std::weak_ptr<vk_context_struct> vk_context_ref;
@@ -930,6 +970,14 @@ struct ggml_backend_vk_context {
     vk_context_ref transfer_ctx;

     std::vector<vk_context_ref> tensor_ctxs;
+
+    std::vector<vk::DescriptorPool> descriptor_pools;
+    std::vector<vk::DescriptorSet> descriptor_sets;
+    uint32_t descriptor_set_idx {};
+    uint32_t pipeline_descriptor_set_requirements {};
+
+    vk_command_pool compute_cmd_pool;
+    vk_command_pool transfer_cmd_pool;
 };

 static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT
@@ -1060,39 +1108,19 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipelin
         ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " <<
         disable_robustness << ", " << require_full_subgroups << ", " << required_subgroup_size << ")");
     GGML_ASSERT(parameter_count > 0);
+    GGML_ASSERT(parameter_count <= MAX_PARAMETER_COUNT);
     GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT

     vk::ShaderModuleCreateInfo shader_module_create_info({}, spv_size, reinterpret_cast<const uint32_t *>(spv_data));
     pipeline->shader_module = device->device.createShaderModule(shader_module_create_info);

-    std::vector<vk::DescriptorSetLayoutBinding> dsl_binding;
-    std::vector<vk::DescriptorBindingFlags> dsl_binding_flags;
-    for (uint32_t i = 0; i < parameter_count; i++) {
-        dsl_binding.push_back({i, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute});
-        dsl_binding_flags.push_back({});
-    }
-
-    vk::DescriptorSetLayoutBindingFlagsCreateInfo dslbfci = { dsl_binding_flags };
-
     vk::PushConstantRange pcr(
         vk::ShaderStageFlagBits::eCompute,
         0,
         pipeline->push_constant_size
     );

-    vk::DescriptorSetLayoutCreateInfo descriptor_set_layout_create_info(
-        {},
-        dsl_binding);
-    descriptor_set_layout_create_info.setPNext(&dslbfci);
-    pipeline->dsl = device->device.createDescriptorSetLayout(descriptor_set_layout_create_info);
-
-    vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count * VK_DEVICE_DESCRIPTOR_POOL_SIZE);
-    vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size);
-    pipeline->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
-
-    pipeline->descriptor_set_idx = 0;
-
-    vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), pipeline->dsl, pcr);
+    vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), device->dsl, pcr);
     pipeline->layout = device->device.createPipelineLayout(pipeline_layout_create_info);

     std::vector<vk::SpecializationMapEntry> specialization_entries(specialization_constants.size());
@@ -1167,15 +1195,6 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipelin

 static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
     VK_LOG_DEBUG("ggml_pipeline_destroy_pipeline(" << pipeline->name << ")");
-    for (auto& pool : pipeline->descriptor_pools) {
-        device.destroyDescriptorPool(pool);
-    }
-    pipeline->descriptor_pools.clear();
-    pipeline->descriptor_sets.clear();
-    pipeline->descriptor_set_idx = 0;
-
-    device.destroyDescriptorSetLayout(pipeline->dsl);
-
     device.destroyPipelineLayout(pipeline->layout);

     device.destroyShaderModule(pipeline->shader_module);
@@ -1183,97 +1202,77 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline)
     device.destroyPipeline(pipeline->pipeline);
 }

-static void ggml_pipeline_request_descriptor_sets(vk_device& device, vk_pipeline& pipeline, uint32_t n) {
+static void ggml_pipeline_request_descriptor_sets(ggml_backend_vk_context *ctx, vk_pipeline& pipeline, uint32_t n) {
     VK_LOG_DEBUG("ggml_pipeline_request_descriptor_sets(" << pipeline->name << ", " << n << ")");
-    device->pipeline_descriptor_set_requirements[pipeline->name] += n;
+    ctx->pipeline_descriptor_set_requirements += n;
     if (!pipeline->compiled) {
         pipeline->needed = true;
-        device->need_compiles = true;
+        ctx->device->need_compiles = true;
     }
 }

-static void ggml_pipeline_allocate_descriptor_sets(vk_device& device) {
-    std::lock_guard<std::mutex> guard(device->mutex);
+static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx) {

-    for (auto& pair : device->pipeline_descriptor_set_requirements) {
-        vk_pipeline pipeline = device->pipelines.at(pair.first).lock();
-        const uint64_t n = pair.second;
-
-        VK_LOG_DEBUG("ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")");
-
-        if (pipeline->descriptor_sets.size() >= pipeline->descriptor_set_idx + n) {
-            // Enough descriptors are available
-            continue;
-        }
-
-        uint32_t to_alloc = pipeline->descriptor_set_idx + n - pipeline->descriptor_sets.size();
-        uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - pipeline->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE;
-        uint32_t pool_idx = pipeline->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE;
-
-        while (to_alloc > 0) {
-            const uint32_t alloc_count = std::min(pool_remaining, to_alloc);
-            to_alloc -= alloc_count;
-            pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE;
-
-            if (pool_idx >= pipeline->descriptor_pools.size()) {
-                vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count * VK_DEVICE_DESCRIPTOR_POOL_SIZE);
-                vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size);
-                pipeline->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
-            }
-
-            std::vector<vk::DescriptorSetLayout> layouts(alloc_count);
-            for (uint32_t i = 0; i < alloc_count; i++) {
-                layouts[i] = pipeline->dsl;
-            }
-            vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline->descriptor_pools[pool_idx], alloc_count, layouts.data());
-            std::vector<vk::DescriptorSet> sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info);
-            pipeline->descriptor_sets.insert(pipeline->descriptor_sets.end(), sets.begin(), sets.end());
-
-            pool_idx++;
+    if (ctx->descriptor_sets.size() >= ctx->pipeline_descriptor_set_requirements) {
+        // Enough descriptors are available
+        return;
+    }
+
+    vk_device& device = ctx->device;
+
+    uint32_t to_alloc = ctx->pipeline_descriptor_set_requirements - ctx->descriptor_sets.size();
+    uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - ctx->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE;
+    uint32_t pool_idx = ctx->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE;
+
+    while (to_alloc > 0) {
+        const uint32_t alloc_count = std::min(pool_remaining, to_alloc);
+        to_alloc -= alloc_count;
+        pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE;
+
+        if (pool_idx >= ctx->descriptor_pools.size()) {
+            vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, MAX_PARAMETER_COUNT * VK_DEVICE_DESCRIPTOR_POOL_SIZE);
+            vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size);
+            ctx->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
         }
+
+        std::vector<vk::DescriptorSetLayout> layouts(alloc_count);
+        for (uint32_t i = 0; i < alloc_count; i++) {
+            layouts[i] = device->dsl;
+        }
+        vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(ctx->descriptor_pools[pool_idx], alloc_count, layouts.data());
+        std::vector<vk::DescriptorSet> sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info);
+        ctx->descriptor_sets.insert(ctx->descriptor_sets.end(), sets.begin(), sets.end());
+
+        pool_idx++;
     }
 }

-static void ggml_pipeline_cleanup(vk_pipeline& pipeline) {
-    VK_LOG_DEBUG("ggml_pipeline_cleanup(" << pipeline->name << ")");
-    pipeline->descriptor_set_idx = 0;
-}
-
-static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_device& device, vk_queue& q) {
+static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_device& device, vk_command_pool& p) {
     VK_LOG_DEBUG("ggml_vk_create_cmd_buffer()");
-    std::lock_guard<std::mutex> guard(device->mutex);

-    if (q.cmd_buffers.size() > q.cmd_buffer_idx) {
+    if (p.cmd_buffers.size() > p.cmd_buffer_idx) {
         // Reuse command buffer
-        return q.cmd_buffers[q.cmd_buffer_idx++];
+        return p.cmd_buffers[p.cmd_buffer_idx++];
     }

     vk::CommandBufferAllocateInfo command_buffer_alloc_info(
-        q.pool,
+        p.pool,
         vk::CommandBufferLevel::ePrimary,
         1);
     const std::vector<vk::CommandBuffer> cmd_buffers = device->device.allocateCommandBuffers(command_buffer_alloc_info);
     auto buf = cmd_buffers.front();

-    q.cmd_buffers.push_back(buf);
-    q.cmd_buffer_idx++;
+    p.cmd_buffers.push_back(buf);
+    p.cmd_buffer_idx++;

     return buf;
 }

-static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
-    VK_LOG_DEBUG("ggml_vk_create_submission()");
-    vk_submission s;
-    s.buffer = ggml_vk_create_cmd_buffer(device, q);
-    s.wait_semaphores = std::move(wait_semaphores);
-    s.signal_semaphores = std::move(signal_semaphores);
-    return s;
-}
-
 static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
     if (ctx->seqs.empty()) {
         if (fence) {
-            ctx->q->queue.submit({}, fence);
+            std::lock_guard<std::mutex> guard(queue_mutex);
+            ctx->p->q->queue.submit({}, fence);
         }
         return;
     }
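ggml_pipeline_allocate_descriptor_sets now grows descriptor storage per backend context in fixed-size pools instead of per pipeline. The following minimal sketch, not part of the diff, reproduces just that batching arithmetic with no Vulkan calls; POOL_SIZE is an example value standing in for VK_DEVICE_DESCRIPTOR_POOL_SIZE.

// Sketch of the descriptor-set batching arithmetic: sets live in pools of
// POOL_SIZE, and only the shortfall between what is required and what already
// exists is allocated, pool by pool.
#include <algorithm>
#include <cstdint>
#include <cstdio>

static constexpr uint32_t POOL_SIZE = 32;

void plan_allocation(uint32_t existing_sets, uint32_t required_sets, uint32_t existing_pools) {
    if (existing_sets >= required_sets) {
        std::printf("enough descriptor sets already allocated\n");
        return;
    }
    uint32_t to_alloc       = required_sets - existing_sets;
    uint32_t pool_remaining = POOL_SIZE - existing_sets % POOL_SIZE;
    uint32_t pool_idx       = existing_sets / POOL_SIZE;

    while (to_alloc > 0) {
        uint32_t alloc_count = std::min(pool_remaining, to_alloc);
        to_alloc      -= alloc_count;
        pool_remaining = POOL_SIZE;
        if (pool_idx >= existing_pools) {
            std::printf("create pool %u\n", pool_idx);   // real code: createDescriptorPool
            existing_pools = pool_idx + 1;
        }
        std::printf("allocate %u sets from pool %u\n", alloc_count, pool_idx);
        pool_idx++;
    }
}

int main() {
    // e.g. 40 sets already allocated, 100 now required, 2 pools exist
    plan_allocation(40, 100, 2);
    return 0;
}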
@@ -1312,7 +1311,7 @@ static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
         tl_signal_vals.push_back({});
         tl_signal_semaphores.push_back({});
         for (size_t i = 0; i < submission.wait_semaphores.size(); i++) {
-            stage_flags[idx].push_back(ctx->q->stage_flags);
+            stage_flags[idx].push_back(ctx->p->q->stage_flags);
             tl_wait_vals[idx].push_back(submission.wait_semaphores[i].value);
             tl_wait_semaphores[idx].push_back(submission.wait_semaphores[i].s);
         }
@@ -1342,7 +1341,8 @@ static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
         }
     }

-    ctx->q->queue.submit(submit_infos, fence);
+    std::lock_guard<std::mutex> guard(queue_mutex);
+    ctx->p->q->queue.submit(submit_infos, fence);

     ctx->seqs.clear();
 }
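With per-context command pools, compute work and synchronous buffer transfers can now reach the same vk::Queue from different code paths, which is why every submission above is wrapped in queue_mutex. A small stand-alone sketch of the idea, not part of the diff, with MockQueue as an invented stand-in for the real queue object:

// Every submission path takes the same static mutex before submit(), so two
// helpers running concurrently never hit the same queue at once.
#include <cstdio>
#include <mutex>
#include <thread>

static std::mutex queue_mutex;

struct MockQueue {
    void submit(const char *who) {
        std::printf("%s submitted\n", who);   // real code: queue.submit(infos, fence)
    }
};

int main() {
    MockQueue q;
    auto submit_guarded = [&](const char *who) {
        std::lock_guard<std::mutex> guard(queue_mutex);
        q.submit(who);
    };
    std::thread a(submit_guarded, "compute ctx");
    std::thread b(submit_guarded, "transfer ctx");
    a.join();
    b.join();
    return 0;
}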
@@ -1400,28 +1400,25 @@ static void ggml_vk_create_queue(vk_device& device, vk_queue& q, uint32_t queue_
     q.queue_family_index = queue_family_index;
     q.transfer_only = transfer_only;

-    vk::CommandPoolCreateInfo command_pool_create_info_compute(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), queue_family_index);
-    q.pool = device->device.createCommandPool(command_pool_create_info_compute);
-
-    q.cmd_buffer_idx = 0;
+    q.cmd_pool.init(device, &q);

     q.queue = device->device.getQueue(queue_family_index, queue_index);

     q.stage_flags = stage_flags;
 }

-static vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) {
+static vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_command_pool& p) {
     vk_context result = std::make_shared<vk_context_struct>();
     VK_LOG_DEBUG("ggml_vk_create_context(" << result << ")");
     ctx->gc.contexts.emplace_back(result);
-    result->q = &q;
+    result->p = &p;
     return result;
 }

-static vk_context ggml_vk_create_temporary_context(vk_queue& q) {
+static vk_context ggml_vk_create_temporary_context(vk_command_pool& p) {
     vk_context result = std::make_shared<vk_context_struct>();
     VK_LOG_DEBUG("ggml_vk_create_temporary_context(" << result << ")");
-    result->q = &q;
+    result->p = &p;
     return result;
 }
@@ -1454,15 +1451,29 @@ static vk::Event ggml_vk_create_event(ggml_backend_vk_context * ctx) {
     return ctx->gc.events[ctx->event_idx++];
 }

-static void ggml_vk_queue_cleanup(vk_device& device, vk_queue& q) {
-    VK_LOG_DEBUG("ggml_vk_queue_cleanup()");
-    std::lock_guard<std::mutex> guard(device->mutex);
+static void ggml_vk_command_pool_cleanup(vk_device& device, vk_command_pool& p) {
+    VK_LOG_DEBUG("ggml_vk_command_pool_cleanup()");

     // Requires command buffers to be done
-    device->device.resetCommandPool(q.pool);
-    q.cmd_buffer_idx = 0;
+    device->device.resetCommandPool(p.pool);
+    p.cmd_buffer_idx = 0;
 }

+static void ggml_vk_queue_command_pools_cleanup(vk_device& device) {
+    VK_LOG_DEBUG("ggml_vk_queue_command_pools_cleanup()");
+
+    // Arbitrary frequency to cleanup/reuse command buffers
+    static constexpr uint32_t cleanup_frequency = 10;
+
+    if (device->compute_queue.cmd_pool.cmd_buffer_idx >= cleanup_frequency) {
+        ggml_vk_command_pool_cleanup(device, device->compute_queue.cmd_pool);
+    }
+    if (device->transfer_queue.cmd_pool.cmd_buffer_idx >= cleanup_frequency) {
+        ggml_vk_command_pool_cleanup(device, device->transfer_queue.cmd_pool);
+    }
+}
+
 static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) {
     for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) {
         vk::MemoryType memory_type = mem_props->memoryTypes[i];
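ggml_vk_queue_command_pools_cleanup resets the device-level pools only once they have handed out a number of command buffers, amortizing the reset cost over several small transfers. A tiny sketch of that threshold policy, not part of the diff, with Pool as an invented stand-in and the same threshold value as the change:

// Reset a transient pool only after it has handed out cleanup_frequency
// buffers since the last reset, so the same buffers get reused.
#include <cstdint>
#include <cstdio>

struct Pool {
    uint32_t cmd_buffer_idx = 0;   // buffers handed out since last reset
    void reset() { cmd_buffer_idx = 0; }
};

static constexpr uint32_t cleanup_frequency = 10;

void maybe_cleanup(Pool &p, const char *name) {
    if (p.cmd_buffer_idx >= cleanup_frequency) {
        p.reset();                 // real code: device.resetCommandPool(p.pool)
        std::printf("%s pool reset\n", name);
    }
}

int main() {
    Pool compute, transfer;
    compute.cmd_buffer_idx  = 12;  // e.g. after a burst of small transfers
    transfer.cmd_buffer_idx = 3;
    maybe_cleanup(compute, "compute");
    maybe_cleanup(transfer, "transfer");
    return 0;
}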
@@ -1481,8 +1492,6 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor
         throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device memory allocation limit");
     }

-    std::lock_guard<std::mutex> guard(device->mutex);
-
     vk_buffer buf = std::make_shared<vk_buffer_struct>();

     if (size == 0) {
@@ -1611,11 +1620,11 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
 static void ggml_vk_sync_buffers(vk_context& ctx) {
     VK_LOG_DEBUG("ggml_vk_sync_buffers()");

-    const bool transfer_queue = ctx->q->transfer_only;
+    const bool transfer_queue = ctx->p->q->transfer_only;

     ctx->s->buffer.pipelineBarrier(
-        ctx->q->stage_flags,
-        ctx->q->stage_flags,
+        ctx->p->q->stage_flags,
+        ctx->p->q->stage_flags,
         {},
         { {
           { !transfer_queue ? (vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) : (vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) },
@@ -1634,8 +1643,8 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events

     ctx->s->buffer.waitEvents(
         events,
-        ctx->q->stage_flags,
-        ctx->q->stage_flags,
+        ctx->p->q->stage_flags,
+        ctx->p->q->stage_flags,
         {},
         {},
         {}
@@ -3369,6 +3378,22 @@ static vk_device ggml_vk_get_device(size_t idx) {
             }
         }

+        std::vector<vk::DescriptorSetLayoutBinding> dsl_binding;
+        std::vector<vk::DescriptorBindingFlags> dsl_binding_flags;
+        for (uint32_t i = 0; i < MAX_PARAMETER_COUNT; i++) {
+            dsl_binding.push_back({i, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute});
+            dsl_binding_flags.push_back({});
+        }
+
+        vk::DescriptorSetLayoutBindingFlagsCreateInfo dslbfci = { dsl_binding_flags };
+
+        vk::DescriptorSetLayoutCreateInfo descriptor_set_layout_create_info(
+            {},
+            dsl_binding);
+        descriptor_set_layout_create_info.setPNext(&dslbfci);
+        device->dsl = device->device.createDescriptorSetLayout(descriptor_set_layout_create_info);
+
         ggml_vk_load_shaders(device);

         if (!device->single_queue) {
@@ -3376,7 +3401,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
             ggml_vk_create_queue(device, device->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer }, true);
         } else {
             // TODO: Use pointer or reference to avoid copy
-            device->transfer_queue = device->compute_queue;
+            device->transfer_queue.copyFrom(device->compute_queue);
+            device->transfer_queue.cmd_pool.init(device, &device->transfer_queue);
         }

         device->buffer_type = {
@@ -3742,6 +3768,9 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
     ctx->fence = ctx->device->device.createFence({});
     ctx->almost_ready_fence = ctx->device->device.createFence({});

+    ctx->compute_cmd_pool.init(ctx->device, &ctx->device->compute_queue);
+    ctx->transfer_cmd_pool.init(ctx->device, &ctx->device->transfer_queue);
+
 #ifdef GGML_VULKAN_CHECK_RESULTS
     const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS");
     vk_skip_checks = (skip_checks == NULL ? 0 : atoi(skip_checks));
@@ -4107,9 +4136,9 @@ static void ggml_vk_host_get(vk_device& device, const void * ptr, vk_buffer& buf
     }
 }

-static vk_submission ggml_vk_begin_submission(vk_device& device, vk_queue& q, bool one_time = true) {
+static vk_submission ggml_vk_begin_submission(vk_device& device, vk_command_pool& p, bool one_time = true) {
     vk_submission s;
-    s.buffer = ggml_vk_create_cmd_buffer(device, q);
+    s.buffer = ggml_vk_create_cmd_buffer(device, p);
     if (one_time) {
         s.buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
     } else {
@@ -4154,10 +4183,10 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context&
         std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.range << "), ";
     }
     std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
-    GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size());
-    GGML_ASSERT(descriptor_buffer_infos.size() == pipeline->parameter_count);
+    GGML_ASSERT(ctx->descriptor_set_idx < ctx->descriptor_sets.size());
+    GGML_ASSERT(descriptor_buffer_infos.size() <= MAX_PARAMETER_COUNT);

-    vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++];
+    vk::DescriptorSet& descriptor_set = ctx->descriptor_sets[ctx->descriptor_set_idx++];
     vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() };
     ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {});
@@ -4194,7 +4223,7 @@ static void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx) {
         ggml_vk_ctx_end(subctx);
     }

-    subctx->seqs.push_back({ ggml_vk_begin_submission(device, *subctx->q) });
+    subctx->seqs.push_back({ ggml_vk_begin_submission(device, *subctx->p) });
     subctx->s = subctx->seqs[subctx->seqs.size() - 1].data();
 }
@@ -4395,7 +4424,9 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
             memcpy((uint8_t *)dst->ptr + offset + i * width, (const uint8_t *) src + i * spitch, width);
         }
     } else {
-        vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue);
+        std::lock_guard<std::mutex> guard(dst->device->mutex);
+
+        vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool);
         ggml_vk_ctx_begin(dst->device, subctx);
         ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, spitch, width, height, true);
         ggml_vk_ctx_end(subctx);
@@ -4407,6 +4438,7 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
         ggml_vk_submit(subctx, dst->device->fence);
         VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
         dst->device->device.resetFences({ dst->device->fence });
+        ggml_vk_queue_command_pools_cleanup(dst->device);
     }
 }

@@ -4483,7 +4515,9 @@ static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_

         memcpy(dst, (uint8_t *) src->ptr + offset, size);
     } else {
-        vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue);
+        std::lock_guard<std::mutex> guard(src->device->mutex);
+
+        vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool);
         ggml_vk_ctx_begin(src->device, subctx);
         ggml_vk_buffer_read_async(subctx, src, offset, dst, size, true);
         ggml_vk_ctx_end(subctx);
@@ -4491,6 +4525,7 @@ static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_
         ggml_vk_submit(subctx, src->device->fence);
         VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_read waitForFences");
         src->device->device.resetFences({ src->device->fence });
+        ggml_vk_queue_command_pools_cleanup(src->device);

         for (auto& cpy : subctx->out_memcpys) {
             memcpy(cpy.dst, cpy.src, cpy.n);
@@ -4510,15 +4545,17 @@ static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t ds

 static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
     if (src->device == dst->device) {
+        std::lock_guard<std::mutex> guard(src->device->mutex);
         VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")");
         // Copy within the device
-        vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue);
+        vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool);
         ggml_vk_ctx_begin(src->device, subctx);
         ggml_vk_buffer_copy_async(subctx, dst, dst_offset, src, src_offset, size);
         ggml_vk_ctx_end(subctx);
         ggml_vk_submit(subctx, src->device->fence);
         VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences");
         src->device->device.resetFences({ src->device->fence });
+        ggml_vk_queue_command_pools_cleanup(src->device);
     } else {
         VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")");
         // Copy device to device
@@ -4543,7 +4580,8 @@ static void ggml_vk_buffer_memset_async(vk_context& ctx, vk_buffer& dst, size_t
 static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
     VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")");

-    vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue);
+    std::lock_guard<std::mutex> guard(dst->device->mutex);
+    vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool);
     ggml_vk_ctx_begin(dst->device, subctx);
     subctx->s->buffer.fillBuffer(dst->buffer, offset, size, c);
     ggml_vk_ctx_end(subctx);
@@ -4551,6 +4589,7 @@ static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, siz
     ggml_vk_submit(subctx, dst->device->fence);
     VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_memset waitForFences");
     dst->device->device.resetFences({ dst->device->fence });
+    ggml_vk_queue_command_pools_cleanup(dst->device);
 }

 static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int n, int k, const vk_pipeline& pipeline) {
@@ -4964,18 +5003,18 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
         }

         // Request descriptor sets
-        ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
+        ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
         if (qx_needs_dequant) {
-            ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
+            ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
         }
         if (qy_needs_dequant) {
-            ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
+            ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
         }
         if (quantize_y) {
-            ggml_pipeline_request_descriptor_sets(ctx->device, to_q8_1, 1);
+            ggml_pipeline_request_descriptor_sets(ctx, to_q8_1, 1);
         }
         if (split_k > 1) {
-            ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, 1);
+            ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, 1);
         }
         return;
     }
@@ -5157,12 +5196,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&

         // Request descriptor sets
         if (qx_needs_dequant) {
-            ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
+            ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
         }
         if (qy_needs_dequant) {
-            ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
+            ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
         }
-        ggml_pipeline_request_descriptor_sets(ctx->device, dmmv, 1);
+        ggml_pipeline_request_descriptor_sets(ctx, dmmv, 1);
         return;
     }

@@ -5295,7 +5334,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c

     if (dryrun) {
         // Request descriptor sets
-        ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], 1);
+        ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], 1);
         return;
     }

@@ -5384,7 +5423,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con

     if (dryrun) {
         // Request descriptor sets
-        ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1);
+        ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1);
         return;
     }

@@ -5571,12 +5610,12 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
         }

         // Request descriptor sets
-        ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
+        ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
         if (qx_needs_dequant) {
-            ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
+            ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
         }
         if (qy_needs_dequant) {
-            ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
+            ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
         }
         return;
     }
@@ -5765,12 +5804,12 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte

         // Request descriptor sets
         if (qx_needs_dequant) {
-            ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
+            ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
         }
         if (qy_needs_dequant) {
-            ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
+            ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
         }
-        ggml_pipeline_request_descriptor_sets(ctx->device, dmmv, 1);
+        ggml_pipeline_request_descriptor_sets(ctx, dmmv, 1);
         return;
     }

@@ -6090,9 +6129,9 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx

     if (dryrun) {
         // Request descriptor sets
-        ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
+        ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
         if (split_k > 1) {
-            ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_flash_attn_split_k_reduce, 1);
+            ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_flash_attn_split_k_reduce, 1);
         }
         return;
     }
@@ -6655,7 +6694,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
     }

     if (dryrun) {
-        ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
+        ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
         return;
     }

@@ -7036,7 +7075,7 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx
     GGML_ASSERT(pipeline != nullptr);

     if (dryrun) {
-        ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
+        ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
         return;
     }

@@ -7175,7 +7214,7 @@ static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_cont
     GGML_ASSERT(pipeline != nullptr);

     if (dryrun) {
-        ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
+        ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
         return;
     }

@@ -7853,9 +7892,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
         }
     }

-    ggml_pipeline_request_descriptor_sets(ctx->device, p, num_it);
+    ggml_pipeline_request_descriptor_sets(ctx, p, num_it);
     if (split_k > 1) {
-        ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);
+        ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);

         if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
             // Resize buffer
@@ -7870,7 +7909,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
         ggml_vk_load_shaders(ctx->device);
     }

-    ggml_pipeline_allocate_descriptor_sets(ctx->device);
+    ggml_pipeline_allocate_descriptor_sets(ctx);

     vk_buffer d_X = ggml_vk_create_buffer_check(ctx->device, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
     vk_buffer d_Y = ggml_vk_create_buffer_check(ctx->device, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
@@ -7912,7 +7951,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     ggml_vk_buffer_write(d_X, 0, x, sizeof(X_TYPE) * k * m * batch);
     ggml_vk_buffer_write(d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch);

-    vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+    vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
     ggml_vk_ctx_begin(ctx->device, subctx);
     for (size_t i = 0; i < num_it; i++) {
         ggml_vk_matmul(
@@ -7928,6 +7967,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     ggml_vk_submit(subctx, ctx->fence);
     VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_matmul waitForFences");
     ctx->device->device.resetFences({ ctx->fence });
+    ggml_vk_queue_command_pools_cleanup(ctx->device);

     auto end = std::chrono::high_resolution_clock::now();
     double time = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
@@ -8029,16 +8069,13 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t

     free(d_chk);

-    ggml_vk_queue_cleanup(ctx->device, ctx->device->transfer_queue);
-    ggml_vk_queue_cleanup(ctx->device, ctx->device->compute_queue);
+    ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool);
+    ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool);

     ggml_vk_destroy_buffer(d_X);
     ggml_vk_destroy_buffer(d_Y);
     ggml_vk_destroy_buffer(d_D);

-    ggml_pipeline_cleanup(p);
-    ggml_pipeline_cleanup(ctx->device->pipeline_matmul_split_k_reduce);
-
     free(x);
     free(y);
     free(d);
@@ -8116,17 +8153,17 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
     ggml_vk_quantize_data(x, qx, ne, quant);
     ggml_vk_dequantize_data(qx, x_ref, ne, quant);

-    ggml_pipeline_request_descriptor_sets(ctx->device, p, 1);
+    ggml_pipeline_request_descriptor_sets(ctx, p, 1);

     if (ctx->device->need_compiles) {
         ggml_vk_load_shaders(ctx->device);
     }

-    ggml_pipeline_allocate_descriptor_sets(ctx->device);
+    ggml_pipeline_allocate_descriptor_sets(ctx);

     ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);

-    vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+    vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
     ggml_vk_ctx_begin(ctx->device, subctx);
     const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne };
     ggml_vk_dispatch_pipeline(ctx, subctx, p, { vk_subbuffer{ qx_buf, 0, qx_sz }, vk_subbuffer{ x_buf, 0, x_sz_f16 } }, pc, { (uint32_t)ne, 1, 1});
@@ -8137,6 +8174,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
     ggml_vk_submit(subctx, ctx->fence);
     VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences");
     ctx->device->device.resetFences({ ctx->fence });
+    ggml_vk_queue_command_pools_cleanup(ctx->device);

     auto end = std::chrono::high_resolution_clock::now();

@@ -8216,17 +8254,17 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
 //
 //     vk_pipeline p = ggml_vk_get_quantize_pipeline(ctx, quant);
 //
-//     ggml_pipeline_request_descriptor_sets(ctx->device, p, 1);
+//     ggml_pipeline_request_descriptor_sets(ctx, p, 1);
 //
 //     if (ctx->device->need_compiles) {
 //         ggml_vk_load_shaders(ctx->device);
 //     }
 //
-//     ggml_pipeline_allocate_descriptor_sets(ctx->device);
+//     ggml_pipeline_allocate_descriptor_sets(ctx);
 //
 //     ggml_vk_buffer_write(x_buf, 0, x, x_sz);
 //
-//     vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+//     vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
 //     ggml_vk_ctx_begin(ctx->device, subctx);
 //     ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(x_buf), ggml_vk_subbuffer(qx_buf), ne);
 //     ggml_vk_ctx_end(subctx);
@@ -8236,6 +8274,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
 //     ggml_vk_submit(subctx, ctx->fence);
 //     VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_quantize waitForFences");
 //     ctx->device->device.resetFences({ ctx->fence });
+//     ggml_vk_queue_command_pools_cleanup(ctx->device);
 //
 //     auto end = std::chrono::high_resolution_clock::now();
 //
@@ -8375,9 +8414,9 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
         // y[i] = i % k;
     }

-    ggml_pipeline_request_descriptor_sets(ctx->device, p, num_it);
+    ggml_pipeline_request_descriptor_sets(ctx, p, num_it);
     if (split_k > 1) {
-        ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);
+        ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);

         if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
             // Resize buffer
@@ -8388,19 +8427,19 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
         }
     }
     if (mmq) {
-        ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_quantize_q8_1, num_it);
+        ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_quantize_q8_1, num_it);
     }

     if (ctx->device->need_compiles) {
         ggml_vk_load_shaders(ctx->device);
     }

-    ggml_pipeline_allocate_descriptor_sets(ctx->device);
+    ggml_pipeline_allocate_descriptor_sets(ctx);

     ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
     ggml_vk_buffer_write(y_buf, 0, y, y_sz);

-    vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+    vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
     ggml_vk_ctx_begin(ctx->device, subctx);
     if (mmq) {
         for (size_t i = 0; i < num_it; i++) {
@@ -8429,6 +8468,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
     ggml_vk_submit(subctx, ctx->fence);
     VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences");
     ctx->device->device.resetFences({ ctx->fence });
+    ggml_vk_queue_command_pools_cleanup(ctx->device);

     auto end = std::chrono::high_resolution_clock::now();

|
@ -8743,7 +8783,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
||||||
|
|
||||||
if (!dryrun) {
|
if (!dryrun) {
|
||||||
if (ctx->compute_ctx.expired()) {
|
if (ctx->compute_ctx.expired()) {
|
||||||
compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
|
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||||
ctx->compute_ctx = compute_ctx;
|
ctx->compute_ctx = compute_ctx;
|
||||||
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -8797,7 +8837,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
||||||
// These operations all go through ggml_vk_op_f32, so short-circuit and
|
// These operations all go through ggml_vk_op_f32, so short-circuit and
|
||||||
// do the only thing needed for the dryrun.
|
// do the only thing needed for the dryrun.
|
||||||
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, node, node->op);
|
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, node, node->op);
|
||||||
ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
|
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
|
|
@@ -9189,19 +9229,8 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
     }
     ctx->gc.temp_buffers.clear();

-    for (auto& dsr : ctx->device->pipeline_descriptor_set_requirements) {
-        vk_pipeline_ref plr = ctx->device->pipelines[dsr.first];
-
-        if (plr.expired()) {
-            continue;
-        }
-
-        vk_pipeline pl = plr.lock();
-        ggml_pipeline_cleanup(pl);
-    }
-
-    ggml_vk_queue_cleanup(ctx->device, ctx->device->compute_queue);
-    ggml_vk_queue_cleanup(ctx->device, ctx->device->transfer_queue);
+    ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool);
+    ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool);

     for (size_t i = 0; i < ctx->gc.semaphores.size(); i++) {
         ctx->device->device.destroySemaphore({ ctx->gc.semaphores[i].s });
@@ -9222,7 +9251,8 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {

     ctx->tensor_ctxs.clear();
     ctx->gc.contexts.clear();
-    ctx->device->pipeline_descriptor_set_requirements.clear();
+    ctx->pipeline_descriptor_set_requirements = 0;
+    ctx->descriptor_set_idx = 0;
 }

 // Clean up on backend free
@@ -9249,6 +9279,15 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {

     ctx->device->device.destroyFence(ctx->fence);
     ctx->device->device.destroyFence(ctx->almost_ready_fence);
+
+    for (auto& pool : ctx->descriptor_pools) {
+        ctx->device->device.destroyDescriptorPool(pool);
+    }
+    ctx->descriptor_pools.clear();
+    ctx->descriptor_sets.clear();
+
+    ctx->compute_cmd_pool.destroy(ctx->device->device);
+    ctx->transfer_cmd_pool.destroy(ctx->device->device);
 }

 static int ggml_vk_get_device_count() {
@ -9515,7 +9554,7 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor
|
||||||
|
|
||||||
if (ctx->transfer_ctx.expired()) {
|
if (ctx->transfer_ctx.expired()) {
|
||||||
// Initialize new transfer context
|
// Initialize new transfer context
|
||||||
transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
|
transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
|
||||||
ctx->transfer_ctx = transfer_ctx;
|
ctx->transfer_ctx = transfer_ctx;
|
||||||
ggml_vk_ctx_begin(ctx->device, transfer_ctx);
|
ggml_vk_ctx_begin(ctx->device, transfer_ctx);
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -9538,7 +9577,7 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_
|
||||||
|
|
||||||
if (ctx->transfer_ctx.expired()) {
|
if (ctx->transfer_ctx.expired()) {
|
||||||
// Initialize new transfer context
|
// Initialize new transfer context
|
||||||
transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
|
transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
|
||||||
ctx->transfer_ctx = transfer_ctx;
|
ctx->transfer_ctx = transfer_ctx;
|
||||||
ggml_vk_ctx_begin(ctx->device, transfer_ctx);
|
ggml_vk_ctx_begin(ctx->device, transfer_ctx);
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -9561,7 +9600,7 @@ static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_
|
||||||
|
|
||||||
if (ctx->transfer_ctx.expired()) {
|
if (ctx->transfer_ctx.expired()) {
|
||||||
// Initialize new transfer context
|
// Initialize new transfer context
|
||||||
transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
|
transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
|
||||||
ctx->transfer_ctx = transfer_ctx;
|
ctx->transfer_ctx = transfer_ctx;
|
||||||
ggml_vk_ctx_begin(ctx->device, transfer_ctx);
|
ggml_vk_ctx_begin(ctx->device, transfer_ctx);
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -9622,7 +9661,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||||
ggml_vk_load_shaders(ctx->device);
|
ggml_vk_load_shaders(ctx->device);
|
||||||
}
|
}
|
||||||
ggml_vk_preallocate_buffers(ctx);
|
ggml_vk_preallocate_buffers(ctx);
|
||||||
ggml_pipeline_allocate_descriptor_sets(ctx->device);
|
ggml_pipeline_allocate_descriptor_sets(ctx);
|
||||||
|
|
||||||
int last_node = cgraph->n_nodes - 1;
|
int last_node = cgraph->n_nodes - 1;
|
||||||
|
|
||||||
|
|
@ -9654,7 +9693,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||||
ctx->device->device.resetQueryPool(ctx->device->query_pool, 0, cgraph->n_nodes+1);
|
ctx->device->device.resetQueryPool(ctx->device->query_pool, 0, cgraph->n_nodes+1);
|
||||||
|
|
||||||
GGML_ASSERT(ctx->compute_ctx.expired());
|
GGML_ASSERT(ctx->compute_ctx.expired());
|
||||||
compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
|
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||||
ctx->compute_ctx = compute_ctx;
|
ctx->compute_ctx = compute_ctx;
|
||||||
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
||||||
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
|
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
|
||||||
|
|
@ -9689,7 +9728,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||||
|
|
||||||
if (vk_perf_logger_enabled) {
|
if (vk_perf_logger_enabled) {
|
||||||
if (ctx->compute_ctx.expired()) {
|
if (ctx->compute_ctx.expired()) {
|
||||||
compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
|
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||||
ctx->compute_ctx = compute_ctx;
|
ctx->compute_ctx = compute_ctx;
|
||||||
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
||||||
} else {
|
} else {
|
||||||
|
|
|
||||||
|
|
@@ -25,15 +25,3 @@ add_executable(${TARGET} vulkan-shaders-gen.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads)
-
-# Configure output directories for MSVC builds
-if(MSVC)
-    # Get the main project's runtime output directory if possible
-    if(DEFINED CMAKE_RUNTIME_OUTPUT_DIRECTORY)
-        foreach(CONFIG ${CMAKE_CONFIGURATION_TYPES})
-            string(TOUPPER ${CONFIG} CONFIG)
-            set_target_properties(${TARGET} PROPERTIES
-            RUNTIME_OUTPUT_DIRECTORY_${CONFIG} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
-        endforeach()
-    endif()
-endif()
|
|
@@ -888,12 +888,6 @@ struct ggml_context {
     struct ggml_object * objects_end;
 };

-struct ggml_context_container {
-    bool used;
-
-    struct ggml_context context;
-};
-
 //
 // data types
 //
|
|
@ -292,6 +292,7 @@ class MODEL_ARCH(IntEnum):
|
||||||
BERT = auto()
|
BERT = auto()
|
||||||
NOMIC_BERT = auto()
|
NOMIC_BERT = auto()
|
||||||
NOMIC_BERT_MOE = auto()
|
NOMIC_BERT_MOE = auto()
|
||||||
|
NEO_BERT = auto()
|
||||||
JINA_BERT_V2 = auto()
|
JINA_BERT_V2 = auto()
|
||||||
BLOOM = auto()
|
BLOOM = auto()
|
||||||
STABLELM = auto()
|
STABLELM = auto()
|
||||||
|
|
@ -345,6 +346,8 @@ class MODEL_ARCH(IntEnum):
|
||||||
WAVTOKENIZER_DEC = auto()
|
WAVTOKENIZER_DEC = auto()
|
||||||
PLM = auto()
|
PLM = auto()
|
||||||
BAILINGMOE = auto()
|
BAILINGMOE = auto()
|
||||||
|
DOTS1 = auto()
|
||||||
|
ARCEE = auto()
|
||||||
|
|
||||||
|
|
||||||
class VISION_PROJECTOR_TYPE(IntEnum):
|
class VISION_PROJECTOR_TYPE(IntEnum):
|
||||||
|
|
@ -574,6 +577,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||||
MODEL_ARCH.BERT: "bert",
|
MODEL_ARCH.BERT: "bert",
|
||||||
MODEL_ARCH.NOMIC_BERT: "nomic-bert",
|
MODEL_ARCH.NOMIC_BERT: "nomic-bert",
|
||||||
MODEL_ARCH.NOMIC_BERT_MOE: "nomic-bert-moe",
|
MODEL_ARCH.NOMIC_BERT_MOE: "nomic-bert-moe",
|
||||||
|
MODEL_ARCH.NEO_BERT: "neo-bert",
|
||||||
MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2",
|
MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2",
|
||||||
MODEL_ARCH.BLOOM: "bloom",
|
MODEL_ARCH.BLOOM: "bloom",
|
||||||
MODEL_ARCH.STABLELM: "stablelm",
|
MODEL_ARCH.STABLELM: "stablelm",
|
||||||
|
|
@ -627,6 +631,8 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||||
MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec",
|
MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec",
|
||||||
MODEL_ARCH.PLM: "plm",
|
MODEL_ARCH.PLM: "plm",
|
||||||
MODEL_ARCH.BAILINGMOE: "bailingmoe",
|
MODEL_ARCH.BAILINGMOE: "bailingmoe",
|
||||||
|
MODEL_ARCH.DOTS1: "dots1",
|
||||||
|
MODEL_ARCH.ARCEE: "arcee",
|
||||||
}
|
}
|
||||||
|
|
||||||
VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
|
VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
|
||||||
|
|
@ -1082,6 +1088,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||||
MODEL_TENSOR.FFN_UP_EXP,
|
MODEL_TENSOR.FFN_UP_EXP,
|
||||||
MODEL_TENSOR.LAYER_OUT_NORM,
|
MODEL_TENSOR.LAYER_OUT_NORM,
|
||||||
],
|
],
|
||||||
|
MODEL_ARCH.NEO_BERT: [
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
MODEL_TENSOR.ATTN_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_QKV,
|
||||||
|
MODEL_TENSOR.ATTN_OUT,
|
||||||
|
MODEL_TENSOR.FFN_NORM,
|
||||||
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
|
MODEL_TENSOR.FFN_UP,
|
||||||
|
MODEL_TENSOR.ENC_OUTPUT_NORM,
|
||||||
|
MODEL_TENSOR.CLS,
|
||||||
|
MODEL_TENSOR.CLS_OUT,
|
||||||
|
],
|
||||||
MODEL_ARCH.JINA_BERT_V2: [
|
MODEL_ARCH.JINA_BERT_V2: [
|
||||||
MODEL_TENSOR.TOKEN_EMBD,
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
||||||
|
|
@ -2062,6 +2080,45 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||||
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
||||||
MODEL_TENSOR.FFN_UP_SHEXP,
|
MODEL_TENSOR.FFN_UP_SHEXP,
|
||||||
],
|
],
|
||||||
|
MODEL_ARCH.DOTS1: [
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
|
MODEL_TENSOR.OUTPUT,
|
||||||
|
MODEL_TENSOR.ATTN_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_Q,
|
||||||
|
MODEL_TENSOR.ATTN_Q_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_K,
|
||||||
|
MODEL_TENSOR.ATTN_K_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_V,
|
||||||
|
MODEL_TENSOR.ATTN_OUT,
|
||||||
|
MODEL_TENSOR.FFN_EXP_PROBS_B,
|
||||||
|
MODEL_TENSOR.FFN_NORM,
|
||||||
|
MODEL_TENSOR.FFN_GATE,
|
||||||
|
MODEL_TENSOR.FFN_GATE_EXP,
|
||||||
|
MODEL_TENSOR.FFN_GATE_INP,
|
||||||
|
MODEL_TENSOR.FFN_GATE_SHEXP,
|
||||||
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||||
|
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
||||||
|
MODEL_TENSOR.FFN_UP,
|
||||||
|
MODEL_TENSOR.FFN_UP_EXP,
|
||||||
|
MODEL_TENSOR.FFN_UP_SHEXP,
|
||||||
|
],
|
||||||
|
MODEL_ARCH.ARCEE: [
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
|
MODEL_TENSOR.OUTPUT,
|
||||||
|
MODEL_TENSOR.ROPE_FREQS,
|
||||||
|
MODEL_TENSOR.ATTN_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_Q,
|
||||||
|
MODEL_TENSOR.ATTN_K,
|
||||||
|
MODEL_TENSOR.ATTN_V,
|
||||||
|
MODEL_TENSOR.ATTN_OUT,
|
||||||
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||||
|
MODEL_TENSOR.FFN_NORM,
|
||||||
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
|
MODEL_TENSOR.FFN_UP,
|
||||||
|
],
|
||||||
# TODO
|
# TODO
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -271,7 +271,7 @@ class GGUFWriter:

     def add_key_value(self, key: str, val: Any, vtype: GGUFValueType, sub_type: GGUFValueType | None = None) -> None:
         if any(key in kv_data for kv_data in self.kv_data):
-            raise ValueError(f'Duplicated key name {key!r}')
+            logger.warning(f'Duplicated key name {key!r}, overwriting it with new value {val!r} of type {vtype.name}')

         self.kv_data[0][key] = GGUFValue(value=val, type=vtype, sub_type=sub_type)

@ -31,6 +31,7 @@ class TensorNameMap:
|
||||||
"model.embeddings", # rwkv7
|
"model.embeddings", # rwkv7
|
||||||
"model.word_embeddings", # bailingmoe
|
"model.word_embeddings", # bailingmoe
|
||||||
"language_model.model.embed_tokens", # llama4
|
"language_model.model.embed_tokens", # llama4
|
||||||
|
"encoder", # neobert
|
||||||
),
|
),
|
||||||
|
|
||||||
# Token type embeddings
|
# Token type embeddings
|
||||||
|
|
@ -134,6 +135,7 @@ class TensorNameMap:
|
||||||
"rwkv.blocks.{bid}.ln1", # rwkv6
|
"rwkv.blocks.{bid}.ln1", # rwkv6
|
||||||
"model.layers.{bid}.ln1", # rwkv7
|
"model.layers.{bid}.ln1", # rwkv7
|
||||||
"model.layers.{bid}.input_layernorm", # llama4
|
"model.layers.{bid}.input_layernorm", # llama4
|
||||||
|
"transformer_encoder.{bid}.attention_norm", # neobert
|
||||||
),
|
),
|
||||||
|
|
||||||
# Attention norm 2
|
# Attention norm 2
|
||||||
|
|
@ -161,6 +163,7 @@ class TensorNameMap:
|
||||||
"model.layers.{bid}.self_attn.qkv_proj", # phi3
|
"model.layers.{bid}.self_attn.qkv_proj", # phi3
|
||||||
"encoder.layers.{bid}.self_attention.query_key_value", # chatglm
|
"encoder.layers.{bid}.self_attention.query_key_value", # chatglm
|
||||||
"transformer.layers.{bid}.attn.qkv_proj", # openelm
|
"transformer.layers.{bid}.attn.qkv_proj", # openelm
|
||||||
|
"transformer_encoder.{bid}.qkv", # neobert
|
||||||
),
|
),
|
||||||
|
|
||||||
# Attention query
|
# Attention query
|
||||||
|
|
@ -236,6 +239,7 @@ class TensorNameMap:
|
||||||
"transformer.layers.{bid}.attn.out_proj", # openelm
|
"transformer.layers.{bid}.attn.out_proj", # openelm
|
||||||
"transformer.h.{bid}.attn.attention.out_proj", # exaone
|
"transformer.h.{bid}.attn.attention.out_proj", # exaone
|
||||||
"model.layers.{bid}.self_attn.o_proj", # llama4
|
"model.layers.{bid}.self_attn.o_proj", # llama4
|
||||||
|
"transformer_encoder.{bid}.wo", # neobert
|
||||||
),
|
),
|
||||||
|
|
||||||
# Attention output norm
|
# Attention output norm
|
||||||
|
|
@ -276,6 +280,7 @@ class TensorNameMap:
|
||||||
"encoder.layers.{bid}.post_attention_layernorm", # chatglm
|
"encoder.layers.{bid}.post_attention_layernorm", # chatglm
|
||||||
"transformer.layers.{bid}.ffn_norm", # openelm
|
"transformer.layers.{bid}.ffn_norm", # openelm
|
||||||
"model.layers.{bid}.post_attention_layernorm", # llama4
|
"model.layers.{bid}.post_attention_layernorm", # llama4
|
||||||
|
"transformer_encoder.{bid}.ffn_norm", # neobert
|
||||||
),
|
),
|
||||||
|
|
||||||
# Post feed-forward norm
|
# Post feed-forward norm
|
||||||
|
|
@ -305,7 +310,7 @@ class TensorNameMap:
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_EXP_PROBS_B: (
|
MODEL_TENSOR.FFN_EXP_PROBS_B: (
|
||||||
"model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3
|
"model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3 dots1
|
||||||
),
|
),
|
||||||
|
|
||||||
# Feed-forward up
|
# Feed-forward up
|
||||||
|
|
@ -340,6 +345,7 @@ class TensorNameMap:
|
||||||
"encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm
|
"encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm
|
||||||
"transformer.h.{bid}.mlp.c_fc_1", # exaone
|
"transformer.h.{bid}.mlp.c_fc_1", # exaone
|
||||||
"model.layers.{bid}.feed_forward.up_proj", # llama4
|
"model.layers.{bid}.feed_forward.up_proj", # llama4
|
||||||
|
"transformer_encoder.{bid}.ffn.w12", # neobert
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_UP_EXP: (
|
MODEL_TENSOR.FFN_UP_EXP: (
|
||||||
|
|
@ -422,6 +428,7 @@ class TensorNameMap:
|
||||||
"encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm
|
"encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm
|
||||||
"model.layers.h.{bid}.mlp.c_proj", # exaone
|
"model.layers.h.{bid}.mlp.c_proj", # exaone
|
||||||
"model.layers.{bid}.feed_forward.down_proj", # llama4
|
"model.layers.{bid}.feed_forward.down_proj", # llama4
|
||||||
|
"transformer_encoder.{bid}.ffn.w3", # neobert
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_DOWN_EXP: (
|
MODEL_TENSOR.FFN_DOWN_EXP: (
|
||||||
|
|
@ -836,12 +843,14 @@ class TensorNameMap:
|
||||||
# TODO: these do not belong to block_mappings_cfg - move them to mappings_cfg
|
# TODO: these do not belong to block_mappings_cfg - move them to mappings_cfg
|
||||||
MODEL_TENSOR.ENC_OUTPUT_NORM: (
|
MODEL_TENSOR.ENC_OUTPUT_NORM: (
|
||||||
"encoder.final_layer_norm", # t5
|
"encoder.final_layer_norm", # t5
|
||||||
|
"layer_norm", # neobert
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.CLS: (
|
MODEL_TENSOR.CLS: (
|
||||||
"classifier", # jina
|
"classifier", # jina
|
||||||
"classifier.dense", # roberta
|
"classifier.dense", # roberta
|
||||||
"pre_classifier", # distillbert
|
"pre_classifier", # distillbert
|
||||||
|
"dense", # neobert
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.CLS_OUT: (
|
MODEL_TENSOR.CLS_OUT: (
|
||||||
|
|
|
||||||
|
|
@@ -243,18 +243,21 @@ extern "C" {

    typedef bool (*llama_progress_callback)(float progress, void * user_data);

-    // Input data for llama_decode
+    // Input data for llama_encode/llama_decode
    // A llama_batch object can contain input about one or many sequences
    // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
    //
    // - token  : the token ids of the input (used when embd is NULL)
    // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
    // - pos    : the positions of the respective token in the sequence
-    //            (if set to NULL, the token position will be tracked automatically by llama_decode)
+    //            (if set to NULL, the token position will be tracked automatically by llama_encode/llama_decode)
    // - seq_id : the sequence to which the respective token belongs
    //            (if set to NULL, the sequence ID will be assumed to be 0)
    // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
-    //            (if set to NULL, only the logits for last token will be returned)
+    //            (if set to NULL:
+    //             - if embeddings: all tokens are output
+    //             - if not:        only the last token is output
+    //            )
    //
    typedef struct llama_batch {
        int32_t n_tokens;
@@ -262,8 +265,8 @@ extern "C" {
        llama_token * token;
        float * embd;
        llama_pos * pos;
-        int32_t * n_seq_id; // TODO: remove, should belong to only 1 sequence
-        llama_seq_id ** seq_id; // TODO: become llama_seq_id * seq_id;
+        int32_t * n_seq_id;
+        llama_seq_id ** seq_id;
        int8_t * logits; // TODO: rename this to "output"
    } llama_batch;

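A minimal sketch of what the relaxed NULL defaults documented above mean for a caller (illustrative only, not part of this change; the helper name and surrounding context are assumptions):

    // Illustrative sketch: pos, n_seq_id, seq_id and logits are left NULL on purpose.
    #include "llama.h"
    #include <vector>

    static int decode_prompt(llama_context * ctx, std::vector<llama_token> & tokens) {
        llama_batch batch = {};
        batch.n_tokens = (int32_t) tokens.size();
        batch.token    = tokens.data();
        // embd stays NULL   -> token ids are used instead of embeddings
        // pos stays NULL    -> positions are tracked automatically by llama_encode/llama_decode
        // seq_id stays NULL -> every token is assigned to sequence 0
        // logits stays NULL -> only the last token produces output (all tokens in embeddings mode)
        return llama_decode(ctx, batch);
    }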
@@ -961,8 +964,8 @@ extern "C" {
    // Get the number of threads used for prompt and batch processing (multiple token).
    LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);

-    // Set whether the model is in embeddings mode or not
-    // If true, embeddings will be returned but logits will not
+    // Set whether the context outputs embeddings or not
+    // TODO: rename to avoid confusion with llama_get_embeddings()
    LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);

    // Set whether to use causal attention or not
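A short hedged usage note (not from the diff; `ctx` and the evaluated batch are assumed to exist):

    // Illustrative only: temporarily switch a context into embeddings mode.
    llama_set_embeddings(ctx, true);                 // outputs embeddings instead of logits
    // ... evaluate a batch with llama_encode()/llama_decode() here ...
    const float * embd = llama_get_embeddings(ctx);  // assumes the pass produced embeddings
    llama_set_embeddings(ctx, false);                // back to producing logits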
@@ -1,2 +1,3 @@
 tabulate~=0.9.0
 GitPython~=3.1.43
+matplotlib~=3.10.0
@ -19,6 +19,7 @@ except ImportError as e:
|
||||||
print("the following Python libraries are required: GitPython, tabulate.") # noqa: NP100
|
print("the following Python libraries are required: GitPython, tabulate.") # noqa: NP100
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger("compare-llama-bench")
|
logger = logging.getLogger("compare-llama-bench")
|
||||||
|
|
||||||
# All llama-bench SQL fields
|
# All llama-bench SQL fields
|
||||||
|
|
@ -122,11 +123,15 @@ help_s = (
|
||||||
parser.add_argument("--check", action="store_true", help="check if all required Python libraries are installed")
|
parser.add_argument("--check", action="store_true", help="check if all required Python libraries are installed")
|
||||||
parser.add_argument("-s", "--show", help=help_s)
|
parser.add_argument("-s", "--show", help=help_s)
|
||||||
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
|
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
|
||||||
|
parser.add_argument("--plot", help="generate a performance comparison plot and save to specified file (e.g., plot.png)")
|
||||||
|
parser.add_argument("--plot_x", help="parameter to use as x axis for plotting (default: n_depth)", default="n_depth")
|
||||||
|
parser.add_argument("--plot_log_scale", action="store_true", help="use log scale for x axis in plots (off by default)")
|
||||||
|
|
||||||
known_args, unknown_args = parser.parse_known_args()
|
known_args, unknown_args = parser.parse_known_args()
|
||||||
|
|
||||||
logging.basicConfig(level=logging.DEBUG if known_args.verbose else logging.INFO)
|
logging.basicConfig(level=logging.DEBUG if known_args.verbose else logging.INFO)
|
||||||
|
|
||||||
|
|
||||||
if known_args.check:
|
if known_args.check:
|
||||||
# Check if all required Python libraries are installed. Would have failed earlier if not.
|
# Check if all required Python libraries are installed. Would have failed earlier if not.
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
|
|
@ -499,7 +504,6 @@ else:
|
||||||
|
|
||||||
name_compare = bench_data.get_commit_name(hexsha8_compare)
|
name_compare = bench_data.get_commit_name(hexsha8_compare)
|
||||||
|
|
||||||
|
|
||||||
# If the user provided columns to group the results by, use them:
|
# If the user provided columns to group the results by, use them:
|
||||||
if known_args.show is not None:
|
if known_args.show is not None:
|
||||||
show = known_args.show.split(",")
|
show = known_args.show.split(",")
|
||||||
|
|
@ -544,6 +548,14 @@ else:
|
||||||
show.remove(prop)
|
show.remove(prop)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# Add plot_x parameter to parameters to show if it's not already present:
|
||||||
|
if known_args.plot:
|
||||||
|
for k, v in PRETTY_NAMES.items():
|
||||||
|
if v == known_args.plot_x and k not in show:
|
||||||
|
show.append(k)
|
||||||
|
break
|
||||||
|
|
||||||
rows_show = bench_data.get_rows(show, hexsha8_baseline, hexsha8_compare)
|
rows_show = bench_data.get_rows(show, hexsha8_baseline, hexsha8_compare)
|
||||||
|
|
||||||
if not rows_show:
|
if not rows_show:
|
||||||
|
|
@ -600,6 +612,161 @@ if "gpu_info" in show:
|
||||||
headers = [PRETTY_NAMES[p] for p in show]
|
headers = [PRETTY_NAMES[p] for p in show]
|
||||||
headers += ["Test", f"t/s {name_baseline}", f"t/s {name_compare}", "Speedup"]
|
headers += ["Test", f"t/s {name_baseline}", f"t/s {name_compare}", "Speedup"]
|
||||||
|
|
||||||
|
if known_args.plot:
|
||||||
|
def create_performance_plot(table_data: list[list[str]], headers: list[str], baseline_name: str, compare_name: str, output_file: str, plot_x_param: str, log_scale: bool = False):
|
||||||
|
try:
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import matplotlib
|
||||||
|
matplotlib.use('Agg')
|
||||||
|
except ImportError as e:
|
||||||
|
logger.error("matplotlib is required for --plot.")
|
||||||
|
raise e
|
||||||
|
|
||||||
|
data_headers = headers[:-4] # Exclude the last 4 columns (Test, baseline t/s, compare t/s, Speedup)
|
||||||
|
plot_x_index = None
|
||||||
|
plot_x_label = plot_x_param
|
||||||
|
|
||||||
|
if plot_x_param not in ["n_prompt", "n_gen", "n_depth"]:
|
||||||
|
pretty_name = PRETTY_NAMES.get(plot_x_param, plot_x_param)
|
||||||
|
if pretty_name in data_headers:
|
||||||
|
plot_x_index = data_headers.index(pretty_name)
|
||||||
|
plot_x_label = pretty_name
|
||||||
|
elif plot_x_param in data_headers:
|
||||||
|
plot_x_index = data_headers.index(plot_x_param)
|
||||||
|
plot_x_label = plot_x_param
|
||||||
|
else:
|
||||||
|
logger.error(f"Parameter '{plot_x_param}' not found in current table columns. Available columns: {', '.join(data_headers)}")
|
||||||
|
return
|
||||||
|
|
||||||
|
grouped_data = {}
|
||||||
|
|
||||||
|
for i, row in enumerate(table_data):
|
||||||
|
group_key_parts = []
|
||||||
|
test_name = row[-4]
|
||||||
|
|
||||||
|
base_test = ""
|
||||||
|
x_value = None
|
||||||
|
|
||||||
|
if plot_x_param in ["n_prompt", "n_gen", "n_depth"]:
|
||||||
|
for j, val in enumerate(row[:-4]):
|
||||||
|
header_name = data_headers[j]
|
||||||
|
if val is not None and str(val).strip():
|
||||||
|
group_key_parts.append(f"{header_name}={val}")
|
||||||
|
|
||||||
|
if plot_x_param == "n_prompt" and "pp" in test_name:
|
||||||
|
base_test = test_name.split("@")[0]
|
||||||
|
x_value = base_test
|
||||||
|
elif plot_x_param == "n_gen" and "tg" in test_name:
|
||||||
|
x_value = test_name.split("@")[0]
|
||||||
|
elif plot_x_param == "n_depth" and "@d" in test_name:
|
||||||
|
base_test = test_name.split("@d")[0]
|
||||||
|
x_value = int(test_name.split("@d")[1])
|
||||||
|
else:
|
||||||
|
base_test = test_name
|
||||||
|
|
||||||
|
if base_test.strip():
|
||||||
|
group_key_parts.append(f"Test={base_test}")
|
||||||
|
else:
|
||||||
|
for j, val in enumerate(row[:-4]):
|
||||||
|
if j != plot_x_index:
|
||||||
|
header_name = data_headers[j]
|
||||||
|
if val is not None and str(val).strip():
|
||||||
|
group_key_parts.append(f"{header_name}={val}")
|
||||||
|
else:
|
||||||
|
x_value = val
|
||||||
|
|
||||||
|
group_key_parts.append(f"Test={test_name}")
|
||||||
|
|
||||||
|
group_key = tuple(group_key_parts)
|
||||||
|
|
||||||
|
if group_key not in grouped_data:
|
||||||
|
grouped_data[group_key] = []
|
||||||
|
|
||||||
|
grouped_data[group_key].append({
|
||||||
|
'x_value': x_value,
|
||||||
|
'baseline': float(row[-3]),
|
||||||
|
'compare': float(row[-2]),
|
||||||
|
'speedup': float(row[-1])
|
||||||
|
})
|
||||||
|
|
||||||
|
if not grouped_data:
|
||||||
|
logger.error("No data available for plotting")
|
||||||
|
return
|
||||||
|
|
||||||
|
def make_axes(num_groups, max_cols=2, base_size=(8, 4)):
|
||||||
|
from math import ceil
|
||||||
|
cols = 1 if num_groups == 1 else min(max_cols, num_groups)
|
||||||
|
rows = ceil(num_groups / cols)
|
||||||
|
|
||||||
|
# Scale figure size by grid dimensions
|
||||||
|
w, h = base_size
|
||||||
|
fig, ax_arr = plt.subplots(rows, cols,
|
||||||
|
figsize=(w * cols, h * rows),
|
||||||
|
squeeze=False)
|
||||||
|
|
||||||
|
axes = ax_arr.flatten()[:num_groups]
|
||||||
|
return fig, axes
|
||||||
|
|
||||||
|
num_groups = len(grouped_data)
|
||||||
|
fig, axes = make_axes(num_groups)
|
||||||
|
|
||||||
|
plot_idx = 0
|
||||||
|
|
||||||
|
for group_key, points in grouped_data.items():
|
||||||
|
if plot_idx >= len(axes):
|
||||||
|
break
|
||||||
|
ax = axes[plot_idx]
|
||||||
|
|
||||||
|
try:
|
||||||
|
points_sorted = sorted(points, key=lambda p: float(p['x_value']) if p['x_value'] is not None else 0)
|
||||||
|
x_values = [float(p['x_value']) if p['x_value'] is not None else 0 for p in points_sorted]
|
||||||
|
except ValueError:
|
||||||
|
points_sorted = sorted(points, key=lambda p: group_key)
|
||||||
|
x_values = [p['x_value'] for p in points_sorted]
|
||||||
|
|
||||||
|
baseline_vals = [p['baseline'] for p in points_sorted]
|
||||||
|
compare_vals = [p['compare'] for p in points_sorted]
|
||||||
|
|
||||||
|
ax.plot(x_values, baseline_vals, 'o-', color='skyblue',
|
||||||
|
label=f'{baseline_name}', linewidth=2, markersize=6)
|
||||||
|
ax.plot(x_values, compare_vals, 's--', color='lightcoral', alpha=0.8,
|
||||||
|
label=f'{compare_name}', linewidth=2, markersize=6)
|
||||||
|
|
||||||
|
if log_scale:
|
||||||
|
ax.set_xscale('log', base=2)
|
||||||
|
unique_x = sorted(set(x_values))
|
||||||
|
ax.set_xticks(unique_x)
|
||||||
|
ax.set_xticklabels([str(int(x)) for x in unique_x])
|
||||||
|
|
||||||
|
title_parts = []
|
||||||
|
for part in group_key:
|
||||||
|
if '=' in part:
|
||||||
|
key, value = part.split('=', 1)
|
||||||
|
title_parts.append(f"{key}: {value}")
|
||||||
|
|
||||||
|
title = ', '.join(title_parts) if title_parts else "Performance comparison"
|
||||||
|
|
||||||
|
ax.set_xlabel(plot_x_label, fontsize=12, fontweight='bold')
|
||||||
|
ax.set_ylabel('Tokens per second (t/s)', fontsize=12, fontweight='bold')
|
||||||
|
ax.set_title(title, fontsize=12, fontweight='bold')
|
||||||
|
ax.legend(loc='best', fontsize=10)
|
||||||
|
ax.grid(True, alpha=0.3)
|
||||||
|
|
||||||
|
plot_idx += 1
|
||||||
|
|
||||||
|
for i in range(plot_idx, len(axes)):
|
||||||
|
axes[i].set_visible(False)
|
||||||
|
|
||||||
|
fig.suptitle(f'Performance comparison: {compare_name} vs. {baseline_name}',
|
||||||
|
fontsize=14, fontweight='bold')
|
||||||
|
fig.subplots_adjust(top=1)
|
||||||
|
|
||||||
|
plt.tight_layout()
|
||||||
|
plt.savefig(output_file, dpi=300, bbox_inches='tight')
|
||||||
|
plt.close()
|
||||||
|
|
||||||
|
create_performance_plot(table, headers, name_baseline, name_compare, known_args.plot, known_args.plot_x, known_args.plot_log_scale)
|
||||||
|
|
||||||
print(tabulate( # noqa: NP100
|
print(tabulate( # noqa: NP100
|
||||||
table,
|
table,
|
||||||
headers=headers,
|
headers=headers,
|
||||||
|
|
|
||||||
|
|
@@ -1 +1 @@
-6a7d170c04789f6ebcf320ed03c1b16973f93bd7
+8cda0a3c19f2c7dc493887353c42f6956bc268b1
@ -20,6 +20,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||||
{ LLM_ARCH_BERT, "bert" },
|
{ LLM_ARCH_BERT, "bert" },
|
||||||
{ LLM_ARCH_NOMIC_BERT, "nomic-bert" },
|
{ LLM_ARCH_NOMIC_BERT, "nomic-bert" },
|
||||||
{ LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
|
{ LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
|
||||||
|
{ LLM_ARCH_NEO_BERT, "neo-bert" },
|
||||||
{ LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
|
{ LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
|
||||||
{ LLM_ARCH_BLOOM, "bloom" },
|
{ LLM_ARCH_BLOOM, "bloom" },
|
||||||
{ LLM_ARCH_STABLELM, "stablelm" },
|
{ LLM_ARCH_STABLELM, "stablelm" },
|
||||||
|
|
@ -73,6 +74,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||||
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
|
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
|
||||||
{ LLM_ARCH_PLM, "plm" },
|
{ LLM_ARCH_PLM, "plm" },
|
||||||
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
|
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
|
||||||
|
{ LLM_ARCH_DOTS1, "dots1" },
|
||||||
|
{ LLM_ARCH_ARCEE, "arcee" },
|
||||||
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
@ -245,6 +248,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
LLM_ARCH_ARCEE,
|
||||||
|
{
|
||||||
|
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||||
|
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||||
|
{ LLM_TENSOR_OUTPUT, "output" },
|
||||||
|
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
||||||
|
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||||
|
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||||
|
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||||
|
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||||
|
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||||
|
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
|
||||||
|
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||||
|
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||||
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||||
|
},
|
||||||
|
},
|
||||||
{
|
{
|
||||||
LLM_ARCH_LLAMA4,
|
LLM_ARCH_LLAMA4,
|
||||||
{
|
{
|
||||||
|
|
@ -496,6 +517,21 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
LLM_ARCH_NEO_BERT,
|
||||||
|
{
|
||||||
|
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||||
|
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||||
|
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
||||||
|
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||||
|
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||||
|
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||||
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||||
|
{ LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" },
|
||||||
|
{ LLM_TENSOR_CLS, "cls" },
|
||||||
|
{ LLM_TENSOR_CLS_OUT, "cls.output" },
|
||||||
|
},
|
||||||
|
},
|
||||||
{
|
{
|
||||||
LLM_ARCH_JINA_BERT_V2,
|
LLM_ARCH_JINA_BERT_V2,
|
||||||
{
|
{
|
||||||
|
|
@ -1573,6 +1609,34 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||||
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
LLM_ARCH_DOTS1,
|
||||||
|
{
|
||||||
|
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||||
|
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||||
|
{ LLM_TENSOR_OUTPUT, "output" },
|
||||||
|
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||||
|
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||||
|
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
||||||
|
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||||
|
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||||
|
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||||
|
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||||
|
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||||
|
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||||
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||||
|
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||||
|
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||||
|
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
||||||
|
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||||
|
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||||
|
{ LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
|
||||||
|
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
|
||||||
|
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
|
||||||
|
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
||||||
|
{ LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
|
||||||
|
}
|
||||||
|
},
|
||||||
{
|
{
|
||||||
LLM_ARCH_UNKNOWN,
|
LLM_ARCH_UNKNOWN,
|
||||||
{
|
{
|
||||||
|
|
|
||||||
|
|
@ -24,6 +24,7 @@ enum llm_arch {
|
||||||
LLM_ARCH_BERT,
|
LLM_ARCH_BERT,
|
||||||
LLM_ARCH_NOMIC_BERT,
|
LLM_ARCH_NOMIC_BERT,
|
||||||
LLM_ARCH_NOMIC_BERT_MOE,
|
LLM_ARCH_NOMIC_BERT_MOE,
|
||||||
|
LLM_ARCH_NEO_BERT,
|
||||||
LLM_ARCH_JINA_BERT_V2,
|
LLM_ARCH_JINA_BERT_V2,
|
||||||
LLM_ARCH_BLOOM,
|
LLM_ARCH_BLOOM,
|
||||||
LLM_ARCH_STABLELM,
|
LLM_ARCH_STABLELM,
|
||||||
|
|
@ -77,6 +78,8 @@ enum llm_arch {
|
||||||
LLM_ARCH_WAVTOKENIZER_DEC,
|
LLM_ARCH_WAVTOKENIZER_DEC,
|
||||||
LLM_ARCH_PLM,
|
LLM_ARCH_PLM,
|
||||||
LLM_ARCH_BAILINGMOE,
|
LLM_ARCH_BAILINGMOE,
|
||||||
|
LLM_ARCH_DOTS1,
|
||||||
|
LLM_ARCH_ARCEE,
|
||||||
LLM_ARCH_UNKNOWN,
|
LLM_ARCH_UNKNOWN,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,14 @@
|
||||||
#include "llama-batch.h"
|
#include "llama-batch.h"
|
||||||
|
|
||||||
|
#include "llama-impl.h"
|
||||||
|
#include "llama-cparams.h"
|
||||||
|
#include "llama-vocab.h"
|
||||||
|
#include "llama-memory.h"
|
||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <sstream>
|
||||||
|
|
||||||
llama_ubatch llama_sbatch::reserve_ubatch(size_t n_ubatch, bool has_embd) {
|
llama_ubatch llama_sbatch::reserve_ubatch(size_t n_ubatch, bool has_embd) {
|
||||||
// clear empty sequences
|
// clear empty sequences
|
||||||
|
|
@ -105,12 +111,7 @@ void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & s
|
||||||
ubatch.seq_id = batch->seq_id + seq.offset;
|
ubatch.seq_id = batch->seq_id + seq.offset;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (logits_all) {
|
if (batch->logits) {
|
||||||
for (size_t i = 0; i < length; ++i) {
|
|
||||||
ubatch.output[ubatch.n_tokens + i] = 1;
|
|
||||||
out_ids.push_back(ids[seq.offset + i]);
|
|
||||||
}
|
|
||||||
} else if (batch->logits) {
|
|
||||||
if (ubatch.equal_seqs) {
|
if (ubatch.equal_seqs) {
|
||||||
for (size_t i = 0; i < length; ++i) {
|
for (size_t i = 0; i < length; ++i) {
|
||||||
size_t id = ids[seq.offset + i];
|
size_t id = ids[seq.offset + i];
|
||||||
|
|
@ -197,11 +198,10 @@ llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
|
||||||
return ubatch;
|
return ubatch;
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
|
llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split) {
|
||||||
GGML_ASSERT(batch.n_tokens >= 0);
|
GGML_ASSERT(batch.n_tokens >= 0);
|
||||||
this->batch = &batch;
|
this->batch = &batch;
|
||||||
this->n_embd = n_embd;
|
this->n_embd = n_embd;
|
||||||
this->logits_all = logits_all;
|
|
||||||
|
|
||||||
n_tokens = batch.n_tokens;
|
n_tokens = batch.n_tokens;
|
||||||
ids.resize(n_tokens);
|
ids.resize(n_tokens);
|
||||||
|
|
@ -285,17 +285,56 @@ llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0) {
|
llama_batch_allocr::llama_batch_allocr() {
|
||||||
batch = in_batch;
|
const char * LLAMA_BATCH_DEBUG = getenv("LLAMA_BATCH_DEBUG");
|
||||||
GGML_ASSERT(batch.n_tokens > 0);
|
debug = LLAMA_BATCH_DEBUG ? atoi(LLAMA_BATCH_DEBUG) : 0;
|
||||||
if (!batch.pos) {
|
|
||||||
assert(p0 >= 0);
|
seq_pos.resize(LLAMA_MAX_SEQ);
|
||||||
pos.resize(batch.n_tokens);
|
seq_cpl.resize(LLAMA_MAX_SEQ);
|
||||||
for (int32_t i = 0; i < batch.n_tokens; i++) {
|
for (auto & cur : seq_cpl) {
|
||||||
pos[i] = p0 + i;
|
cur.resize(LLAMA_MAX_SEQ);
|
||||||
}
|
|
||||||
batch.pos = pos.data();
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool llama_batch_allocr::init(
|
||||||
|
const llama_batch & batch_inp,
|
||||||
|
const llama_vocab & vocab,
|
||||||
|
const llama_memory_i * memory,
|
||||||
|
bool embd_all) {
|
||||||
|
clear();
|
||||||
|
|
||||||
|
batch = batch_inp;
|
||||||
|
|
||||||
|
GGML_ASSERT(batch.n_tokens > 0);
|
||||||
|
|
||||||
|
//
|
||||||
|
// validate input batch
|
||||||
|
//
|
||||||
|
|
||||||
|
if (batch.token) {
|
||||||
|
for (int32_t i = 0; i < batch.n_tokens; ++i) {
|
||||||
|
if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= vocab.n_tokens()) {
|
||||||
|
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (batch.seq_id) {
|
||||||
|
for (int32_t i = 0; i < batch.n_tokens; ++i) {
|
||||||
|
for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
|
||||||
|
if (batch.seq_id && (batch.seq_id[i][s] < 0 || batch.seq_id[i][s] >= LLAMA_MAX_SEQ)) {
|
||||||
|
LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d > %d\n", __func__, i, s, batch.seq_id[i][s], LLAMA_MAX_SEQ);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// auto-generate missing fields
|
||||||
|
//
|
||||||
|
|
||||||
if (!batch.n_seq_id) {
|
if (!batch.n_seq_id) {
|
||||||
n_seq_id.resize(batch.n_tokens);
|
n_seq_id.resize(batch.n_tokens);
|
||||||
for (int32_t i = 0; i < batch.n_tokens; i++) {
|
for (int32_t i = 0; i < batch.n_tokens; i++) {
|
||||||
|
|
@ -303,6 +342,7 @@ llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0
|
||||||
}
|
}
|
||||||
batch.n_seq_id = n_seq_id.data();
|
batch.n_seq_id = n_seq_id.data();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!batch.seq_id) {
|
if (!batch.seq_id) {
|
||||||
seq_id.resize(batch.n_tokens + 1);
|
seq_id.resize(batch.n_tokens + 1);
|
||||||
seq_id[batch.n_tokens] = NULL;
|
seq_id[batch.n_tokens] = NULL;
|
||||||
|
|
@ -311,10 +351,221 @@ llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0
|
||||||
}
|
}
|
||||||
batch.seq_id = seq_id.data();
|
batch.seq_id = seq_id.data();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!batch.pos) {
|
||||||
|
pos.resize(batch.n_tokens);
|
||||||
|
|
||||||
|
// initialize the starting position for each sequence based on the positions in the memory
|
||||||
|
llama_pos p0[LLAMA_MAX_SEQ];
|
||||||
|
for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
|
||||||
|
if (!memory) {
|
||||||
|
p0[s] = 0;
|
||||||
|
} else {
|
||||||
|
p0[s] = memory->seq_pos_max(s) + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int32_t i = 0; i < batch.n_tokens; i++) {
|
||||||
|
const llama_seq_id seq_id = batch.seq_id[i][0];
|
||||||
|
|
||||||
|
pos[i] = p0[seq_id];
|
||||||
|
|
||||||
|
for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
|
||||||
|
p0[batch.seq_id[i][s]] = pos[i] + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
batch.pos = pos.data();
|
||||||
|
}
|
||||||
|
|
||||||
if (!batch.logits) {
|
if (!batch.logits) {
|
||||||
logits.resize(batch.n_tokens);
|
if (embd_all) {
|
||||||
logits[logits.size() - 1] = true;
|
// return the output for all tokens
|
||||||
batch.logits = logits.data();
|
output.resize(batch.n_tokens, true);
|
||||||
|
} else {
|
||||||
|
// return the output only for the last token
|
||||||
|
output.resize(batch.n_tokens, false);
|
||||||
|
output[output.size() - 1] = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
batch.logits = output.data();
|
||||||
|
} else if (embd_all) {
|
||||||
|
bool warn = false;
|
||||||
|
|
||||||
|
for (int32_t i = 0; i < batch.n_tokens; ++i) {
|
||||||
|
if (batch.logits[i] == 0) {
|
||||||
|
warn = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (warn) {
|
||||||
|
LLAMA_LOG_WARN("%s: embeddings required but some input tokens were not marked as outputs -> overriding\n", __func__);
|
||||||
|
|
||||||
|
output.resize(batch.n_tokens, true);
|
||||||
|
batch.logits = output.data();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// compute stats
|
||||||
|
//
|
||||||
|
|
||||||
|
for (int32_t i = 0; i < batch.n_tokens; ++i) {
|
||||||
|
n_outputs += batch.logits[i] != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// determine coupled sequences
|
||||||
|
// these are pairs of sequences that have at least one token in the input batch that is assigned to both of them
|
||||||
|
for (int32_t i = 0; i < batch.n_tokens; ++i) {
|
||||||
|
for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
|
||||||
|
seq_pos[batch.seq_id[i][s]].insert(batch.pos[i]);
|
||||||
|
|
||||||
|
if (s > 0) {
|
||||||
|
const llama_seq_id s0 = batch.seq_id[i][0];
|
||||||
|
const llama_seq_id s1 = batch.seq_id[i][s];
|
||||||
|
|
||||||
|
// mark that sequence s1 is coupled to s0
|
||||||
|
seq_cpl[s1][s0] = true;
|
||||||
|
|
||||||
|
// note: the other way around is not necessary for now
|
||||||
|
//seq_cpl[s0][s1] = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (debug > 0) {
|
||||||
|
LLAMA_LOG_DEBUG("%s: input batch info:\n", __func__);
|
||||||
|
LLAMA_LOG_DEBUG("%s: n_tokens = %d\n", __func__, batch.n_tokens);
|
||||||
|
LLAMA_LOG_DEBUG("%s: token = %p\n", __func__, (void *) batch.token);
|
||||||
|
LLAMA_LOG_DEBUG("%s: embd = %p\n", __func__, (void *) batch.embd);
|
||||||
|
LLAMA_LOG_DEBUG("%s: pos = %p\n", __func__, (void *) batch.pos);
|
||||||
|
LLAMA_LOG_DEBUG("%s: n_seq_id = %p\n", __func__, (void *) batch.n_seq_id);
|
||||||
|
LLAMA_LOG_DEBUG("%s: seq_id = %p\n", __func__, (void *) batch.seq_id);
|
||||||
|
LLAMA_LOG_DEBUG("%s: logits = %p\n", __func__, (void *) batch.logits);
|
||||||
|
LLAMA_LOG_DEBUG("%s: n_outputs = %d\n", __func__, n_outputs);
|
||||||
|
|
||||||
|
if (debug > 1) {
|
||||||
|
int seq_id_max = 0;
|
||||||
|
for (int32_t i = 0; i < batch.n_tokens; ++i) {
|
||||||
|
for (int s = 0; s < batch.n_seq_id[i]; ++s) {
|
||||||
|
for (int s = 0; s < batch.n_seq_id[i]; ++s) {
|
||||||
|
seq_id_max = std::max(seq_id_max, batch.seq_id[i][s]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
++seq_id_max;
|
||||||
|
|
||||||
|
LLAMA_LOG_DEBUG("%s: token = [\n", __func__);
|
||||||
|
for (int32_t i = 0; i < batch.n_tokens; ++i) {
|
||||||
|
std::vector<int8_t> seq_id(seq_id_max);
|
||||||
|
|
||||||
|
for (int s = 0; s < batch.n_seq_id[i]; ++s) {
|
||||||
|
seq_id[batch.seq_id[i][s]] = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::stringstream ss;
|
||||||
|
for (int s = 0; s < seq_id_max; ++s) {
|
||||||
|
if (seq_id[s]) {
|
||||||
|
ss << s%10;
|
||||||
|
} else {
|
||||||
|
ss << ".";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
LLAMA_LOG_DEBUG("%s: %4d: id = %6d (%16s), pos = %4d, n_seq_id = %2d, seq_id = [%s], output = %d\n",
|
||||||
|
__func__, i, batch.token[i], vocab.token_to_piece(batch.token[i]).c_str(),
|
||||||
|
batch.pos[i], batch.n_seq_id[i], ss.str().c_str(), batch.logits[i]);
|
||||||
|
}
|
||||||
|
LLAMA_LOG_DEBUG("%s: ]\n", __func__);
|
||||||
|
|
||||||
|
LLAMA_LOG_DEBUG("%s: seq = [\n", __func__);
|
||||||
|
for (int s0 = 0; s0 < (int) seq_pos.size(); ++s0) {
|
||||||
|
if (seq_pos[s0].empty()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::stringstream ss;
|
||||||
|
for (int s1 = 0; s1 < (int) seq_cpl[s0].size(); ++s1) {
|
||||||
|
if (seq_cpl[s0][s1]) {
|
||||||
|
ss << s1 << " ";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
LLAMA_LOG_DEBUG("%s: %4d: pos = [%4d, %4d], cpl = %s\n",
|
||||||
|
__func__, s0, seq_pos_min(s0), seq_pos_max(s0), ss.str().empty() ? "-" : ss.str().c_str());
|
||||||
|
}
|
||||||
|
LLAMA_LOG_DEBUG("%s: ]\n", __func__);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// consistency checks
|
||||||
|
//
|
||||||
|
|
||||||
|
for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
|
||||||
|
if (seq_pos[s].empty()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (memory && seq_pos_min(s) != memory->seq_pos_max(s) + 1) {
|
||||||
|
LLAMA_LOG_ERROR("%s: sequence %d does not start from the last position stored in the memory\n", __func__, s);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
|
||||||
|
LLAMA_LOG_ERROR("%s: sequence %d positions are not continuous\n", __func__, s);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (memory) {
|
||||||
|
for (int32_t s0 = 0; s0 < LLAMA_MAX_SEQ; ++s0) {
|
||||||
|
for (int32_t s1 = 0; s1 < LLAMA_MAX_SEQ; ++s1) {
|
||||||
|
if (seq_cpl[s0][s1]) {
|
||||||
|
if (memory->seq_pos_min(s0) != memory->seq_pos_min(s1) ||
|
||||||
|
memory->seq_pos_max(s0) != memory->seq_pos_max(s1)) {
|
||||||
|
LLAMA_LOG_ERROR("%s: sequence %d is coupled to %d in the input batch, but have divereged\n", __func__, s0, s1);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
const llama_batch & llama_batch_allocr::get_batch() const {
|
||||||
|
return batch;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint32_t llama_batch_allocr::get_n_outputs() const {
|
||||||
|
return n_outputs;
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_pos llama_batch_allocr::seq_pos_min(llama_seq_id seq_id) const {
|
||||||
|
return seq_pos[seq_id].empty() ? -1 : *seq_pos[seq_id].begin();
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_pos llama_batch_allocr::seq_pos_max(llama_seq_id seq_id) const {
|
||||||
|
return seq_pos[seq_id].empty() ? -1 : *seq_pos[seq_id].rbegin();
|
||||||
|
}
|
||||||
|
|
||||||
|
void llama_batch_allocr::clear() {
|
||||||
|
n_outputs = 0;
|
||||||
|
|
||||||
|
batch = {};
|
||||||
|
pos.clear();
|
||||||
|
n_seq_id.clear();
|
||||||
|
seq_id.clear();
|
||||||
|
output.clear();
|
||||||
|
|
||||||
|
for (auto & cur : seq_pos) {
|
||||||
|
cur.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto & cur : seq_cpl) {
|
||||||
|
std::fill(cur.begin(), cur.end(), false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -4,6 +4,7 @@
 #include <array>
 #include <vector>
+#include <set>

 // very similar to llama_batch,
 // but has more metadata about sequences

@@ -18,8 +19,8 @@ struct llama_ubatch {
     llama_token  *  token;    // [n_tokens]
     float        *  embd;     // [n_embd, n_tokens]
     llama_pos    *  pos;      // [n_tokens]
-    int32_t      *  n_seq_id; // [n_seqs] // TODO: remove, should belong to only 1 sequence
-    llama_seq_id ** seq_id;   // [n_seqs] // TODO: become llama_seq_id * seq_id;
+    int32_t      *  n_seq_id; // [n_seqs]
+    llama_seq_id ** seq_id;   // [n_seqs]
     int8_t       *  output;   // [n_tokens]
 };

@@ -39,8 +40,6 @@ struct llama_sbatch {
     size_t n_embd;

-    bool logits_all; // TODO: remove once lctx.logits_all is removed too
-
     // sorted indices into the batch
     std::vector<int64_t> ids;
     // batch indices of the output

@@ -76,19 +75,45 @@ struct llama_sbatch {
     llama_ubatch split_seq(size_t n_ubatch);

     llama_sbatch() = default;
-    llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
+    llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false);
 };

-// temporary allocate memory for the input batch if needed
-struct llama_batch_allocr {
-    struct llama_batch batch;
+// a helper for sanitizing and fulfilling a batch
+class llama_batch_allocr {
+public:
+    llama_batch_allocr();
+
+    // sanitize and auto-gen missing data in the input batch
+    // memory is optional. if provided will be used to check for sequence continuity and to determine the positions
+    bool init(
+            const llama_batch & batch_inp,
+            const llama_vocab & vocab,
+            const llama_memory_i * memory,
+            bool embd_all);
+
+    const llama_batch & get_batch() const;
+
+    uint32_t get_n_outputs() const;
+
+    llama_pos seq_pos_min(llama_seq_id seq_id) const;
+    llama_pos seq_pos_max(llama_seq_id seq_id) const;
+
+private:
+    void clear();
+
+    llama_batch batch;
+
+    uint32_t n_outputs;

     std::array<llama_seq_id, 1> seq_id_0 = { 0 }; // default sequence id

     std::vector<llama_pos>      pos;
     std::vector<int32_t>        n_seq_id;
     std::vector<llama_seq_id *> seq_id;
-    std::vector<int8_t>         logits;
+    std::vector<int8_t>         output;

-    // optionally fulfill the batch returned by llama_batch_get_one
-    llama_batch_allocr(struct llama_batch in_batch, llama_pos p0);
+    std::vector<std::set<llama_pos>> seq_pos; // seq_pos[s]: the set of positions in sequence s
+    std::vector<std::vector<bool>>   seq_cpl; // seq_cpl[s0][s1]: if sequence s0 is coupled to sequence s1
+
+    int debug;
 };
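As an aside, a minimal usage sketch of the reworked allocator interface declared above. The identifiers batch_inp, model and memory are assumptions standing in for whatever a call site has available; the real call sites are in llama-context.cpp further down, and this is not code from the PR:

    llama_batch_allocr balloc;

    // sanitize/complete the user-provided batch; memory may be nullptr (e.g. when only encoding)
    if (!balloc.init(batch_inp, model.vocab, /*memory=*/nullptr, /*embd_all=*/false)) {
        // the batch contained invalid tokens, positions or sequence ids
        return -1;
    }

    const llama_batch & batch     = balloc.get_batch();     // fully populated view of the input
    const uint32_t      n_outputs = balloc.get_n_outputs(); // how many tokens request logits/embeddings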
@@ -183,6 +183,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_BAILING;
     } else if (tmpl_contains("<|header_start|>") && tmpl_contains("<|header_end|>")) {
         return LLM_CHAT_TEMPLATE_LLAMA4;
+    } else if (tmpl_contains("<|endofuserprompt|>")) {
+        return LLM_CHAT_TEMPLATE_DOTS1;
     }
     return LLM_CHAT_TEMPLATE_UNKNOWN;
 }

@@ -331,7 +333,7 @@ int32_t llm_chat_apply_template(
            std::string role(message->role);
            if (role == "system") {
                // there is no system message for gemma, but we will merge it with user prompt, so nothing is broken
-                system_prompt = trim(message->content);
+                system_prompt += trim(message->content);
                continue;
            }
            // in gemma, "assistant" is "model"

@@ -353,7 +355,7 @@ int32_t llm_chat_apply_template(
            std::string role(message->role);
            if (role == "system") {
                // there is no system message support, we will merge it with user prompt
-                system_prompt = message->content;
+                system_prompt += message->content;
                continue;
            } else if (role == "user") {
                ss << "Human: ";

@@ -643,6 +645,21 @@ int32_t llm_chat_apply_template(
        if (add_ass) {
            ss << "Assistant:";
        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_DOTS1) {
+        // dots.llm1.inst (DOTS1)
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << "<|system|>" << message->content << "<|endofsystem|>";
+            } else if (role == "user") {
+                ss << "<|userprompt|>" << message->content << "<|endofuserprompt|>";
+            } else {
+                ss << "<|response|>" << message->content << "<|endofresponse|>";
+            }
+        }
+        if (add_ass) {
+            ss << "<|response|>";
+        }
    } else {
        // template not supported
        return -1;
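To make the new DOTS1 branch concrete: a system message plus one user turn, with add_ass == true, would be rendered by the code above roughly as the following single string (the message contents are illustrative, not from the PR):

    <|system|>You are a helpful assistant.<|endofsystem|><|userprompt|>Hello<|endofuserprompt|><|response|>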
@@ -43,6 +43,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_BAILING,
     LLM_CHAT_TEMPLATE_LLAMA4,
     LLM_CHAT_TEMPLATE_SMOLVLM,
+    LLM_CHAT_TEMPLATE_DOTS1,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };
@@ -1,6 +1,7 @@
 #include "llama-context.h"

 #include "llama-impl.h"
+#include "llama-batch.h"
 #include "llama-io.h"
 #include "llama-memory.h"
 #include "llama-mmap.h"

@@ -18,7 +19,8 @@
 llama_context::llama_context(
         const llama_model & model,
         llama_context_params params) :
-    model(model) {
+    model(model),
+    batch_allocr(std::make_unique<llama_batch_allocr>()) {
     LLAMA_LOG_INFO("%s: constructing llama_context\n", __func__);

     t_start_us = model.t_start_us;

@@ -27,8 +29,8 @@ llama_context::llama_context(
     const auto & hparams = model.hparams;

     cparams.n_seq_max = std::max(1u, params.n_seq_max);
-    if (cparams.n_seq_max > LLAMA_MAX_PARALLEL_SEQUENCES) {
-        throw std::runtime_error("n_seq_max must be <= " + std::to_string(LLAMA_MAX_PARALLEL_SEQUENCES));
+    if (cparams.n_seq_max > LLAMA_MAX_SEQ) {
+        throw std::runtime_error("n_seq_max must be <= " + std::to_string(LLAMA_MAX_SEQ));
     }

     cparams.n_threads = params.n_threads;

@@ -494,7 +496,7 @@ float * llama_context::get_logits() {
 }

 float * llama_context::get_logits_ith(int32_t i) {
-    int32_t j = -1;
+    int64_t j = -1;

     try {
         if (logits == nullptr) {

@@ -517,7 +519,7 @@ float * llama_context::get_logits_ith(int32_t i) {
         }
         if (j >= n_outputs) {
             // This should not happen
-            throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
+            throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
         }

         return logits + j*model.vocab.n_tokens();

@@ -536,7 +538,7 @@ float * llama_context::get_embeddings() {
 }

 float * llama_context::get_embeddings_ith(int32_t i) {
-    int32_t j = -1;
+    int64_t j = -1;

     try {
         if (embd == nullptr) {

@@ -559,7 +561,7 @@ float * llama_context::get_embeddings_ith(int32_t i) {
         }
         if (j >= n_outputs) {
             // This should not happen
-            throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
+            throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
         }

         return embd + j*model.hparams.n_embd;
@@ -719,52 +721,41 @@ llm_graph_result_ptr llama_context::process_ubatch(const llama_ubatch & ubatch,
     return res;
 }

-int llama_context::encode(llama_batch & inp_batch) {
-    if (inp_batch.n_tokens == 0) {
+int llama_context::encode(const llama_batch & batch_inp) {
+    if (batch_inp.n_tokens == 0) {
         LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
         return -1;
     }

-    // temporary allocate memory for the input batch if needed
     // note: during encode, we always pass the full sequence starting from pos = 0
-    llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : 0);
+    if (!batch_allocr->init(batch_inp, model.vocab, nullptr, true)) {
+        LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
+        return -1;
+    }

-    const llama_batch & batch = batch_allocr.batch;
-    const int32_t n_tokens = batch.n_tokens;
+    const llama_batch & batch = batch_allocr->get_batch();

-    const auto & hparams = model.hparams;
+    const uint32_t n_tokens = batch.n_tokens;

     GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT

-    // TODO: move the validation to the llama_batch_allocr
-    if (batch.token) {
-        for (int32_t i = 0; i < n_tokens; ++i) {
-            if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) {
-                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
-                return -1;
-            }
-
-            if (batch.seq_id && (batch.seq_id[i][0] < 0 || batch.seq_id[i][0] >= LLAMA_MAX_PARALLEL_SEQUENCES)) {
-                LLAMA_LOG_ERROR("%s: invalid seq_id[%d] = %d > %d\n", __func__, i, batch.seq_id[i][0], LLAMA_MAX_PARALLEL_SEQUENCES);
-                throw -1;
-            }
-        }
-    }
-
     // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
-    GGML_ASSERT(cparams.n_ubatch >= (uint32_t) n_tokens && "encoder requires n_ubatch >= n_tokens");
+    GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");

     if (t_compute_start_us == 0) {
         t_compute_start_us = ggml_time_us();
     }

+    // TODO: this clear of the buffer can easily be forgotten - need something better
     embd_seq.clear();

     n_queued_tokens += n_tokens;

+    const auto & hparams = model.hparams;
+
     const int64_t n_embd = hparams.n_embd;

-    llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
+    llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true);

     const llama_ubatch ubatch = sbatch.split_simple(n_tokens);

@@ -774,7 +765,7 @@ int llama_context::encode(llama_batch & inp_batch) {
         return -2;
     };

-    for (int32_t i = 0; i < n_tokens; ++i) {
+    for (uint32_t i = 0; i < n_tokens; ++i) {
         output_ids[i] = i;
     }

@@ -830,7 +821,8 @@ int llama_context::encode(llama_batch & inp_batch) {
                 GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits

-                for (int32_t i = 0; i < n_tokens; i++) {
+                // TODO: fix indexing [UBATCH_IDX]
+                for (uint32_t i = 0; i < n_tokens; i++) {
                     const llama_seq_id seq_id = ubatch.seq_id[i][0];
                     if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
                         continue;

@@ -845,6 +837,7 @@ int llama_context::encode(llama_batch & inp_batch) {
                 auto & embd_seq_out = embd_seq;
                 const uint32_t n_cls_out = hparams.n_cls_out;

+                // TODO: fix indexing [UBATCH_IDX]
                 for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
                     const llama_seq_id seq_id = ubatch.seq_id[s][0];
                     if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {

@@ -878,10 +871,10 @@ int llama_context::encode(llama_batch & inp_batch) {
         // remember the sequence ids used during the encoding - needed for cross attention later
         cross.seq_ids_enc.resize(n_tokens);
-        for (int32_t i = 0; i < n_tokens; i++) {
+        for (uint32_t i = 0; i < n_tokens; i++) {
             cross.seq_ids_enc[i].clear();
-            for (int s = 0; s < ubatch.n_seq_id[i]; s++) {
-                llama_seq_id seq_id = ubatch.seq_id[i][s];
+            for (int s = 0; s < batch.n_seq_id[i]; s++) {
+                llama_seq_id seq_id = batch.seq_id[i][s];
                 cross.seq_ids_enc[i].insert(seq_id);
             }
         }
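The per-token validation that the old encode()/decode() bodies carried (removed above) now presumably happens once inside llama_batch_allocr::init(); its real body lives in llama-batch.cpp and is not part of this diff, so the following is only a hypothetical sketch of that kind of check:

    // hypothetical sketch - not the actual llama-batch.cpp implementation
    for (int32_t i = 0; i < batch.n_tokens; ++i) {
        if (batch.token && (batch.token[i] < 0 || (uint32_t) batch.token[i] >= vocab.n_tokens())) {
            LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
            return false; // init() reports failure instead of throwing
        }
    }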
@@ -890,51 +883,45 @@ int llama_context::encode(llama_batch & inp_batch) {
     return 0;
 }

-int llama_context::decode(llama_batch & inp_batch) {
+int llama_context::decode(const llama_batch & batch_inp) {
     if (!memory) {
         LLAMA_LOG_DEBUG("%s: cannot decode batches with this context (calling encode() instead)\n", __func__);
-        return encode(inp_batch);
+        return encode(batch_inp);
     }

-    if (inp_batch.n_tokens == 0) {
+    if (batch_inp.n_tokens == 0) {
         LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
         return -1;
     }

-    if (!inp_batch.pos) {
-        if (inp_batch.seq_id) {
-            LLAMA_LOG_ERROR("%s: pos == NULL, but seq_id != NULL\n", __func__);
-            return -1;
-        }
-    }
-
-    // temporary allocate memory for the input batch if needed
-    llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : memory->seq_pos_max(0) + 1);
-
-    const llama_batch & batch = batch_allocr.batch;
+    // when computing embeddings, all tokens are output
+    const bool embd_all = cparams.embeddings;
+
+    if (!batch_allocr->init(batch_inp, model.vocab, memory.get(), embd_all)) {
+        LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
+        return -1;
+    }
+
+    const llama_batch & batch = batch_allocr->get_batch();

     const auto & vocab   = model.vocab;
     const auto & hparams = model.hparams;

     const int32_t n_vocab = vocab.n_tokens();
+    const int64_t n_embd  = hparams.n_embd;

-    const int64_t n_tokens_all = batch.n_tokens;
-    const int64_t n_embd       = hparams.n_embd;
+    const uint32_t n_tokens_all = batch.n_tokens;

     GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT

-    // TODO: move the validation to the llama_batch_allocr
-    if (batch.token) {
-        for (int64_t i = 0; i < n_tokens_all; ++i) {
-            if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) {
-                LLAMA_LOG_ERROR("%s: invalid token[%" PRId64 "] = %d\n", __func__, i, batch.token[i]);
-                return -1;
-            }
-
-            if (batch.seq_id && (batch.seq_id[i][0] < 0 || batch.seq_id[i][0] >= LLAMA_MAX_PARALLEL_SEQUENCES)) {
-                LLAMA_LOG_ERROR("%s: invalid seq_id[%" PRId64 "] = %d >= %d\n", __func__, i, batch.seq_id[i][0], LLAMA_MAX_PARALLEL_SEQUENCES);
-                return -1;
-            }
+    const uint32_t n_outputs_all = batch_allocr->get_n_outputs();
+
+    if (embd_all) {
+        // require that all tokens are output
+        if (n_outputs_all != n_tokens_all) {
+            LLAMA_LOG_ERROR("%s: pooled embedding requires that all tokens are output (n_outputs_all = %d, n_tokens_all = %d)\n",
+                    __func__, n_outputs_all, n_tokens_all);
+            return -1;
         }
     }

@@ -947,25 +934,9 @@ int llama_context::decode(llama_batch & inp_batch) {
     }
     n_queued_tokens += n_tokens_all;

-    // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
-    const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
-
+    // TODO: this clear of the buffer can easily be forgotten - need something better
     embd_seq.clear();

-    int64_t n_outputs_all = 0;
-
-    // count outputs
-    if (batch.logits && !embd_pooled) {
-        for (uint32_t i = 0; i < n_tokens_all; ++i) {
-            n_outputs_all += batch.logits[i] != 0;
-        }
-    } else if (embd_pooled) {
-        n_outputs_all = n_tokens_all;
-    } else {
-        // keep last output only
-        n_outputs_all = 1;
-    }
-
     bool did_optimize = false;

     // handle any pending defrags/shifts
@@ -974,7 +945,7 @@ int llama_context::decode(llama_batch & inp_batch) {
     llama_memory_state_ptr mstate;

     while (true) {
-        mstate = memory->init_batch(batch, cparams.n_ubatch, embd_pooled, /* logits_all */ n_outputs_all == n_tokens_all);
+        mstate = memory->init_batch(batch, cparams.n_ubatch, embd_all);
         if (!mstate) {
             return -2;
         }

@@ -1018,7 +989,7 @@ int llama_context::decode(llama_batch & inp_batch) {

     // reserve output buffer
     if (output_reserve(n_outputs_all) < n_outputs_all) {
-        LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all);
+        LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all);
         return -2;
     };

@@ -1027,7 +998,7 @@ int llama_context::decode(llama_batch & inp_batch) {
     do {
         const auto & ubatch = mstate->get_ubatch();

-        // count the outputs in this u_batch
+        // count the outputs in this ubatch
         {
             int32_t n_outputs_new = 0;

@@ -1052,18 +1023,19 @@ int llama_context::decode(llama_batch & inp_batch) {

         if (!res) {
             // the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache
-            llama_pos pos_min[LLAMA_MAX_PARALLEL_SEQUENCES];
-            for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
+            llama_pos pos_min[LLAMA_MAX_SEQ];
+            for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
                 pos_min[s] = std::numeric_limits<llama_pos>::max();
             }

+            // TODO: fix sequence indexing
             for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
                 const auto & seq_id = ubatch.seq_id[i][0];

                 pos_min[seq_id] = std::min(pos_min[seq_id], ubatch.pos[i]);
             }

-            for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
+            for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
                 if (pos_min[s] == std::numeric_limits<llama_pos>::max()) {
                     continue;
                 }

@@ -1086,7 +1058,7 @@ int llama_context::decode(llama_batch & inp_batch) {
        //     ggml_graph_dump_dot(gf, NULL, "llama.dot");
        //}

-        auto * t_logits = cparams.embeddings ? nullptr : res->get_logits();
+        auto * t_logits = res->get_logits();
         auto * t_embd   = cparams.embeddings ? res->get_embd() : nullptr;

         if (t_embd && res->get_embd_pooled()) {
@@ -1170,14 +1142,14 @@ int llama_context::decode(llama_batch & inp_batch) {
     n_outputs = n_outputs_all;

     // set output mappings
-    {
+    if (n_outputs > 0) {
         bool sorted_output = true;

         auto & out_ids = mstate->out_ids();

-        GGML_ASSERT(out_ids.size() == (size_t) n_outputs_all);
+        GGML_ASSERT(out_ids.size() == (size_t) n_outputs);

-        for (int64_t i = 0; i < n_outputs_all; ++i) {
+        for (int64_t i = 0; i < n_outputs; ++i) {
             int64_t out_id = out_ids[i];
             output_ids[out_id] = i;
             if (out_id != i) {

@@ -1185,43 +1157,45 @@ int llama_context::decode(llama_batch & inp_batch) {
             }
         }

-        // set to total number of outputs in the batch, for use in llama_get_logits_ith
-        n_outputs = n_outputs_all;
-
         // make the outputs have the same order they had in the user-provided batch
         // note: this is mostly relevant for recurrent models atm
         if (!sorted_output) {
+            const uint32_t n_vocab = model.vocab.n_tokens();
+            const uint64_t n_embd  = model.hparams.n_embd;
+
             GGML_ASSERT((size_t) n_outputs == out_ids.size());

             // TODO: is there something more efficient which also minimizes swaps?
             // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort)
-            for (int32_t i = 0; i < n_outputs - 1; ++i) {
-                int32_t j_min = i;
-                for (int32_t j = i + 1; j < n_outputs; ++j) {
+            for (uint32_t i = 0; i < n_outputs - 1; ++i) {
+                uint32_t j_min = i;
+                for (uint32_t j = i + 1; j < n_outputs; ++j) {
                     if (out_ids[j] < out_ids[j_min]) {
                         j_min = j;
                     }
                 }
-                if (j_min == i) { continue; }
+                if (j_min == i) {
+                    continue;
+                }
                 std::swap(out_ids[i], out_ids[j_min]);
                 if (logits_size > 0) {
-                    for (int32_t k = 0; k < n_vocab; k++) {
+                    for (uint32_t k = 0; k < n_vocab; k++) {
                         std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]);
                     }
                 }
                 if (embd_size > 0) {
-                    for (int64_t k = 0; k < n_embd; k++) {
+                    for (uint32_t k = 0; k < n_embd; k++) {
                         std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]);
                     }
                 }
             }

             std::fill(output_ids.begin(), output_ids.end(), -1);
-            for (int32_t i = 0; i < n_outputs; ++i) {
+
+            for (uint32_t i = 0; i < n_outputs; ++i) {
                 output_ids[out_ids[i]] = i;
             }
         }
-
-        // sorted, so no need for the indices anymore
-        out_ids.clear();
     }

     // wait for the computation to finish (automatically done when obtaining the model output)
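The selection-sort reordering in the hunk above is small enough to show in isolation. The following is a standalone sketch of the idea rather than the llama.cpp code itself: sort out_ids into ascending order while mirroring every swap in the per-output rows of the logits buffer, so that row i ends up belonging to batch position out_ids[i].

    #include <cstdint>
    #include <utility>
    #include <vector>

    // reorder the n_outputs rows of `logits` (each n_vocab floats wide) so that,
    // after the call, row i corresponds to out_ids[i] with out_ids sorted ascending
    static void reorder_outputs(std::vector<int64_t> & out_ids, std::vector<float> & logits, size_t n_vocab) {
        const size_t n_outputs = out_ids.size();
        for (size_t i = 0; i + 1 < n_outputs; ++i) {
            size_t j_min = i;
            for (size_t j = i + 1; j < n_outputs; ++j) {
                if (out_ids[j] < out_ids[j_min]) {
                    j_min = j;
                }
            }
            if (j_min == i) {
                continue;
            }
            std::swap(out_ids[i], out_ids[j_min]);
            // mirror the swap on the heavy payload: one whole row of logits per output
            for (size_t k = 0; k < n_vocab; ++k) {
                std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]);
            }
        }
    }

Selection sort fits here because each output row is large, so minimizing the number of row swaps matters more than the O(n^2) comparisons, as the in-tree comment also notes.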
@@ -1238,7 +1212,7 @@ int llama_context::decode(llama_batch & inp_batch) {
 // output
 //

-int32_t llama_context::output_reserve(int32_t n_outputs) {
+uint32_t llama_context::output_reserve(int32_t n_outputs) {
     const auto & hparams = model.hparams;
     const auto & vocab   = model.vocab;

@@ -1248,9 +1222,8 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
     const auto n_vocab = vocab.n_tokens();
     const auto n_embd  = hparams.n_embd;

-    // TODO: use a per-batch flag for logits presence instead
-    bool has_logits = !cparams.embeddings;
-    bool has_embd   = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
+    bool has_logits = true;
+    bool has_embd   = cparams.embeddings;

     // TODO: hacky enc-dec support
     if (model.arch == LLM_ARCH_T5) {

@@ -1304,8 +1277,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
     // set all ids as invalid (negative)
     std::fill(output_ids.begin(), output_ids.end(), -1);

     this->n_outputs     = 0;
-    this->n_outputs_max = n_outputs_max;

     return n_outputs_max;
 }

@@ -1334,7 +1306,7 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
     LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);

     if (n_tokens % n_seqs != 0) {
-        n_tokens = (n_tokens / n_seqs) * n_seqs;
+        n_tokens = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // round to next multiple of n_seqs
         n_outputs = std::min(n_outputs, n_tokens);

         LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs);

@@ -1796,14 +1768,12 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {

     std::vector<int32_t> w_output_pos;

-    GGML_ASSERT(n_outputs <= n_outputs_max);
-
     w_output_pos.resize(n_outputs);

     // build a more compact representation of the output ids
     for (size_t i = 0; i < n_batch(); ++i) {
         // map an output id to a position in the batch
-        int32_t pos = output_ids[i];
+        int64_t pos = output_ids[i];
         if (pos >= 0) {
             GGML_ASSERT(pos < n_outputs);
             w_output_pos[pos] = i;

@@ -2073,14 +2043,11 @@ void llama_context::opt_epoch_iter(

         n_queued_tokens += n_tokens_all;

-        // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
-        const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
-
         embd_seq.clear();

-        int64_t n_outputs_all = n_tokens_all;
+        uint32_t n_outputs_all = n_tokens_all;

-        auto mstate = memory->init_batch(batch, cparams.n_ubatch, embd_pooled, /* logits_all */ true);
+        auto mstate = memory->init_batch(batch, cparams.n_ubatch, true);
         if (!mstate || mstate->get_status() != LLAMA_MEMORY_STATUS_SUCCESS) {
             LLAMA_LOG_ERROR("%s: could not initialize batch\n", __func__);
             break;

@@ -2088,7 +2055,7 @@ void llama_context::opt_epoch_iter(

         // reserve output buffer
         if (output_reserve(n_outputs_all) < n_outputs_all) {
-            LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all);
+            LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all);
             GGML_ABORT("TODO: handle this error");
         };
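A quick arithmetic check of the graph_reserve() rounding change above, with standalone values that are not taken from the PR: the old expression truncated n_tokens down to a multiple of n_seqs, the new one rounds it up.

    const uint32_t n_seqs   = 4;
    const uint32_t n_tokens = 10;

    const uint32_t rounded_down = (n_tokens / n_seqs) * n_seqs;                  // old behaviour: 8
    const uint32_t rounded_up   = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // new behaviour: 12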
@@ -1,7 +1,6 @@
 #pragma once

 #include "llama.h"
-#include "llama-batch.h"
 #include "llama-cparams.h"
 #include "llama-graph.h"
 #include "llama-adapter.h"

@@ -13,6 +12,7 @@
 #include <vector>

 struct llama_model;
+class llama_batch_allocr;

 class llama_io_read_i;
 class llama_io_write_i;

@@ -102,8 +102,8 @@ struct llama_context {
             llama_memory_state_i * mstate,
                       ggml_status & ret);

-    int encode(llama_batch & inp_batch);
-    int decode(llama_batch & inp_batch);
+    int encode(const llama_batch & batch_inp);
+    int decode(const llama_batch & batch_inp);

     //
     // state save/load

@@ -181,7 +181,7 @@ private:

     // Make sure enough space is available for outputs.
     // Returns max number of outputs for which space was reserved.
-    int32_t output_reserve(int32_t n_outputs);
+    uint32_t output_reserve(int32_t n_outputs);

     //
     // graph

@@ -246,8 +246,10 @@ private:
     // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
     std::map<llama_seq_id, std::vector<float>> embd_seq;

-    int32_t n_outputs     = 0; // number of actually-used outputs in the current ubatch or last logical batch
-    int32_t n_outputs_max = 0; // capacity (of tokens positions) for the output buffers
+    // reuse the batch_allocr to avoid unnecessary memory allocations
+    std::unique_ptr<llama_batch_allocr> batch_allocr;
+
+    uint32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch

     std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers

@@ -1,5 +1,5 @@
 #include "llama-cparams.h"

 size_t llama_max_parallel_sequences(void) {
-    return LLAMA_MAX_PARALLEL_SEQUENCES;
+    return LLAMA_MAX_SEQ;
 }

@@ -4,7 +4,7 @@

 #include <cstdint>

-#define LLAMA_MAX_PARALLEL_SEQUENCES 64
+#define LLAMA_MAX_SEQ 64

 struct llama_cparams {
     uint32_t n_ctx; // context size used during inference

@@ -139,6 +139,7 @@ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {

         std::vector<uint64_t> sum(n_tokens, 0);

+        // TODO: fix indexing [UBATCH_IDX]
         for (int s = 0; s < n_seqs; ++s) {
             const llama_seq_id seq_id = ubatch->seq_id[s][0];

@@ -156,6 +157,7 @@ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
             }
         }

+        // TODO: fix indexing [UBATCH_IDX]
         for (int s = 0; s < n_seqs; ++s) {
             const llama_seq_id seq_id = ubatch->seq_id[s][0];

@@ -180,6 +182,7 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
         uint32_t * data = (uint32_t *) cls->data;
         memset(cls->data, 0, n_tokens * ggml_element_size(cls));

+        // TODO: fix indexing [UBATCH_IDX]
         for (int s = 0; s < n_seqs; ++s) {
             const llama_seq_id seq_id = ubatch->seq_id[s][0];

@@ -210,6 +213,7 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
         std::vector<int> last_pos(n_tokens, -1);
         std::vector<int> last_row(n_tokens, -1);

+        // TODO: fix indexing [UBATCH_IDX]
         for (int s = 0; s < n_seqs; ++s) {
             const llama_seq_id seq_id = ubatch->seq_id[s][0];

@@ -283,6 +287,7 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
                         const int32_t ti = s0*n_seq_tokens + i;
                         float f = -INFINITY;

+                        // TODO: fix indexing [UBATCH_IDX]
                         for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) {
                             if (ubatch->seq_id[s0][s] == seq_id && ubatch->pos[ti] <= ubatch->pos[tj]) {
                                 if (hparams.use_alibi) {

@@ -322,6 +327,7 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
                         const int32_t ti = s0*n_seq_tokens + i;
                         float f = -INFINITY;

+                        // TODO: fix indexing [UBATCH_IDX]
                         for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) {
                             if (ubatch->seq_id[s0][s] == seq_id) {
                                 if (hparams.use_alibi) {

@@ -377,6 +383,7 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
         for (int j = 0; j < n_tokens; ++j) {
             for (int i = 0; i < n_enc; ++i) {
                 float f = -INFINITY;
+                // TODO: fix indexing [UBATCH_IDX]
                 for (int s = 0; s < ubatch->n_seq_id[j]; ++s) {
                     const llama_seq_id seq_id = ubatch->seq_id[j][s];
                     if (cross->seq_ids_enc[i].find(seq_id) != cross->seq_ids_enc[i].end()) {
@@ -1551,23 +1558,30 @@ void llm_graph_context::build_pooling(
                 ggml_tensor * inp_cls = build_inp_cls();
                 inp = ggml_get_rows(ctx0, inp, inp_cls);

-                if (cls != nullptr && cls_b != nullptr) {
+                if (cls) {
                     // classification head
                     // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
-                    cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls, inp), cls_b);
+                    cur = ggml_mul_mat(ctx0, cls, inp);
+                    if (cls_b) {
+                        cur = ggml_add(ctx0, cur, cls_b);
+                    }
                     cur = ggml_tanh(ctx0, cur);

                     // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
                     // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
                     if (cls_out) {
-                        GGML_ASSERT(cls_out_b != nullptr);
-                        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, cur), cls_out_b);
+                        cur = ggml_mul_mat(ctx0, cls_out, cur);
+                        if (cls_out_b) {
+                            cur = ggml_add(ctx0, cur, cls_out_b);
+                        }
                     }
                 } else if (cls_out) {
                     // Single layer classification head (direct projection)
                     // https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
-                    GGML_ASSERT(cls_out_b != nullptr);
-                    cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, inp), cls_out_b);
+                    cur = ggml_mul_mat(ctx0, cls_out, inp);
+                    if (cls_out_b) {
+                        cur = ggml_add(ctx0, cur, cls_out_b);
+                    }
                 } else {
                     GGML_ABORT("RANK pooling requires either cls+cls_b or cls_out+cls_out_b");
                 }

@@ -378,7 +378,7 @@ struct llm_graph_params {
     const llama_memory_state_i * mstate;
     const llama_cross * cross;

-    int32_t n_outputs;
+    uint32_t n_outputs;

     const llm_graph_cb & cb;
 };

@@ -412,8 +412,8 @@ struct llm_graph_context {
     const float norm_eps;
     const float norm_rms_eps;

-    const int32_t n_tokens;
-    const int32_t n_outputs;
+    const int64_t n_tokens;
+    const int64_t n_outputs;
     const int32_t n_ctx_orig; // yarn

     const enum llama_pooling_type pooling_type;
@@ -359,18 +359,16 @@ llama_pos llama_kv_cache_recurrent::seq_pos_max(llama_seq_id seq_id) const {
     return result;
 }

-llama_memory_state_ptr llama_kv_cache_recurrent::init_batch(const llama_batch & batch, uint32_t n_ubatch, bool embd_pooled, bool logits_all) {
-    GGML_UNUSED(embd_pooled);
-
-    auto sbatch = llama_sbatch(batch, hparams.n_embd, false, logits_all);
+llama_memory_state_ptr llama_kv_cache_recurrent::init_batch(const llama_batch & batch, uint32_t n_ubatch, bool embd_all) {
+    auto sbatch = llama_sbatch(batch, hparams.n_embd, false);

     std::vector<llama_ubatch> ubatches;

     while (sbatch.n_tokens > 0) {
         llama_ubatch ubatch;

-        if (embd_pooled) {
-            // Pooled embeddings cannot be split across ubatches (yet)
+        if (embd_all) {
+            // if all tokens are output, split by sequence
             ubatch = sbatch.split_seq(n_ubatch);
         } else {
             ubatch = sbatch.split_equal(n_ubatch);

@@ -32,8 +32,7 @@ public:
     llama_memory_state_ptr init_batch(
             const llama_batch & batch,
             uint32_t n_ubatch,
-            bool embd_pooled,
-            bool logits_all) override;
+            bool embd_all) override;

     llama_memory_state_ptr init_full() override;
@@ -95,36 +95,69 @@ llama_pos llama_kv_cache_unified_iswa::seq_pos_max(llama_seq_id seq_id) const {
     return kv_swa->seq_pos_max(seq_id);
 }

-llama_memory_state_ptr llama_kv_cache_unified_iswa::init_batch(const llama_batch & batch, uint32_t n_ubatch, bool embd_pooled, bool logits_all) {
-    GGML_UNUSED(embd_pooled);
+llama_memory_state_ptr llama_kv_cache_unified_iswa::init_batch(const llama_batch & batch, uint32_t n_ubatch, bool embd_all) {
+    GGML_UNUSED(embd_all);

-    // TODO: if we fail with split_simple, we should attempt different splitting strategies
+    // first try simple split
+    do {
+        auto sbatch = llama_sbatch(batch, hparams.n_embd, true);
+
+        std::vector<llama_ubatch> ubatches;
+
+        while (sbatch.n_tokens > 0) {
+            auto ubatch = sbatch.split_simple(n_ubatch);
+
+            ubatches.push_back(ubatch);
+        }
+
+        auto heads_base = kv_base->prepare(ubatches);
+        if (heads_base.empty()) {
+            break;
+        }
+
+        auto heads_swa = kv_swa->prepare(ubatches);
+        if (heads_swa.empty()) {
+            break;
+        }
+
+        assert(heads_base.size() == heads_swa.size());
+
+        return std::make_unique<llama_kv_cache_unified_iswa_state>(
+                this, std::move(sbatch), std::move(heads_base), std::move(heads_swa), std::move(ubatches));
+    } while (false);
+
+    // if it fails, try equal split
+    do {
+        auto sbatch = llama_sbatch(batch, hparams.n_embd, false);
+
+        std::vector<llama_ubatch> ubatches;
+
+        while (sbatch.n_tokens > 0) {
+            auto ubatch = sbatch.split_equal(n_ubatch);
+
+            ubatches.push_back(ubatch);
+        }
+
+        auto heads_base = kv_base->prepare(ubatches);
+        if (heads_base.empty()) {
+            break;
+        }
+
+        auto heads_swa = kv_swa->prepare(ubatches);
+        if (heads_swa.empty()) {
+            break;
+        }
+
+        assert(heads_base.size() == heads_swa.size());
+
+        return std::make_unique<llama_kv_cache_unified_iswa_state>(
+                this, std::move(sbatch), std::move(heads_base), std::move(heads_swa), std::move(ubatches));
+    } while (false);
+
+    // TODO: if we fail again, we should attempt different splitting strategies
     // but to do that properly, we first have to refactor the batches to be more flexible

-    auto sbatch = llama_sbatch(batch, hparams.n_embd, true, logits_all);
-
-    std::vector<llama_ubatch> ubatches;
-
-    while (sbatch.n_tokens > 0) {
-        auto ubatch = sbatch.split_simple(n_ubatch);
-
-        ubatches.push_back(ubatch);
-    }
-
-    auto heads_base = kv_base->prepare(ubatches);
-    if (heads_base.empty()) {
-        return std::make_unique<llama_kv_cache_unified_iswa_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
-    }
-
-    auto heads_swa = kv_swa->prepare(ubatches);
-    if (heads_swa.empty()) {
-        return std::make_unique<llama_kv_cache_unified_iswa_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
-    }
-
-    assert(heads_base.size() == heads_swa.size());
-
-    return std::make_unique<llama_kv_cache_unified_iswa_state>(
-            this, std::move(sbatch), std::move(heads_base), std::move(heads_swa), std::move(ubatches));
+    return std::make_unique<llama_kv_cache_unified_iswa_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
 }

 llama_memory_state_ptr llama_kv_cache_unified_iswa::init_full() {
@@ -34,8 +34,7 @@ public:
     llama_memory_state_ptr init_batch(
             const llama_batch & batch,
             uint32_t n_ubatch,
-            bool embd_pooled,
-            bool logits_all) override;
+            bool embd_all) override;

     llama_memory_state_ptr init_full() override;
@@ -127,6 +127,9 @@ llama_kv_cache_unified::llama_kv_cache_unified(
                 ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
                 ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
     }
+
+    const char * LLAMA_KV_CACHE_DEBUG = getenv("LLAMA_KV_CACHE_DEBUG");
+    debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
 }

 void llama_kv_cache_unified::clear(bool data) {

@@ -307,24 +310,27 @@ llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
 llama_memory_state_ptr llama_kv_cache_unified::init_batch(
             const llama_batch & batch,
             uint32_t n_ubatch,
-            bool embd_pooled,
-            bool logits_all) {
-    GGML_UNUSED(embd_pooled);
+            bool embd_all) {
+    GGML_UNUSED(embd_all);

-    auto sbatch = llama_sbatch(batch, hparams.n_embd, true, logits_all);
+    do {
+        auto sbatch = llama_sbatch(batch, hparams.n_embd, true);

         std::vector<llama_ubatch> ubatches;
         while (sbatch.n_tokens > 0) {
             ubatches.push_back(sbatch.split_simple(n_ubatch));
         }

         auto heads = prepare(ubatches);
         if (heads.empty()) {
-            return std::make_unique<llama_kv_cache_unified_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+            break;
         }

         return std::make_unique<llama_kv_cache_unified_state>(
             this, std::move(sbatch), std::move(heads), std::move(ubatches));
+    } while (false);
+
+    return std::make_unique<llama_kv_cache_unified_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
 }

 llama_memory_state_ptr llama_kv_cache_unified::init_full() {
@@ -517,36 +523,63 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
         return -1;
     }

-//#define FIND_SLOT_DEBUG 1
-#if FIND_SLOT_DEBUG
-    LLAMA_LOG_WARN("begin: n = %5d, used = %5d, head = %5d, n_swa = %5d\n", cells.used_max_p1(), cells.get_used(), head, n_swa);
+    if (debug > 0) {
+        LLAMA_LOG_DEBUG("%s: n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n", __func__, cells.used_max_p1(), cells.get_used(), head, get_size(), n_swa);

-    // for debugging
-    {
-        std::string ss;
-        if (n_swa > 0) {
+        if ((debug == 2 && n_swa > 0) || debug > 2) {
+            std::string ss;
             for (uint32_t i = 0; i < cells.size(); ++i) {
                 if (cells.is_empty(i)) {
                     ss += '.';
                 } else {
-                    ss += std::to_string(cells.seq_get(i));
+                    assert(cells.seq_count(i) >= 1);
+
+                    if (cells.seq_count(i) == 1) {
+                        ss += std::to_string(cells.seq_get(i));
+                    } else {
+                        ss += 'M';
+                    }
                 }
                 if (i%256 == 255) {
+                    ss += " *";
                     ss += '\n';
                 }
             }
-        }
-        LLAMA_LOG_WARN("\n%s\n", ss.c_str());
-    }
+            LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
+        }

-    for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
-        if (cells.seq_pos_min(s) < 0) {
-            continue;
+        if ((debug == 2 && n_swa > 0) || debug > 2) {
+            std::string ss;
+            for (uint32_t i = 0; i < cells.size(); ++i) {
+                std::string cur;
+                if (cells.is_empty(i)) {
+                    cur = '.';
+                } else {
+                    cur = std::to_string(cells.pos_get(i));
+                }
+                const int n = cur.size();
+                for (int j = 0; j < 5 - n; ++j) {
+                    cur += ' ';
+                }
+                ss += cur;
+                if (i%256 == 255) {
+                    ss += " *";
+                }
+                if (i%64 == 63) {
+                    ss += '\n';
+                }
+            }
+            LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
         }

-        LLAMA_LOG_WARN("kv_cells: n_swa = %4d, min[%d] = %5d, max[%d] = %5d\n", n_swa, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
+        for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
+            if (cells.seq_pos_min(s) < 0) {
+                continue;
+            }
+
+            LLAMA_LOG_DEBUG("%s: min[%d] = %5d, max[%d] = %5d\n", __func__, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
+        }
     }
-#endif

     uint32_t n_tested = 0;
@@ -557,21 +590,15 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
             continue;
         }

-        // keep track of what the minimum sequence positions would be if we accept the ubatch
-        llama_seq_id seq_pos_min[LLAMA_MAX_PARALLEL_SEQUENCES];
-        for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
-            seq_pos_min[s] = cells.seq_pos_min(s);
-        }
-
         bool found = true;
         for (uint32_t i = 0; i < n_tokens; i++) {
-            const llama_pos    pos    = ubatch.pos[i];
-            const llama_seq_id seq_id = ubatch.seq_id[i][0];
+            //const llama_pos    pos    = ubatch.pos[i];
+            //const llama_seq_id seq_id = ubatch.seq_id[i][0];

             // can we use this cell? either:
             // - the cell is empty
             // - the cell is occupied only by one sequence:
-            //   - mask causally, if the sequence is the same as the one we are inserting
+            //   - (disabled) mask causally, if the sequence is the same as the one we are inserting
             //   - mask SWA, using current max pos for that sequence in the cache
             //     always insert in the cell with minimum pos
             bool can_use = cells.is_empty(head_cur + i);

@@ -579,21 +606,17 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
             if (!can_use && cells.seq_count(head_cur + i) == 1) {
                 const llama_pos pos_cell = cells.pos_get(head_cur + i);

-                // causal mask
-                if (cells.seq_has(head_cur + i, seq_id)) {
-                    can_use = pos_cell >= pos;
-                }
+                // (disabled) causal mask
+                // note: it's better to purge any "future" tokens beforehand
+                //if (cells.seq_has(head_cur + i, seq_id)) {
+                //    can_use = pos_cell >= pos;
+                //}

                 if (!can_use) {
                     const llama_seq_id seq_id_cell = cells.seq_get(head_cur + i);

                     // SWA mask
-                    // note: we insert only in the cell with minimum pos in order to preserve the invariant that
-                    //       all positions between [pos_min, pos_max] for each sequence will be present in the cache
-                    //       ref: https://github.com/ggml-org/llama.cpp/pull/13746#issuecomment-2916057092
-                    if (pos_cell == seq_pos_min[seq_id_cell] &&
-                        is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
-                        seq_pos_min[seq_id_cell]++;
+                    if (is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
                         can_use = true;
                     }
                 }
@@ -621,18 +644,58 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
 }

 void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch & ubatch) {
-    for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
-        if (!cells.is_empty(head_cur + i)) {
-            cells.rm(head_cur + i);
-        }
+    if (debug > 0) {
+        LLAMA_LOG_DEBUG("%s: ubatch info:\n", __func__);
+        LLAMA_LOG_DEBUG("%s: n_tokens = %d, equal_seqs = %d\n", __func__, ubatch.n_tokens, ubatch.equal_seqs);
+        LLAMA_LOG_DEBUG("%s: n_seq_tokens = %d, n_seqs = %d\n", __func__, ubatch.n_seq_tokens, ubatch.n_seqs);
+    }

-        cells.pos_set(head_cur + i, ubatch.pos[i]);
+    // keep track of the max sequence position that we would overwrite with this ubatch
+    // for non-SWA cache, this would be always empty
+    llama_seq_id seq_pos_max_rm[LLAMA_MAX_SEQ];
+    for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
+        seq_pos_max_rm[s] = -1;
+    }

-        for (int32_t j = 0; j < ubatch.n_seq_id[i]; j++) {
-            cells.seq_add(head_cur + i, ubatch.seq_id[i][j]);
+    for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
+        for (uint32_t j = 0; j < ubatch.n_seq_tokens; ++j) {
+            const uint32_t idx = s*ubatch.n_seq_tokens + j;
+
+            if (!cells.is_empty(head_cur + idx)) {
+                assert(cells.seq_count(head_cur + idx) == 1);
+
+                const llama_seq_id seq_id = cells.seq_get(head_cur + idx);
+                const llama_pos   pos    = cells.pos_get(head_cur + idx);
+
+                seq_pos_max_rm[seq_id] = std::max(seq_pos_max_rm[seq_id], pos);
+
+                cells.rm(head_cur + idx);
+            }
+
+            cells.pos_set(head_cur + idx, ubatch.pos[idx]);
+
+            // TODO: fix indexing [UBATCH_IDX]
+            for (int32_t i = 0; i < ubatch.n_seq_id[s]; i++) {
+                cells.seq_add(head_cur + idx, ubatch.seq_id[s][i]);
+            }
         }
     }

+    // note: we want to preserve the invariant that all positions between [pos_min, pos_max] for each sequence
+    //       will be present in the cache. so we have to purge any position which is less than those we would overwrite
+    //       ref: https://github.com/ggml-org/llama.cpp/pull/13746#issuecomment-2916057092
+    for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
+        if (seq_pos_max_rm[s] == -1) {
+            continue;
+        }
+
+        if (cells.seq_pos_min(s) <= seq_pos_max_rm[s]) {
+            LLAMA_LOG_DEBUG("%s: purging positions [%d, %d] of sequence %d from KV cache\n",
+                    __func__, cells.seq_pos_min(s), seq_pos_max_rm[s], s);
+
+            seq_rm(s, cells.seq_pos_min(s), seq_pos_max_rm[s] + 1);
+        }
+    }
+
     // move the head at the end of the slot
     head = head_cur + ubatch.n_tokens;
 }
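The rewritten apply_ubatch() walks sequences first and tokens second, so the cell offset of token j of sequence s is the flat index s*n_seq_tokens + j. A minimal sketch of that iteration order (illustrative only, with toy sizes, not part of the commit):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t n_seqs = 2, n_seq_tokens = 3;

        for (uint32_t s = 0; s < n_seqs; ++s) {
            for (uint32_t j = 0; j < n_seq_tokens; ++j) {
                const uint32_t idx = s*n_seq_tokens + j; // same formula as in apply_ubatch
                printf("seq %u token %u -> cell offset %u\n", s, j, idx);
            }
        }
        return 0;
    }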
@@ -729,14 +792,14 @@ ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_
 }

 void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
-    const int64_t n_tokens     = ubatch->n_tokens;
-    const int64_t n_seq_tokens = ubatch->n_seq_tokens;
-    const int64_t n_seqs       = ubatch->n_seqs;
+    const uint32_t n_tokens     = ubatch->n_tokens;
+    const uint32_t n_seq_tokens = ubatch->n_seq_tokens;
+    const uint32_t n_seqs       = ubatch->n_seqs;

     GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
     float * data = (float *) dst->data;

-    const auto n_kv = dst->ne[0];
+    const int64_t n_kv = dst->ne[0];

     // Use only the previous KV cells of the correct sequence for each token of the ubatch.
     // It's assumed that if a token in the batch has multiple sequences, they are equivalent.

@@ -750,12 +813,14 @@ void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ub
     //   xxxxx-----
     //   xxxxx-----
     // To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
-    for (int h = 0; h < 1; ++h) {
-        for (int s = 0; s < n_seqs; ++s) {
+    for (uint32_t h = 0; h < 1; ++h) {
+        for (uint32_t s = 0; s < n_seqs; ++s) {
             const llama_seq_id seq_id = ubatch->seq_id[s][0];

-            for (int j = 0; j < n_seq_tokens; ++j) {
-                const llama_pos p1 = ubatch->pos[s*n_seq_tokens + j];
+            for (uint32_t j = 0; j < n_seq_tokens; ++j) {
+                const uint32_t idx = s*n_seq_tokens + j;
+
+                const llama_pos p1 = ubatch->pos[idx];

                 for (uint32_t i = 0; i < n_kv; ++i) {
                     float f = 0.0f;

@@ -785,16 +850,16 @@ void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ub
                         f = -INFINITY;
                     }

-                    data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
+                    data[h*(n_kv*n_tokens) + idx*n_kv + i] = f;
                 }
             }
         }

         // mask padded tokens
         if (data) {
-            for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
-                for (uint32_t j = 0; j < n_kv; ++j) {
-                    data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
+            for (uint32_t j = n_tokens; j < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++j) {
+                for (uint32_t i = 0; i < n_kv; ++i) {
+                    data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
                 }
             }
         }
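After this change the KQ mask is addressed as one contiguous row of n_kv floats per ubatch token: data[h*(n_kv*n_tokens) + idx*n_kv + i]. A small sketch of that row-major layout (illustrative only, toy sizes, -1e9f standing in for -INFINITY):

    #include <vector>
    #include <cstdio>

    int main() {
        const int n_tokens = 2, n_kv = 4, h = 0;
        std::vector<float> data(n_tokens*n_kv, 0.0f);

        // mask KV cell 3 for token 1, exactly like the rewritten loop does
        const int idx = 1, i = 3;
        data[h*(n_kv*n_tokens) + idx*n_kv + i] = -1e9f;

        printf("mask[token=1][kv=3] = %f\n", data[idx*n_kv + i]);
        return 0;
    }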
@@ -1445,9 +1510,11 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
         seq_rm(dest_seq_id, -1, -1);

         llama_sbatch sbatch;
-        llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false);
+        llama_ubatch ubatch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false);

-        batch.n_tokens = cell_count;
+        ubatch.n_tokens     = cell_count;
+        ubatch.n_seq_tokens = cell_count;
+        ubatch.n_seqs       = 1;

         for (uint32_t i = 0; i < cell_count; ++i) {
             llama_pos pos;

@@ -1467,18 +1534,18 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
                 io.read_to(&seq_id, sizeof(seq_id));
             }

-            batch.pos[i] = pos;
-            batch.n_seq_id[i] = n_seq_id;
-            batch.seq_id[i] = &dest_seq_id;
+            ubatch.pos[i]      = pos;
+            ubatch.n_seq_id[i] = n_seq_id;
+            ubatch.seq_id[i]   = &dest_seq_id;
         }

-        const auto head_cur = find_slot(batch);
+        const auto head_cur = find_slot(ubatch);
         if (head_cur < 0) {
             LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
             return false;
         }

-        apply_ubatch(head_cur, batch);
+        apply_ubatch(head_cur, ubatch);

         // keep the head at the old position because we will read the KV data into it in state_read_data()
         head = head_cur;

@@ -1486,8 +1553,8 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
         // DEBUG CHECK: head_cur should be our first cell, head_cur + cell_count - 1 should be our last cell (verify seq_id and pos values)
         // Assume that this is one contiguous block of cells
         GGML_ASSERT(head_cur + cell_count <= cells.size());
-        GGML_ASSERT(cells.pos_get(head_cur) == batch.pos[0]);
-        GGML_ASSERT(cells.pos_get(head_cur + cell_count - 1) == batch.pos[cell_count - 1]);
+        GGML_ASSERT(cells.pos_get(head_cur) == ubatch.pos[0]);
+        GGML_ASSERT(cells.pos_get(head_cur + cell_count - 1) == ubatch.pos[cell_count - 1]);
         GGML_ASSERT(cells.seq_has(head_cur, dest_seq_id));
         GGML_ASSERT(cells.seq_has(head_cur + cell_count - 1, dest_seq_id));
     } else {

@@ -1672,7 +1739,7 @@ llama_kv_cache_unified_state::llama_kv_cache_unified_state(
         llama_context * lctx,
         bool do_shift,
         defrag_info dinfo) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), lctx(lctx), do_shift(do_shift), dinfo(std::move(dinfo)) {
-    if (!do_shift && dinfo.empty()) {
+    if (!do_shift && this->dinfo.empty()) {
         status = LLAMA_MEMORY_STATUS_NO_UPDATE;
     }
 }
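The last hunk is a correctness fix: the constructor parameter dinfo is consumed by std::move() in the member initializer list, so the body must inspect the member (this->dinfo), not the moved-from parameter. A small sketch of the pitfall (hypothetical type, not the commit's code):

    #include <utility>
    #include <vector>
    #include <cstdio>

    struct state {
        std::vector<int> dinfo;

        state(std::vector<int> dinfo) : dinfo(std::move(dinfo)) {
            // reading the parameter here would inspect the moved-from vector;
            // the member is the object that actually holds the data
            printf("member empty: %d\n", (int) this->dinfo.empty());
        }
    };

    int main() {
        state s({1, 2, 3}); // prints "member empty: 0"
        return 0;
    }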
@@ -59,8 +59,7 @@ public:
     llama_memory_state_ptr init_batch(
             const llama_batch & batch,
             uint32_t n_ubatch,
-            bool embd_pooled,
-            bool logits_all) override;
+            bool embd_all) override;

     llama_memory_state_ptr init_full() override;

@@ -158,6 +157,8 @@ private:
     // SWA
     const uint32_t n_swa = 0;

+    int debug = 0;
+
     const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;

     std::vector<ggml_context_ptr> ctxs;
@@ -23,7 +23,7 @@ public:

         used.clear();

-        for (uint32_t s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
+        for (uint32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
             seq_pos[s].clear();
         }
     }

@@ -240,7 +240,7 @@ public:
     llama_seq_id seq_get(uint32_t i) const {
         assert(seq[i].count() == 1);

-        for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
+        for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
             if (seq[i].test(s)) {
                 return s;
             }

@@ -253,7 +253,7 @@ public:
     // return -1 if the sequence is not present
     llama_pos seq_pos_min(llama_seq_id seq_id) const {
         assert(seq_id >= 0);
-        assert(seq_id < LLAMA_MAX_PARALLEL_SEQUENCES);
+        assert(seq_id < LLAMA_MAX_SEQ);

         if (seq_pos[seq_id].empty()) {
             return -1;

@@ -266,7 +266,7 @@ public:
     // return -1 if the sequence is not present
     llama_pos seq_pos_max(llama_seq_id seq_id) const {
         assert(seq_id >= 0);
-        assert(seq_id < LLAMA_MAX_PARALLEL_SEQUENCES);
+        assert(seq_id < LLAMA_MAX_SEQ);

         if (seq_pos[seq_id].empty()) {
             return -1;

@@ -384,20 +384,20 @@ private:
     //
     std::vector<llama_pos> shift;

-    using bits_t = std::bitset<LLAMA_MAX_PARALLEL_SEQUENCES>;
+    using bits_t = std::bitset<LLAMA_MAX_SEQ>;

     // the bitset seq[i] tells us which sequences are currently occupying the i-th cell
     std::vector<bits_t> seq;

     // the set seq_pos[s] tells us which positions are currently present for sequence s
     // this way seq_pos[s].begin() and seq_pos[s].rbegin() give us the min/max positions currently in the cache
-    std::set<llama_pos> seq_pos[LLAMA_MAX_PARALLEL_SEQUENCES];
+    std::set<llama_pos> seq_pos[LLAMA_MAX_SEQ];

     // helper functions for updating `seq_pos`, once cell at a time:

     // remove cell i
     void seq_pos_rm(uint32_t i) {
-        for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
+        for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
             if (seq[i].test(s)) {
                 seq_pos[s].erase(pos[i]);
             }

@@ -406,7 +406,7 @@ private:

     // add cell i
     void seq_pos_add(uint32_t i) {
-        for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
+        for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
             if (seq[i].test(s)) {
                 seq_pos[s].insert(pos[i]);
             }
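The seq_pos bookkeeping that these hunks touch keeps one ordered std::set of positions per sequence, so the minimum and maximum cached position come straight from begin() and rbegin(), as the header comment says. A tiny sketch of that idea (illustrative only, not the library code):

    #include <set>
    #include <cstdio>

    int main() {
        std::set<int> seq_pos = {7, 3, 11, 5};

        // ordered set: smallest and largest cached positions without scanning
        printf("pos_min = %d, pos_max = %d\n", *seq_pos.begin(), *seq_pos.rbegin()); // 3, 11

        seq_pos.erase(3); // removing a cell updates the minimum automatically
        printf("pos_min = %d\n", *seq_pos.begin()); // 5
        return 0;
    }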
@@ -73,8 +73,7 @@ struct llama_memory_i {
     virtual llama_memory_state_ptr init_batch(
             const llama_batch & batch,
             uint32_t n_ubatch,
-            bool embd_pooled,
-            bool logits_all) = 0;
+            bool embd_all) = 0;

     // simulate full cache, used for allocating worst-case compute buffers
     virtual llama_memory_state_ptr init_full() = 0;
@@ -80,6 +80,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_40B:  return "40B";
         case LLM_TYPE_65B:  return "65B";
         case LLM_TYPE_70B:  return "70B";
+        case LLM_TYPE_142B: return "142B";
         case LLM_TYPE_236B: return "236B";
         case LLM_TYPE_290B: return "290B";
         case LLM_TYPE_314B: return "314B";
@@ -602,6 +603,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     hparams.use_kq_norm = false;
                 }
             } break;
+        case LLM_ARCH_ARCEE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                // Arcee uses the same structure as Llama
+                switch (hparams.n_layer) {
+                    case 36: type = LLM_TYPE_4B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_DECI:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -742,6 +753,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     }
                 }
             } break;
+        case LLM_ARCH_NEO_BERT:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL,            hparams.causal_attn);
+                ml.get_key(LLM_KV_POOLING_TYPE,                hparams.pooling_type);
+
+                if (hparams.n_layer == 28) {
+                    type = LLM_TYPE_250M;
+                }
+            } break;
         case LLM_ARCH_BLOOM:
            {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1480,6 +1501,20 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_DOTS1:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead);
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
+                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
+                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
+                switch (hparams.n_layer) {
+                    case 62: type = LLM_TYPE_142B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         default: throw std::runtime_error("unsupported model architecture");
     }
@@ -2223,6 +2258,32 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
                     }
                 } break;
+            case LLM_ARCH_NEO_BERT:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    cls   = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
+                    cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"),   {n_embd},         TENSOR_NOT_REQUIRED);
+
+                    cls_out   = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+                    cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"),   {hparams.n_cls_out},         TENSOR_NOT_REQUIRED);
+
+                    output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff*2}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd},   0);
+                    }
+                } break;
             case LLM_ARCH_JINA_BERT_V2:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
@@ -4207,6 +4268,89 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                     }
                 } break;
+            case LLM_ARCH_DOTS1:
+                {
+                    const int64_t n_ff_exp        = hparams.n_ff_exp;
+                    const int64_t n_expert_shared = hparams.n_expert_shared;
+
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        if (i < (int) hparams.n_layer_dense_lead) {
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+                        } else {
+                            layer.ffn_gate_inp    = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,    "weight", i), {n_embd, n_expert}, 0);
+                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias",   i), {n_expert}, TENSOR_NOT_REQUIRED);
+
+                            if (n_expert == 0) {
+                                throw std::runtime_error("n_expert must be > 0");
+                            }
+                            if (n_expert_used == 0) {
+                                throw std::runtime_error("n_expert_used must be > 0");
+                            }
+
+                            // MoE branch
+                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+
+                            // Shared expert branch
+                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_exp * n_expert_shared, n_embd}, 0);
+                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+                        }
+                    }
+                } break;
+            case LLM_ARCH_ARCEE:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+                    }
+                } break;
             default:
                 throw std::runtime_error("unknown architecture");
         }
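In the DOTS1 branch above, routed experts are stored as one 3-D tensor per projection with shape {n_embd, n_ff_exp, n_expert}, while the shared-expert projections are widened to n_ff_exp * n_expert_shared. A back-of-the-envelope sketch of those sizes, using hypothetical values rather than the real hyperparameters:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t n_embd = 4096, n_ff_exp = 1408;        // example values only
        const int64_t n_expert = 64, n_expert_shared = 2;

        const int64_t routed_elems = n_embd * n_ff_exp * n_expert;        // {n_embd, n_ff_exp, n_expert}
        const int64_t shared_elems = n_embd * n_ff_exp * n_expert_shared; // {n_embd, n_ff_exp * n_expert_shared}

        printf("routed gate/up tensor: %lld elements\n", (long long) routed_elems);
        printf("shared gate/up tensor: %lld elements\n", (long long) shared_elems);
        return 0;
    }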
@@ -6162,6 +6306,117 @@ struct llm_build_bert : public llm_graph_context {
     }
 };

+struct llm_build_neo_bert : public llm_graph_context {
+    llm_build_neo_bert(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        // construct input embeddings (token, type, position)
+        inpL = build_inp_embd(model.tok_embd);
+        cb(inpL, "inp_embd", -1);
+
+        auto * inp_attn = build_attn_inp_no_cache();
+
+        // iterate layers
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * cur = inpL;
+
+            ggml_tensor * Qcur;
+            ggml_tensor * Kcur;
+            ggml_tensor * Vcur;
+
+            // pre-norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+
+            // self-attention
+            cur = build_lora_mm(model.layers[il].wqkv, cur);
+            cb(cur, "wqkv", il);
+
+            Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+            Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+            Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            // RoPE
+            Qcur = ggml_rope_ext(
+                ctx0, Qcur, inp_pos, nullptr,
+                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow
+            );
+
+            Kcur = ggml_rope_ext(
+                ctx0, Kcur, inp_pos, nullptr,
+                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow
+            );
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(inp_attn, gf,
+                    model.layers[il].wo, nullptr,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+            cb(cur, "kqv_out", il);
+
+            if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
+                // skip computing output for unused tokens
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur  = ggml_get_rows(ctx0, cur,  inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
+            // re-add the layer input
+            cur = ggml_add(ctx0, cur, inpL);
+
+            ggml_tensor * ffn_inp = cur;
+            cb(ffn_inp, "ffn_inp", il);
+
+            // pre-norm
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            // feed-forward network
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up,
+                    NULL, NULL, NULL, NULL, NULL,
+                    model.layers[il].ffn_down,
+                    NULL, NULL, NULL,
+                    LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
+
+            // attentions bypass the intermediate layer
+            cur = ggml_add(ctx0, cur, ffn_inp);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm_enc, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_embd", -1);
+        res->t_embd = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 struct llm_build_bloom : public llm_graph_context {
     llm_build_bloom(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
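In llm_build_neo_bert above, a single fused wqkv projection is split into Q, K and V by taking views of widths n_embd, n_embd_gqa and n_embd_gqa at byte offsets 0, n_embd and n_embd + n_embd_gqa (scaled by sizeof(float)). A plain-C++ sketch of that split over one token row, with toy sizes and no ggml involved:

    #include <vector>
    #include <cstdio>

    int main() {
        const int n_embd = 8, n_embd_gqa = 4;                // toy sizes
        const int row = n_embd + 2*n_embd_gqa;               // fused [Q | K | V] row width

        std::vector<float> qkv(row);
        for (int i = 0; i < row; ++i) qkv[i] = (float) i;    // fill one token row

        const float * Q = qkv.data();                        // offset 0
        const float * K = qkv.data() + n_embd;               // offset n_embd
        const float * V = qkv.data() + n_embd + n_embd_gqa;  // offset n_embd + n_embd_gqa

        printf("Q[0]=%g K[0]=%g V[0]=%g\n", Q[0], K[0], V[0]); // 0, 8, 12
        return 0;
    }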
@@ -13421,6 +13676,291 @@ struct llm_build_bailingmoe : public llm_graph_context {
     }
 };

+struct llm_build_dots1 : public llm_graph_context {
+    llm_build_dots1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self_attention
+            {
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+                cb(Qcur, "Qcur_normed", il);
+
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+                cb(Kcur, "Kcur_normed", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // MoE branch
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            if ((uint32_t) il < hparams.n_layer_dense_lead) {
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+            } else {
+                ggml_tensor * moe_out =
+                    build_moe_ffn(cur,
+                            model.layers[il].ffn_gate_inp,
+                            model.layers[il].ffn_up_exps,
+                            model.layers[il].ffn_gate_exps,
+                            model.layers[il].ffn_down_exps,
+                            model.layers[il].ffn_exp_probs_b,
+                            n_expert, n_expert_used,
+                            LLM_FFN_SILU, hparams.expert_weights_norm,
+                            true, hparams.expert_weights_scale,
+                            (llama_expert_gating_func_type) hparams.expert_gating_func,
+                            il);
+                cb(moe_out, "ffn_moe_out", il);
+
+                {
+                    ggml_tensor * ffn_shexp = build_ffn(cur,
+                            model.layers[il].ffn_up_shexp,   NULL, NULL,
+                            model.layers[il].ffn_gate_shexp, NULL, NULL,
+                            model.layers[il].ffn_down_shexp, NULL, NULL,
+                            NULL,
+                            LLM_FFN_SILU, LLM_FFN_PAR, il);
+                    cb(ffn_shexp, "ffn_shexp", il);
+
+                    cur = ggml_add(ctx0, moe_out, ffn_shexp);
+                    cb(cur, "ffn_out", il);
+                }
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
+struct llm_build_arcee : public llm_graph_context {
+    llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            // ARCEE uses relu^2 instead of silu
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    NULL,                      NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
     llama_memory_i * res;
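The Arcee FFN above selects LLM_FFN_RELU_SQR, i.e. a relu-squared activation in place of SiLU. A tiny sketch of that activation (not from the commit, just the math it names):

    #include <algorithm>
    #include <cstdio>

    // relu^2: clamp at zero, then square
    float relu_sqr(float x) {
        const float r = std::max(x, 0.0f);
        return r * r;
    }

    int main() {
        printf("%g %g %g\n", relu_sqr(-2.0f), relu_sqr(0.5f), relu_sqr(3.0f)); // 0 0.25 9
        return 0;
    }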
@@ -13429,6 +13969,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
         case LLM_ARCH_JINA_BERT_V2:
         case LLM_ARCH_NOMIC_BERT:
         case LLM_ARCH_NOMIC_BERT_MOE:
+        case LLM_ARCH_NEO_BERT:
         case LLM_ARCH_WAVTOKENIZER_DEC:
             {
                 res = nullptr;
@@ -13538,6 +14079,10 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_bert>(*this, params, gf);
             } break;
+        case LLM_ARCH_NEO_BERT:
+            {
+                llm = std::make_unique<llm_build_neo_bert>(*this, params, gf);
+            } break;
         case LLM_ARCH_BLOOM:
             {
                 llm = std::make_unique<llm_build_bloom>(*this, params, gf);
@@ -13761,6 +14306,14 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_bailingmoe>(*this, params, gf);
             } break;
+        case LLM_ARCH_DOTS1:
+            {
+                llm = std::make_unique<llm_build_dots1>(*this, params, gf);
+            } break;
+        case LLM_ARCH_ARCEE:
+            {
+                llm = std::make_unique<llm_build_arcee>(*this, params, gf);
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ -13911,6 +14464,8 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_GRANITE_MOE:
         case LLM_ARCH_CHAMELEON:
         case LLM_ARCH_BAILINGMOE:
+        case LLM_ARCH_NEO_BERT:
+        case LLM_ARCH_ARCEE:
             return LLAMA_ROPE_TYPE_NORM;

         // the pairs of head values are offset by n_rot/2

@@ -13944,6 +14499,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_NEMOTRON:
         case LLM_ARCH_EXAONE:
         case LLM_ARCH_MINICPM3:
+        case LLM_ARCH_DOTS1:
             return LLAMA_ROPE_TYPE_NEOX;

         case LLM_ARCH_QWEN2VL:
@@ -73,6 +73,7 @@ enum llm_type {
     LLM_TYPE_40B,
     LLM_TYPE_65B,
     LLM_TYPE_70B,
+    LLM_TYPE_142B,
    LLM_TYPE_236B,
     LLM_TYPE_290B,
     LLM_TYPE_314B,
@@ -585,7 +585,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
                 gguf_set_val_f32(ctx_out.get(), o.key, o.val_f64);
             } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
-                gguf_set_val_i32(ctx_out.get(), o.key, o.val_i64);
+                // Setting type to UINT32. See https://github.com/ggml-org/llama.cpp/pull/14182 for context
+                gguf_set_val_u32(ctx_out.get(), o.key, (uint32_t)abs(o.val_i64));
             } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
                 gguf_set_val_bool(ctx_out.get(), o.key, o.val_bool);
             } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
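One consequence of the hunk above worth noting: the integer override is now written as an unsigned 32-bit key, and a negative input is folded through abs() first, so its sign is not preserved. A sketch of that behaviour with a hypothetical override value (not the library code):

    #include <cstdlib>
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t val_i64 = -7;                          // hypothetical --override-kv value
        const uint32_t stored = (uint32_t) std::llabs(val_i64);

        printf("stored as u32: %u\n", stored);               // 7
        return 0;
    }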
@@ -9,16 +9,16 @@

 #include <algorithm>
 #include <cassert>
+#include <cctype>
 #include <cfloat>
-#include <climits>
 #include <cstdarg>
 #include <cstring>
 #include <forward_list>
+#include <limits>
 #include <map>
 #include <queue>
 #include <set>
 #include <unordered_map>
-#include <cctype>

 //
 // helpers
@@ -1987,6 +1987,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<|eom_id|>"
                     || t.first == "<EOT>"
                     || t.first == "_<EOT>"
+                    || t.first == "<|end_of_text|>"
                ) {
                 special_eog_ids.insert(t.second);
                 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2572,6 +2573,10 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
     // copy piece chars to output text buffer
     // skip up to 'lstrip' leading spaces before copying
     auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
+        if (size >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
+            GGML_ABORT("invalid token size: %zu exceeds int32_t limit", size);
+        }
+
         for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
             token++;
             size--;
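The guard added above protects the int32_t return value of the copy lambda: a size_t piece length is only safe to pass through that API if it stays below INT32_MAX. A minimal sketch of the same check in isolation (illustrative, hypothetical helper name):

    #include <cstdint>
    #include <cstddef>
    #include <limits>
    #include <cstdio>

    bool fits_in_i32(size_t size) {
        // mirrors the new bound: anything at or above INT32_MAX is rejected
        return size < static_cast<size_t>(std::numeric_limits<int32_t>::max());
    }

    int main() {
        printf("%d %d\n", (int) fits_in_i32(1024), (int) fits_in_i32(SIZE_MAX)); // 1 0
        return 0;
    }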
@@ -2768,26 +2773,26 @@ void llama_vocab::impl::print_info() const {
     LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size());

     // special tokens
-    if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token[special_bos_id].text.c_str() ); }
-    if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token[special_eos_id].text.c_str() ); }
-    if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token[special_eot_id].text.c_str() ); }
-    if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token[special_eom_id].text.c_str() ); }
-    if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token[special_unk_id].text.c_str() ); }
-    if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token[special_sep_id].text.c_str() ); }
-    if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token[special_pad_id].text.c_str() ); }
-    if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token[special_mask_id].text.c_str() ); }
+    if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token.at(special_bos_id).text.c_str() ); }
+    if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token.at(special_eos_id).text.c_str() ); }
+    if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token.at(special_eot_id).text.c_str() ); }
+    if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token.at(special_eom_id).text.c_str() ); }
+    if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token.at(special_unk_id).text.c_str() ); }
+    if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token.at(special_sep_id).text.c_str() ); }
+    if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token.at(special_pad_id).text.c_str() ); }
+    if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token.at(special_mask_id).text.c_str() ); }

-    if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token[linefeed_id].text.c_str() ); }
+    if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token.at(linefeed_id).text.c_str() ); }

-    if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token[special_fim_pre_id].text.c_str() ); }
-    if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token[special_fim_suf_id].text.c_str() ); }
-    if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token[special_fim_mid_id].text.c_str() ); }
-    if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token[special_fim_pad_id].text.c_str() ); }
-    if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token[special_fim_rep_id].text.c_str() ); }
-    if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token[special_fim_sep_id].text.c_str() ); }
+    if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); }
+    if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); }
+    if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); }
+    if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); }
+    if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); }
+    if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); }

     for (const auto & id : special_eog_ids) {
-        LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token[id].text.c_str() );
+        LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token.at(id).text.c_str() );
     }

     LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
@@ -198,14 +198,18 @@ static struct llama_model * llama_model_load_from_file_impl(
     // if using single GPU mode, remove all except the main GPU
     if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
-        if (params.main_gpu < 0 || params.main_gpu >= (int)model->devices.size()) {
-            LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %d)\n", __func__, params.main_gpu, (int)model->devices.size());
-            llama_model_free(model);
-            return nullptr;
+        if (params.main_gpu < 0) {
+            model->devices.clear();
+        } else {
+            if (params.main_gpu >= (int)model->devices.size()) {
+                LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %zu)\n", __func__, params.main_gpu, model->devices.size());
+                llama_model_free(model);
+                return nullptr;
+            }
+            ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
+            model->devices.clear();
+            model->devices.push_back(main_gpu);
         }
-        ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
-        model->devices.clear();
-        model->devices.push_back(main_gpu);
     }

     for (auto * dev : model->devices) {
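The hunk above changes the meaning of a negative main_gpu under LLAMA_SPLIT_MODE_NONE: it now requests a CPU-only run (the device list is cleared) instead of being treated as an error, while an out-of-range index is still rejected. A standalone, simplified sketch of that selection rule (hypothetical helper, not the library code):

    #include <vector>
    #include <string>
    #include <cstdio>

    // returns false on an invalid index; devices is reduced to at most one entry
    bool select_main_gpu(std::vector<std::string> & devices, int main_gpu) {
        if (main_gpu < 0) {
            devices.clear();            // no GPU requested
            return true;
        }
        if (main_gpu >= (int) devices.size()) {
            return false;               // invalid index
        }
        const std::string chosen = devices[main_gpu];
        devices.clear();
        devices.push_back(chosen);
        return true;
    }

    int main() {
        std::vector<std::string> devs = {"gpu0", "gpu1"};
        select_main_gpu(devs, 1);
        printf("selected: %s\n", devs[0].c_str()); // gpu1
        return 0;
    }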
@@ -42,6 +42,34 @@ function(llama_test target)
     set_property(TEST ${TEST_NAME} PROPERTY LABELS ${LLAMA_TEST_LABEL})
 endfunction()

+function(llama_test_cmd target)
+    include(CMakeParseArguments)
+    set(options)
+    set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
+    set(multiValueArgs ARGS)
+    cmake_parse_arguments(LLAMA_TEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+    if (NOT DEFINED LLAMA_TEST_LABEL)
+        set(LLAMA_TEST_LABEL "main")
+    endif()
+    if (NOT DEFINED LLAMA_TEST_WORKING_DIRECTORY)
+        set(LLAMA_TEST_WORKING_DIRECTORY .)
+    endif()
+    if (DEFINED LLAMA_TEST_NAME)
+        set(TEST_NAME ${LLAMA_TEST_NAME})
+    else()
+        set(TEST_NAME ${target})
+    endif()
+
+    add_test(
+        NAME ${TEST_NAME}
+        WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY}
+        COMMAND ${target}
+        ${LLAMA_TEST_ARGS})
+
+    set_property(TEST ${TEST_NAME} PROPERTY LABELS ${LLAMA_TEST_LABEL})
+endfunction()
+
 # Builds and runs a test source file.
 # Optional args:
 # - NAME: name of the executable & test target (defaults to the source file name without extension)
@ -83,25 +111,31 @@ endfunction()
 # build test-tokenizer-0 target once and add many tests
 llama_build(test-tokenizer-0.cpp)
 
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bert-bge.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-phi-3.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen2.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-bert-bge.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-command-r.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-deepseek-coder.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-deepseek-llm.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-falcon.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-gpt-2.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-bpe.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-spm.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-mpt.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3 ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-phi-3.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-qwen2.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-refact.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-starcoder.gguf)
 
-# TODO: missing HF tokenizer for this model in convert_hf_to_gguf_update.py, see https://github.com/ggml-org/llama.cpp/pull/13847
-# llama_test(test-tokenizer-0 NAME test-tokenizer-0-nomic-bert-moe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-nomic-bert-moe.gguf)
+if (NOT WIN32)
+    llama_test_cmd(
+        ${CMAKE_CURRENT_SOURCE_DIR}/test-tokenizers-repo.sh
+        NAME test-tokenizers-ggml-vocabs
+        WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}
+        ARGS https://huggingface.co/ggml-org/vocabs ${PROJECT_SOURCE_DIR}/models/ggml-vocabs
+    )
+endif()
 
 if (LLAMA_LLGUIDANCE)
-    llama_build_and_test(test-grammar-llguidance.cpp ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
+    llama_build_and_test(test-grammar-llguidance.cpp ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-bpe.gguf)
 endif ()
 
 if (NOT WIN32 OR NOT BUILD_SHARED_LIBS)
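The targets above are plain CTest entries, so they can be driven with ctest once the project is built. A minimal sketch, assuming an out-of-tree build directory named build (the directory name is an assumption, not part of the patch):

    cmake -B build && cmake --build build --config Release
    ctest --test-dir build -R test-tokenizer-0-llama-bpe -V
    ctest --test-dir build -R test-tokenizers-ggml-vocabs -V   # this one clones https://huggingface.co/ggml-org/vocabs on first run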
@ -113,8 +147,8 @@ if (NOT WIN32 OR NOT BUILD_SHARED_LIBS)
     llama_build_and_test(test-chat.cpp)
     # TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
     if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
-        llama_build_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
-        target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../tools/server)
+        llama_build_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
+        target_include_directories(test-json-schema-to-grammar PRIVATE ${PROJECT_SOURCE_DIR}/tools/server)
     endif()
 
     if (NOT GGML_BACKEND_DL)
@ -127,20 +161,20 @@ if (NOT WIN32 OR NOT BUILD_SHARED_LIBS)
     llama_build(test-tokenizer-1-bpe.cpp)
 
     # TODO: disabled due to slowness
-    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
-    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
-    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
-    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
-    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
-    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
-    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
-    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-aquila.gguf)
+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-falcon.gguf)
+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-2 ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-gpt-2.gguf)
+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-gpt-neox.gguf)
+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-bpe.gguf --ignore-merges)
+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-mpt.gguf)
+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-refact.gguf)
+    #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-starcoder.gguf)
 
     # build test-tokenizer-1-spm target once and add many tests
     llama_build(test-tokenizer-1-spm.cpp)
 
-    llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
-    #llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
+    llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-spm.gguf)
+    #llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-baichuan.gguf)
 
     # llama_build_and_test(test-double-float.cpp) # SLOW
 endif()
@ -151,6 +185,8 @@ llama_build_and_test(test-json-partial.cpp)
 llama_build_and_test(test-log.cpp)
 llama_build_and_test(test-regex-partial.cpp)
 
+llama_build_and_test(test-thread-safety.cpp ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf -ngl 99 -p "The meaning of life is" -n 128 -c 256 -ub 32 -np 4)
+
 # this fails on windows (github hosted runner) due to curl DLL not found (exit code 0xc0000135)
 if (NOT WIN32)
     llama_build_and_test(test-arg-parser.cpp)
@ -0,0 +1,152 @@
// thread safety test
// - Loads a copy of the same model on each GPU, plus a copy on the CPU
// - Creates n_parallel (--parallel) contexts per model
// - Runs inference in parallel on each context

#include <thread>
#include <vector>
#include <atomic>
#include "llama.h"
#include "arg.h"
#include "common.h"
#include "log.h"
#include "sampling.h"

int main(int argc, char ** argv) {
    common_params params;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }

    common_init();

    llama_backend_init();
    llama_numa_init(params.numa);

    LOG_INF("%s\n", common_params_get_system_info(params).c_str());

    //llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
    //    if (level == GGML_LOG_LEVEL_ERROR) {
    //        common_log_add(common_log_main(), level, "%s", text);
    //    }
    //}, NULL);

    auto cparams = common_context_params_to_llama(params);

    int dev_count = ggml_backend_dev_count();
    int gpu_dev_count = 0;
    for (int i = 0; i < dev_count; ++i) {
        auto * dev = ggml_backend_dev_get(i);
        if (dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
            gpu_dev_count++;
        }
    }
    const int num_models = gpu_dev_count + 1 + 1; // GPUs + 1 CPU model + 1 layer split
    //const int num_models = std::max(1, gpu_dev_count);
    const int num_contexts = std::max(1, params.n_parallel);

    std::vector<llama_model_ptr> models;
    std::vector<std::thread> threads;
    std::atomic<bool> failed = false;

    for (int m = 0; m < num_models; ++m) {
        auto mparams = common_model_params_to_llama(params);

        if (m < gpu_dev_count) {
            mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
            mparams.main_gpu = m;
        } else if (m == gpu_dev_count) {
            mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
            mparams.main_gpu = -1; // CPU model
        } else {
            mparams.split_mode = LLAMA_SPLIT_MODE_LAYER;
        }

        llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
        if (model == NULL) {
            LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
            return 1;
        }

        models.emplace_back(model);
    }

    for (int m = 0; m < num_models; ++m) {
        auto * model = models[m].get();
        for (int c = 0; c < num_contexts; ++c) {
            threads.emplace_back([&, m, c, model]() {
                LOG_INF("Creating context %d/%d for model %d/%d\n", c + 1, num_contexts, m + 1, num_models);

                llama_context_ptr ctx { llama_init_from_model(model, cparams) };
                if (ctx == NULL) {
                    LOG_ERR("failed to create context\n");
                    failed.store(true);
                    return;
                }

                std::unique_ptr<common_sampler, decltype(&common_sampler_free)> sampler { common_sampler_init(model, params.sampling), common_sampler_free };
                if (sampler == NULL) {
                    LOG_ERR("failed to create sampler\n");
                    failed.store(true);
                    return;
                }

                llama_batch batch = {};
                {
                    auto prompt = common_tokenize(ctx.get(), params.prompt, true);
                    if (prompt.empty()) {
                        LOG_ERR("failed to tokenize prompt\n");
                        failed.store(true);
                        return;
                    }
                    batch = llama_batch_get_one(prompt.data(), prompt.size());
                    if (llama_decode(ctx.get(), batch)) {
                        LOG_ERR("failed to decode prompt\n");
                        failed.store(true);
                        return;
                    }
                }

                const auto * vocab = llama_model_get_vocab(model);
                std::string result = params.prompt;

                for (int i = 0; i < params.n_predict; i++) {
                    llama_token token;
                    if (batch.n_tokens > 0) {
                        token = common_sampler_sample(sampler.get(), ctx.get(), batch.n_tokens - 1);
                    } else {
                        token = llama_vocab_bos(vocab);
                    }

                    result += common_token_to_piece(ctx.get(), token);

                    if (llama_vocab_is_eog(vocab, token)) {
                        break;
                    }

                    batch = llama_batch_get_one(&token, 1);
                    if (llama_decode(ctx.get(), batch)) {
                        LOG_ERR("Model %d/%d, Context %d/%d: failed to decode\n", m + 1, num_models, c + 1, num_contexts);
                        failed.store(true);
                        return;
                    }
                }

                LOG_INF("Model %d/%d, Context %d/%d: %s\n\n", m + 1, num_models, c + 1, num_contexts, result.c_str());
            });
        }
    }

    for (auto & thread : threads) {
        thread.join();
    }

    if (failed) {
        LOG_ERR("One or more threads failed.\n");
        return 1;
    }

    LOG_INF("All threads finished without errors.\n");
    return 0;
}
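The CMake entry above registers this binary with the arguments shown there; the same invocation works by hand. A sketch, assuming the binary ends up under build/bin (the path is an assumption that depends on the build setup):

    ./build/bin/test-thread-safety -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf \
        -ngl 99 -p "The meaning of life is" -n 128 -c 256 -ub 32 -np 4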
@ -0,0 +1,36 @@
#!/bin/bash

if [ $# -lt 2 ]; then
    printf "Usage: $0 <git-repo> <target-folder> [<test-exe>]\n"
    exit 1
fi

if [ $# -eq 3 ]; then
    toktest=$3
else
    toktest="./test-tokenizer-0"
fi

if [ ! -x $toktest ]; then
    printf "Test executable \"$toktest\" not found!\n"
    exit 1
fi

repo=$1
folder=$2

if [ -d $folder ] && [ -d $folder/.git ]; then
    (cd $folder; git pull)
else
    git clone $repo $folder
fi

shopt -s globstar
for gguf in $folder/**/*.gguf; do
    if [ -f $gguf.inp ] && [ -f $gguf.out ]; then
        $toktest $gguf
    else
        printf "Found \"$gguf\" without matching inp/out files, ignoring...\n"
    fi
done
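The script takes the vocab repository, a checkout folder, and optionally the tokenizer test executable (it defaults to ./test-tokenizer-0 in the current directory). A manual run might look like the following sketch, where the binary path is an assumption:

    ./tests/test-tokenizers-repo.sh https://huggingface.co/ggml-org/vocabs ./models/ggml-vocabs ./build/bin/test-tokenizer-0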
@ -187,7 +187,7 @@ struct clip_hparams {
     float eps = 1e-6;
     float rope_theta = 0.0;
 
-    std::vector<int32_t> image_grid_pinpoints;
+    std::vector<clip_image_size> image_res_candidates; // for llava-uhd style models
     int32_t image_crop_resolution;
     std::unordered_set<int32_t> vision_feature_layer;
     int32_t attn_window_size = 0;
@ -2109,8 +2109,7 @@ struct clip_model_loader {
         if (is_vision) {
             get_u32(KEY_IMAGE_SIZE, hparams.image_size);
             get_u32(KEY_PATCH_SIZE, hparams.patch_size);
             get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
-            get_arr_int(KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints, false);
             get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
 
         } else if (is_audio) {
@ -2120,6 +2119,20 @@ struct clip_model_loader {
             GGML_ASSERT(false && "unknown modality");
         }
 
+        // for pinpoints, we need to convert it into a list of resolution candidates
+        {
+            std::vector<int> pinpoints;
+            get_arr_int(KEY_IMAGE_GRID_PINPOINTS, pinpoints, false);
+            if (!pinpoints.empty()) {
+                for (size_t i = 0; i < pinpoints.size(); i += 2) {
+                    hparams.image_res_candidates.push_back({
+                        pinpoints[i],
+                        pinpoints[i+1],
+                    });
+                }
+            }
+        }
+
         // default warmup value
         hparams.warmup_image_size = hparams.image_size;
@ -2231,16 +2244,7 @@ struct clip_model_loader {
                 {
                     hparams.rope_theta = 10000.0f;
                     get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor);
-
-                    // borrowed from llava-1.6
-                    const int isize = hparams.image_size;
-                    hparams.image_grid_pinpoints = {
-                        isize,   isize*2, // 336, 672
-                        isize*2, isize,   // 672, 336
-                        isize*2, isize*2, // 672, 672
-                        isize*3, isize,   // 1008, 336
-                        isize,   isize*3, // 336, 1008
-                    };
+                    set_llava_uhd_res_candidates(model, 3);
                 } break;
             case PROJECTOR_TYPE_ULTRAVOX:
             case PROJECTOR_TYPE_QWEN2A:
@ -2674,6 +2678,21 @@ struct clip_model_loader {
             output[i] = values[i];
         }
     }
 
+    void set_llava_uhd_res_candidates(clip_model & model, const int max_patches_per_side) {
+        auto & hparams = model.hparams;
+        for (int x = 1; x <= max_patches_per_side; x++) {
+            for (int y = 1; y <= max_patches_per_side; y++) {
+                if (x == 1 && y == 1) {
+                    continue; // skip the first point
+                }
+                hparams.image_res_candidates.push_back(clip_image_size{
+                    x*hparams.image_size,
+                    y*hparams.image_size,
+                });
+            }
+        }
+    }
 };
 
 struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params) {
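To make the candidate list concrete: with max_patches_per_side = 3 the loop above yields 8 resolutions, every (x, y) grid except 1x1 scaled by image_size. A small shell sketch, assuming image_size = 336 (the value quoted in the removed llava-1.6 comment):

    image_size=336   # assumption taken from the old llava-1.6 comment
    for x in 1 2 3; do
        for y in 1 2 3; do
            if [ "$x" -eq 1 ] && [ "$y" -eq 1 ]; then continue; fi   # skip the 1x1 base point, as in the C++ loop
            echo "$((x * image_size)) x $((y * image_size))"
        done
    done
    # -> 336 x 672, 336 x 1008, 672 x 336, 672 x 672, 672 x 1008, 1008 x 336, 1008 x 672, 1008 x 1008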
@ -3028,36 +3047,41 @@ struct llava_uhd {
         bool padding_refined = false; // if true, refine image will be padded to the grid size (e.g. llava-1.6)
     };
 
-    static int get_max_slices(struct clip_ctx * ctx) {
-        if (clip_is_minicpmv(ctx)) {
-            return 9;
-        }
-        return 0;
-    }
-
     static slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) {
         slice_instructions res;
         const int patch_size = clip_get_patch_size(ctx);
         const int slice_size = clip_get_image_size(ctx);
-        const int max_slice_nums = get_max_slices(ctx);
         const int original_width = original_size.width;
         const int original_height = original_size.height;
-        const float log_ratio = log((float)original_width / original_height);
-        const float ratio = (float)original_width * original_height / (slice_size * slice_size);
-        const int multiple = fmin(ceil(ratio), max_slice_nums);
-        const bool has_slices = (multiple > 1);
-        const bool has_pinpoints = !ctx->model.hparams.image_grid_pinpoints.empty();
+
+        const bool has_slices = original_size.width > slice_size || original_size.height > slice_size;
+        const bool has_pinpoints = !ctx->model.hparams.image_res_candidates.empty();
+
+        if (!has_slices) {
+            // skip slicing logic
+            res.overview_size = clip_image_size{slice_size, slice_size};
+            res.refined_size = clip_image_size{0, 0};
+            res.grid_size = clip_image_size{0, 0};
+
+            return res;
+        }
 
         if (has_pinpoints) {
             // has pinpoints, use them to calculate the grid size (e.g. llava-1.6)
             auto refine_size = llava_uhd::select_best_resolution(
-                ctx->model.hparams.image_grid_pinpoints,
-                original_size);
+                original_size,
+                ctx->model.hparams.image_res_candidates);
             res.overview_size = clip_image_size{slice_size, slice_size};
             res.refined_size = refine_size;
             res.grid_size = clip_image_size{0, 0};
             res.padding_refined = true;
+
+            LOG_DBG("%s: using pinpoints for slicing\n", __func__);
+            LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d\n",
+                    __func__, original_width, original_height,
+                    res.overview_size.width, res.overview_size.height,
+                    res.refined_size.width, res.refined_size.height);
 
             for (int y = 0; y < refine_size.height; y += slice_size) {
                 for (int x = 0; x < refine_size.width; x += slice_size) {
                     slice_coordinates slice;
@ -3066,13 +3090,16 @@ struct llava_uhd {
                     slice.size.width = std::min(slice_size, refine_size.width - x);
                     slice.size.height = std::min(slice_size, refine_size.height - y);
                     res.slices.push_back(slice);
-                    if (x == 0) {
-                        res.grid_size.width++;
-                    }
+                    LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
+                            __func__, (int)res.slices.size() - 1,
+                            slice.x, slice.y, slice.size.width, slice.size.height);
                 }
-                res.grid_size.height++;
             }
+
+            res.grid_size.height = refine_size.height / slice_size;
+            res.grid_size.width = refine_size.width / slice_size;
+            LOG_DBG("%s: grid size: %d x %d\n", __func__, res.grid_size.width, res.grid_size.height);
 
             return res;
         }
@ -3081,17 +3108,23 @@ struct llava_uhd {
         auto best_size = get_best_resize(original_size, slice_size, patch_size, !has_slices);
         res.overview_size = best_size;
 
-        if (!has_slices) {
-            // skip slicing logic
-            res.refined_size = clip_image_size{0, 0};
-            res.grid_size = clip_image_size{0, 0};
+        {
+            const int max_slice_nums = 9; // TODO: this is only used by minicpmv, maybe remove it
+            const float log_ratio = log((float)original_width / original_height);
+            const float ratio = (float)original_width * original_height / (slice_size * slice_size);
+            const int multiple = fmin(ceil(ratio), max_slice_nums);
 
-        } else {
             auto best_grid = get_best_grid(max_slice_nums, multiple, log_ratio);
             auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true);
             res.grid_size = best_grid;
             res.refined_size = refine_size;
+
+            LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
+                    __func__, original_width, original_height,
+                    res.overview_size.width, res.overview_size.height,
+                    res.refined_size.width, res.refined_size.height,
+                    res.grid_size.width, res.grid_size.height);
 
             int width = refine_size.width;
             int height = refine_size.height;
             int grid_x = int(width / best_grid.width);
|
||||||
slice.size.width = grid_x;
|
slice.size.width = grid_x;
|
||||||
slice.size.height = grid_y;
|
slice.size.height = grid_y;
|
||||||
res.slices.push_back(slice);
|
res.slices.push_back(slice);
|
||||||
// LOG_INF("slice %d: %d %d %d %d\n", ic, patches_i, patches_j, grid_x, grid_y);
|
LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
|
||||||
|
__func__, (int)res.slices.size() - 1,
|
||||||
|
slice.x, slice.y, slice.size.width, slice.size.height);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -3166,48 +3201,55 @@ private:
         return res;
     }
 
+    static clip_image_size resize_maintain_aspect_ratio(const clip_image_size & orig, const clip_image_size & target_max) {
+        float scale_width = static_cast<float>(target_max.width) / orig.width;
+        float scale_height = static_cast<float>(target_max.height) / orig.height;
+        float scale = std::min(scale_width, scale_height);
+        return clip_image_size{
+            static_cast<int>(orig.width * scale),
+            static_cast<int>(orig.height * scale),
+        };
+    }
+
     /**
      * Selects the best resolution from a list of possible resolutions based on the original size.
      *
+     * For example, when given a list of resolutions:
+     *  - 100x100
+     *  - 200x100
+     *  - 100x200
+     *  - 200x200
+     *
+     * And an input image of size 111x200, then 100x200 is the best fit (least wasted resolution).
+     *
      * @param original_size The original size of the image
      * @param possible_resolutions A list of possible resolutions
     * @return The best fit resolution
      */
     static clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector<clip_image_size> & possible_resolutions) {
-        int original_width = original_size.width;
-        int original_height = original_size.height;
         clip_image_size best_fit;
+        int min_wasted_area = std::numeric_limits<int>::max();
         int max_effective_resolution = 0;
-        int min_wasted_resolution = std::numeric_limits<int>::max();
 
-        for (const auto & resolution : possible_resolutions) {
-            int width = resolution.width;
-            int height = resolution.height;
-            float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height);
-            int downscaled_width = static_cast<int>(original_width * scale);
-            int downscaled_height = static_cast<int>(original_height * scale);
-            int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
-            int wasted_resolution = (width * height) - effective_resolution;
-            // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
-            if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
+        for (const clip_image_size & candidate : possible_resolutions) {
+            auto target_size = resize_maintain_aspect_ratio(original_size, candidate);
+            int effective_resolution = std::min(
+                target_size.width * target_size.height,
+                original_size.width * original_size.height);
+            int wasted_area = (candidate.width * candidate.height) - effective_resolution;
+
+            if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_area < min_wasted_area)) {
                 max_effective_resolution = effective_resolution;
-                min_wasted_resolution = wasted_resolution;
-                best_fit = resolution;
+                min_wasted_area = wasted_area;
+                best_fit = candidate;
             }
+
+            LOG_DBG("%s: candidate: %d x %d, target: %d x %d, wasted: %d, effective: %d\n", __func__, candidate.width, candidate.height, target_size.width, target_size.height, wasted_area, effective_resolution);
         }
 
         return best_fit;
     }
 
-    // used by llava 1.6 with custom list of pinpoints
-    static clip_image_size select_best_resolution(const std::vector<int32_t> & pinpoints, const clip_image_size & original_size) {
-        std::vector<clip_image_size> possible_resolutions; // TODO @ngxson : construct this inside hparams, not here
-        for (size_t i = 0; i < pinpoints.size(); i += 2) {
-            possible_resolutions.push_back(clip_image_size{pinpoints[i], pinpoints[i+1]});
-        }
-        return select_best_resolution(original_size, possible_resolutions);
-    }
-
     static int ensure_divide(int length, int patch_size) {
         return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size);
     }
@ -3331,7 +3373,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         return true;
 
     } else if (ctx->proj_type() == PROJECTOR_TYPE_LLAMA4) {
-        GGML_ASSERT(!params.image_grid_pinpoints.empty());
+        GGML_ASSERT(!params.image_res_candidates.empty());
         auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
         std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
@ -3371,7 +3413,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         res_imgs->entries.push_back(std::move(res));
         return true;
 
-    } else if (!params.image_grid_pinpoints.empty()) {
+    } else if (!params.image_res_candidates.empty()) {
         // "spatial_unpad" with "anyres" processing for llava-1.6
         auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
         std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
@ -3431,17 +3473,6 @@ const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
     return ctx->model.hparams.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD ? "spatial_unpad" : "flat";
 }
 
-const int32_t * clip_image_grid(const struct clip_ctx * ctx) {
-    if (ctx->model.hparams.image_grid_pinpoints.size()) {
-        return &ctx->model.hparams.image_grid_pinpoints.front();
-    }
-    return nullptr;
-}
-
-size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
-    return ctx->model.hparams.image_grid_pinpoints.size();
-}
-
 int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
     const auto & params = ctx->model.hparams;
     const int n_total = clip_n_output_tokens(ctx, img);
@ -46,9 +46,6 @@ int32_t clip_get_hidden_size(const struct clip_ctx * ctx);
 // TODO: should be enum, not string
 const char * clip_patch_merge_type(const struct clip_ctx * ctx);
 
-const int32_t * clip_image_grid(const struct clip_ctx * ctx);
-size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
-
 int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
 
 // for M-RoPE, this will be the number of token positions in X and Y directions
@ -501,7 +501,10 @@ struct mtmd_tokenizer {
             || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
             || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
             ) {
+            const int n_col = batch_f32.grid_x;
+            const int n_row = batch_f32.grid_y;
             // split batch into chunks of single images
+            // NOTE: batch_f32 will be invalidated after this call
             auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmap->id);
             GGML_ASSERT(chunks.size() > 0);
@ -521,8 +524,7 @@ struct mtmd_tokenizer {
 
             // add slices (or tiles)
             if (!chunks.empty()) {
-                const int n_col = batch_f32.grid_x;
-                const int n_row = batch_f32.grid_y;
+                GGML_ASSERT((int)chunks.size() == n_row * n_col);
                 if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
                     add_text({ctx->tok_slices_start});
                 }

Binary file not shown.
@ -88,6 +88,26 @@ enum error_type {
     ERROR_TYPE_NOT_SUPPORTED, // custom error
 };
 
+static bool server_task_type_need_embd(server_task_type task_type) {
+    switch (task_type) {
+        case SERVER_TASK_TYPE_EMBEDDING:
+        case SERVER_TASK_TYPE_RERANK:
+            return true;
+        default:
+            return false;
+    }
+}
+
+static bool server_task_type_need_logits(server_task_type task_type) {
+    switch (task_type) {
+        case SERVER_TASK_TYPE_COMPLETION:
+        case SERVER_TASK_TYPE_INFILL:
+            return true;
+        default:
+            return false;
+    }
+}
+
 struct slot_params {
     bool stream = true;
     bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
@ -233,6 +253,7 @@ struct server_task {
         slot_params defaults;
         defaults.sampling    = params_base.sampling;
         defaults.speculative = params_base.speculative;
+        defaults.n_keep      = params_base.n_keep;
 
         // enabling this will output extra debug information in the HTTP responses from the server
         params.verbose = params_base.verbosity > 9;
@ -1329,13 +1350,24 @@ struct server_slot {
         n_draft_accepted = 0;
     }
 
-    bool is_non_causal() const {
-        return task_type == SERVER_TASK_TYPE_EMBEDDING || task_type == SERVER_TASK_TYPE_RERANK;
+    bool need_embd() const {
+        return server_task_type_need_embd(task_type);
+    }
+
+    bool need_logits() const {
+        return server_task_type_need_logits(task_type);
+    }
+
+    // if the context does not have a memory module then all embeddings have to be computed within a single ubatch
+    // also we cannot split if the pooling would require any past tokens
+    bool can_split() const {
+        return
+            !need_embd() ||
+            (llama_get_memory(ctx) && llama_pooling_type(ctx) == LLAMA_POOLING_TYPE_LAST);
     }
 
     bool can_batch_with(server_slot & other_slot) const {
-        return is_non_causal() == other_slot.is_non_causal()
-            && are_lora_equal(lora, other_slot.lora);
+        return task_type == other_slot.task_type && are_lora_equal(lora, other_slot.lora);
     }
 
     bool has_budget(const common_params & global_params) {
@ -1479,7 +1511,6 @@ struct server_slot {
             {"n_ctx",         n_ctx},
             {"speculative",   can_speculate()},
             {"is_processing", is_processing()},
-            {"non_causal",    is_non_causal()},
             {"params",        params.to_json()},
             {"prompt",        prompt_tokens.detokenize(ctx, true)},
             {"next_token",
@ -2016,11 +2047,6 @@ struct server_context {
                 params_base.n_cache_reuse = 0;
                 SRV_WRN("%s\n", "cache_reuse is not supported by this context, it will be disabled");
             }
-
-            if (!params_base.speculative.model.path.empty()) {
-                SRV_ERR("%s\n", "err: speculative decode is not supported by this context");
-                return false;
-            }
         }
 
         return true;
@ -2060,6 +2086,7 @@ struct server_context {
             SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
 
             slot.params.sampling = params_base.sampling;
+            slot.params.n_keep   = params_base.n_keep;
 
             slot.callback_on_release = [this](int) {
                 queue_tasks.pop_deferred_task();
@ -2733,6 +2760,7 @@ struct server_context {
                     queue_tasks.defer(std::move(task));
                     break;
                 }
+
                 if (slot->is_processing()) {
                     // if requested slot is unavailable, we defer this task for processing later
                     SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
@ -3095,7 +3123,14 @@ struct server_context {
                     continue;
                 }
 
-                if (slot.is_non_causal()) {
+                // TODO: support memory-less logits computation
+                if (slot.need_logits() && !llama_get_memory(ctx)) {
+                    slot.release();
+                    send_error(slot, "the current context does not logits computation. skipping", ERROR_TYPE_SERVER);
+                    continue;
+                }
+
+                if (!slot.can_split()) {
                     if (slot.n_prompt_tokens > n_ubatch) {
                         slot.release();
                         send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER);
@ -3220,7 +3255,7 @@ struct server_context {
                     }
 
                     const auto n_swa = llama_model_n_swa(model);
-                    if (pos_min > slot.n_past - n_swa) {
+                    if (pos_min > std::max(0, slot.n_past - n_swa)) {
                         SLT_WRN(slot, "n_past = %d, cache_tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", slot.n_past, (int) slot.cache_tokens.size(), slot.id, pos_min, n_swa);
                         SLT_WRN(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA, see %s)\n",
                             "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
@ -3230,8 +3265,7 @@ struct server_context {
                 }
 
                 if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0) {
-                    // we have to evaluate at least 1 token to generate logits.
-                    SLT_WRN(slot, "need to evaluate at least 1 token to generate logits, n_past = %d, n_prompt_tokens = %d\n", slot.n_past, slot.n_prompt_tokens);
+                    SLT_WRN(slot, "need to evaluate at least 1 token for each active slot, n_past = %d, n_prompt_tokens = %d\n", slot.n_past, slot.n_prompt_tokens);
 
                     slot.n_past--;
                 }
@ -3239,8 +3273,7 @@ struct server_context {
                     slot.n_prompt_tokens_processed = 0;
                 }
 
-                // non-causal tasks require to fit the entire prompt in the physical batch
-                if (slot.is_non_causal()) {
+                if (!slot.can_split()) {
                     // cannot fit the prompt in the current batch - will try next iter
                     if (batch.n_tokens + slot.n_prompt_tokens > n_batch) {
                         continue;
@ -3262,8 +3295,7 @@ struct server_context {
                 slot.cache_tokens.keep_first(slot.n_past);
 
                 // check if we should process the image
-                if (slot.n_past < slot.n_prompt_tokens
-                        && slot.prompt_tokens[slot.n_past] == LLAMA_TOKEN_NULL) {
+                if (slot.n_past < slot.n_prompt_tokens && slot.prompt_tokens[slot.n_past] == LLAMA_TOKEN_NULL) {
                     // process the image
                     int32_t new_n_past;
                     int32_t res = slot.prompt_tokens.process_chunk(ctx, mctx, slot.n_past, slot.id, new_n_past);
@ -3294,8 +3326,8 @@ struct server_context {
                         break; // end of text chunk
                     }
 
-                    // without pooling, we want to output the embeddings for all the tokens in the batch
-                    const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE;
+                    // embedding requires all tokens in the batch to be output
+                    const bool need_embd = server_task_type_need_embd(slot.task_type);
 
                     common_batch_add(batch, cur_tok, slot.n_past, { slot.id }, need_embd);
                     slot.cache_tokens.push_back(cur_tok);
@ -3349,17 +3381,15 @@ struct server_context {
         SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens);
 
         if (slot_batched) {
-            // make sure we're in the right embedding mode
-            llama_set_embeddings(ctx, slot_batched->is_non_causal());
             // apply lora, only need to do it once per batch
             common_set_adapter_lora(ctx, slot_batched->lora);
-        }
 
-        const bool do_encode = (params_base.embedding || params_base.reranking);
+            llama_set_embeddings(ctx, slot_batched->need_embd());
+        }
 
         // pad the batch so that batch.n_tokens >= n_slots
         // TODO: temporary workaround for https://github.com/ggml-org/llama.cpp/issues/13689
-        if (do_encode) {
+        if (slot_batched->need_embd()) {
             const int n_slots = slots.size();
 
             if (batch.n_tokens < n_slots) {
@ -3381,8 +3411,11 @@ struct server_context {
                 SRV_WRN("adding %d dummy tokens to the batch, seq_id = %d\n", n_add, seq_id);
 
                 for (int j = 0; j < n_add; ++j) {
-                    common_batch_add(batch, 0, j, { seq_id }, false);
+                    common_batch_add(batch, 0, j, { seq_id }, true);
                 }
+
+                slots[seq_id].cache_tokens.clear();
+                llama_memory_seq_rm(llama_get_memory(ctx), seq_id, -1, -1);
             }
         }
@ -4177,11 +4210,6 @@ int main(int argc, char ** argv) {
             oaicompat_type oaicompat) -> void {
         GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL);
 
-        if (ctx_server.params_base.embedding) {
-            res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
-            return;
-        }
-
         auto completion_id = gen_chatcmplid();
         std::unordered_set<int> task_ids;
         try {
@ -4436,12 +4464,8 @@ int main(int argc, char ** argv) {
                 OAICOMPAT_TYPE_NONE); // infill is not OAI compatible
     };
 
-    const auto handle_chat_completions = [&ctx_server, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
+    const auto handle_chat_completions = [&ctx_server, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
         LOG_DBG("request: %s\n", req.body.c_str());
-        if (ctx_server.params_base.embedding) {
-            res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
-            return;
-        }
 
         auto body = json::parse(req.body);
         std::vector<raw_buffer> files;
@ -4569,13 +4593,18 @@ int main(int argc, char ** argv) {
     };
 
     const auto handle_embeddings_impl = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res, oaicompat_type oaicompat) {
-        const json body = json::parse(req.body);
+        if (!ctx_server.params_base.embedding) {
+            res_error(res, format_error_response("This server does not support embeddings. Start it with `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
+            return;
+        }
 
         if (oaicompat != OAICOMPAT_TYPE_NONE && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) {
             res_error(res, format_error_response("Pooling type 'none' is not OAI compatible. Please use a different pooling type", ERROR_TYPE_INVALID_REQUEST));
             return;
         }
 
+        const json body = json::parse(req.body);
+
         // for the shape of input/content, see tokenize_input_prompts()
         json prompt;
         if (body.count("input") != 0) {
@ -4665,8 +4694,8 @@ int main(int argc, char ** argv) {
     };
 
     const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
-        if (!ctx_server.params_base.reranking || ctx_server.params_base.embedding) {
-            res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking` and without `--embedding`", ERROR_TYPE_NOT_SUPPORTED));
+        if (!ctx_server.params_base.embedding || ctx_server.params_base.pooling_type != LLAMA_POOLING_TYPE_RANK) {
+            res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
             return;
         }
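These handler checks tie each endpoint to how the server was launched: embeddings require the embedding path to be enabled, and reranking additionally requires pooling type RANK. A rough sketch of the corresponding launch commands, where the model files are placeholders and the flags are taken from the error messages above:

    llama-server -m embedding-model.gguf --embeddings   # enables the embeddings endpoints
    llama-server -m rerank-model.gguf --reranking       # reranking path; expects a model set up with pooling type RANK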
@ -4881,7 +4910,9 @@ int main(int argc, char ** argv) {
     };
 
     bool was_bound = false;
+    bool is_sock   = false;
     if (string_ends_with(std::string(params.hostname), ".sock")) {
+        is_sock = true;
         LOG_INF("%s: setting address family to AF_UNIX\n", __func__);
         svr->set_address_family(AF_UNIX);
         // bind_to_port requires a second arg, any value other than 0 should
@ -4959,7 +4990,9 @@ int main(int argc, char ** argv) {
     SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif
 
-    LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
+    LOG_INF("%s: server is listening on %s - starting the main loop\n", __func__,
+            is_sock ? string_format("unix://%s", params.hostname.c_str()).c_str() :
+                      string_format("http://%s:%d", params.hostname.c_str(), params.port).c_str());
 
     // this call blocks the main thread until queue_tasks.terminate() is called
     ctx_server.queue_tasks.start_loop();
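Since the hostname check keys off a .sock suffix, the new unix:// log line can be exercised by passing a socket path as the host. A sketch, with the socket path as an assumption:

    llama-server -m model.gguf --host /tmp/llama-server.sock
    curl --unix-socket /tmp/llama-server.sock http://localhost/health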
@ -41,6 +41,10 @@ html {
     max-width: 900px;
 }
 
+.chat-bubble {
+    @apply break-words;
+}
+
 .chat-bubble-base-300 {
     --tw-bg-opacity: 1;
     --tw-text-opacity: 1;