From 1a95cf32745ca3d75a3a09c948812f093113f1a0 Mon Sep 17 00:00:00 2001 From: Yuta Hayashibe Date: Sat, 24 Feb 2024 20:25:07 +0900 Subject: [PATCH 01/26] Add --eot_line option --- run.cc | 19 ++++++++++++++++--- util/app.h | 6 ++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/run.cc b/run.cc index 87d8445..526ea8f 100644 --- a/run.cc +++ b/run.cc @@ -79,7 +79,9 @@ void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) { void ReplGemma(gcpp::Gemma& model, hwy::ThreadPool& pool, hwy::ThreadPool& inner_pool, const InferenceArgs& args, - int verbosity, const gcpp::AcceptFunc& accept_token) { + int verbosity, const gcpp::AcceptFunc& accept_token, + std::string &eot_line +) { PROFILER_ZONE("Gen.misc"); int abs_pos = 0; // absolute token index over all turns int current_pos = 0; // token index within the current turn @@ -137,7 +139,18 @@ void ReplGemma(gcpp::Gemma& model, hwy::ThreadPool& pool, if (verbosity >= 1) { std::cout << "> " << std::flush; } - std::getline(std::cin, prompt_string); + + if (eot_line.size() == 0) { + std::getline(std::cin, prompt_string); + } else { + std::string line; + while (std::getline(std::cin, line)) { + if (line == eot_line) { + break; + } + prompt_string += line + "\n"; + } + } } if (std::cin.fail() || prompt_string == "%q" || prompt_string == "%Q") { @@ -231,7 +244,7 @@ void Run(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) { } ReplGemma(model, pool, inner_pool, inference, app.verbosity, - /*accept_token=*/[](int) { return true; }); + /*accept_token=*/[](int) { return true; }, app.eot_line); } } // namespace gcpp diff --git a/util/app.h b/util/app.h index 966fa41..8eb672b 100644 --- a/util/app.h +++ b/util/app.h @@ -62,6 +62,7 @@ class AppArgs : public ArgsBase { Path log; // output int verbosity; size_t num_threads; + std::string eot_line; template void ForEach(const Visitor& visitor) { @@ -77,6 +78,11 @@ class AppArgs : public ArgsBase { "estimate of " "how many concurrent threads 
are supported.", 2); + visitor(eot_line, "eot_line", std::string(""), + "End of turn line. " + "When you specify this, the prompt will be all lines " + "before the line where only the given string appears.", + 2); } }; From 5fe31ad0bc6e4a2e0ecca57ff591ae23cf4ea16d Mon Sep 17 00:00:00 2001 From: Dan Zheng Date: Sat, 24 Feb 2024 12:54:47 -0800 Subject: [PATCH 02/26] Copybara: rename BUILD to BUILD.bazel. PiperOrigin-RevId: 610039263 --- run.cc | 5 ----- 1 file changed, 5 deletions(-) diff --git a/run.cc b/run.cc index 96ba316..87d8445 100644 --- a/run.cc +++ b/run.cc @@ -144,11 +144,6 @@ void ReplGemma(gcpp::Gemma& model, hwy::ThreadPool& pool, return; } - if (prompt_string == "%c" || prompt_string == "%C") { - abs_pos = 0; - continue; - } - if (model.model_training == ModelTraining::GEMMA_IT) { // For instruction-tuned models: add control tokens. prompt_string = "user\n" + prompt_string + From 3af439621e360b54c6cad660c54928a40a92d68c Mon Sep 17 00:00:00 2001 From: Dan Zheng Date: Sat, 24 Feb 2024 14:52:59 -0800 Subject: [PATCH 03/26] Rename BUILD to BUILD.bazel. (#36) * Rename BUILD to BUILD.bazel. This fixes an error on macOS due to `build` and `BUILD` having conflicting names. * Enable macos-latest in GitHub Actions CI. * Fix concurrency key in GitHub Actions. Use matrix configuration in concurrency key. --- .github/workflows/build.yml | 8 +++--- BUILD.bazel | 51 ------------------------------------- 2 files changed, 4 insertions(+), 55 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 929e140..b0d4b6e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,11 +1,11 @@ -name: Build +name: build # Trigger on push or via manual dispatch. 
on: [push, workflow_dispatch] jobs: build: - runs-on: ${{matrix.os}} + runs-on: ${{ matrix.os }} name: ${{ matrix.os }} ${{ matrix.type }} timeout-minutes: 30 @@ -13,10 +13,10 @@ jobs: fail-fast: false matrix: type: ['Release'] - os: ['ubuntu-latest'] + os: ['ubuntu-latest', 'macos-latest'] concurrency: - group: ${{ github.workflow }}-${{ github.ref }} + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os }}-${{ matrix.type }} cancel-in-progress: true steps: diff --git a/BUILD.bazel b/BUILD.bazel index 190690b..18dad30 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -114,54 +114,3 @@ cc_binary( "//:thread_pool", ], ) - -# copybara:strip_begin -cc_binary( - name = "run_csv", - srcs = [ - "run_csv.cc", - ], - deps = [ - ":app", - ":args", - ":gemma_lib", - "//compression:compress", - # copybara:import_next_line:hwy - "//:hwy", - # copybara:import_next_line:hwy - "//:nanobenchmark", - # copybara:import_next_line:hwy - "//:profiler", - # copybara:import_next_line:hwy - "//:thread_pool", - "//third_party/riegeli/bytes:file_reader", - "//third_party/riegeli/bytes:file_writer", - "//third_party/riegeli/csv:csv_reader", - "//third_party/riegeli/csv:csv_writer", - ], -) - -gensignature( - name = "gemma_sign", - srcs = [":gemma"], -) - -cc_test( - name = "benchmarks", - size = "large", - srcs = [ - "benchmarks.cc", - ], - tags = ["notap"], - deps = [ - ":app", - ":gemma_lib", - "//third_party/benchmark", - # copybara:import_next_line:hwy - "//:hwy", - # copybara:import_next_line:hwy - "//:thread_pool", - ], -) - -# copybara:strip_end From 621434e424d5f35b2913d867ddd7781b1c6cb1ad Mon Sep 17 00:00:00 2001 From: Naoki Kishida Date: Sun, 25 Feb 2024 07:21:01 +0900 Subject: [PATCH 04/26] reset conversation (#34) --- run.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/run.cc b/run.cc index 87d8445..96ba316 100644 --- a/run.cc +++ b/run.cc @@ -144,6 +144,11 @@ void ReplGemma(gcpp::Gemma& model, hwy::ThreadPool& pool, return; } + if (prompt_string == "%c" || 
prompt_string == "%C") { + abs_pos = 0; + continue; + } + if (model.model_training == ModelTraining::GEMMA_IT) { // For instruction-tuned models: add control tokens. prompt_string = "user\n" + prompt_string + From 84444c93a44f484442fda2523dde7e77dbd3a53c Mon Sep 17 00:00:00 2001 From: Dan Zheng Date: Sat, 24 Feb 2024 15:14:53 -0800 Subject: [PATCH 05/26] Revert "Copybara configuration update." This reverts commit c03b5da542ef19f65a4147a52ccac7c89334e7f3. Restore lost changes due to improper Copybara syncing. --- .github/workflows/build.yml | 4 +- CMakeLists.txt | 9 ++-- CMakePresets.json | 59 ++++++++++++++++++++++ README.md | 42 +++++++++++++--- compression/blob_store.cc | 97 +++++++++++++++++++++++++++++++++---- util/app.h | 2 + 6 files changed, 191 insertions(+), 22 deletions(-) create mode 100644 CMakePresets.json diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b0d4b6e..da63c1c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,7 +1,7 @@ name: build -# Trigger on push or via manual dispatch. -on: [push, workflow_dispatch] +# Trigger on push, pull request, or via manual dispatch. +on: [push, pull_request, workflow_dispatch] jobs: build: diff --git a/CMakeLists.txt b/CMakeLists.txt index 3858968..c7828cc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,7 +24,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG da250571a45826b21eebbddc1e50d0c1137dee5f) FetchContent_MakeAvailable(highway) -## Note: absl meeds tp be installed by sentencepiece. This will only happen if +## Note: absl needs to be installed by sentencepiece. 
This will only happen if ## cmake is invoked with -DSPM_ENABLE_SHARED=OFF and -DSPM_ABSL_PROVIDER=module FetchContent_Declare(sentencepiece GIT_REPOSITORY https://github.com/google/sentencepiece GIT_TAG 53de76561cfc149d3c01037f0595669ad32a5e7c) FetchContent_MakeAvailable(sentencepiece) @@ -43,14 +43,13 @@ set(SOURCES util/args.h ) -add_compile_options($<$:-O2>) if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE "Release") endif() # Allowable types for WEIGHT_TYPE: # float - slow, not recommended -# hwy::bfloat16_t - bfloat16 as impemented by https://github.com/google/highway +# hwy::bfloat16_t - bfloat16 as implemented by https://github.com/google/highway # SfpStream - 8-bit switched floating point (recommended) # NuqStream - experimental, work-in-progress option(WEIGHT_TYPE "Set weight type" "") @@ -68,6 +67,8 @@ target_link_libraries(gemma hwy hwy_contrib sentencepiece) target_include_directories(gemma PRIVATE ./) FetchContent_GetProperties(sentencepiece) target_include_directories(gemma PRIVATE ${sentencepiece_SOURCE_DIR}) +target_compile_definitions(gemma PRIVATE $<$:_CRT_SECURE_NO_WARNINGS NOMINMAX>) +target_compile_options(gemma PRIVATE $<$:-Wno-deprecated-declarations>) ## Library Target @@ -77,3 +78,5 @@ set_target_properties(libgemma PROPERTIES PREFIX "") target_include_directories(libgemma PUBLIC ./) target_link_libraries(libgemma hwy hwy_contrib sentencepiece) target_include_directories(libgemma PRIVATE ${sentencepiece_SOURCE_DIR}) +target_compile_definitions(libgemma PRIVATE $<$:_CRT_SECURE_NO_WARNINGS NOMINMAX>) +target_compile_options(libgemma PRIVATE $<$:-Wno-deprecated-declarations>) diff --git a/CMakePresets.json b/CMakePresets.json new file mode 100644 index 0000000..5fe13c8 --- /dev/null +++ b/CMakePresets.json @@ -0,0 +1,59 @@ +{ + "version": 3, + "cmakeMinimumRequired": { + "major": 3, + "minor": 11, + "patch": 0 + }, + "configurePresets": [ + { + "name": "__defaults__", + "hidden": true, + "binaryDir": "${sourceDir}/build" + }, + { + "name": 
"make", + "inherits": "__defaults__", + "displayName": "Make", + "description": "Unix Makefiles", + "generator": "Unix Makefiles", + "binaryDir": "${sourceDir}/build" + }, + { + "name": "windows", + "inherits": "__defaults__", + "displayName": "Windows", + "description": "Visual Studio 2022 with Clang/LLVM frontend", + "generator": "Visual Studio 17 2022", + "toolset": "ClangCL", + "condition": { + "type": "equals", + "lhs": "${hostSystemName}", + "rhs": "Windows" + } + } + ], + "buildPresets": [ + { + "name": "__defaults__", + "hidden": true, + "targets": [ + "gemma", + "libgemma" + ] + }, + { + "name": "make", + "inherits": "__defaults__", + "displayName": "Unix Makefiles", + "configurePreset": "make" + }, + { + "name": "windows", + "inherits": "__defaults__", + "displayName": "Windows", + "configuration": "Release", + "configurePreset": "windows" + } + ] + } diff --git a/README.md b/README.md index e278833..ff1011b 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,16 @@ Before starting, you should have installed: least C++17. - `tar` for extracting archives from Kaggle. +Building natively on Windows requires the Visual Studio 2012 Build Tools with the +optional Clang/LLVM C++ frontend (`clang-cl`). This can be installed from the +command line with +[`winget`](https://learn.microsoft.com/en-us/windows/package-manager/winget/): + +```sh +winget install --id Kitware.CMake +winget install --id Microsoft.VisualStudio.2022.BuildTools --force --override "--passive --wait --add Microsoft.VisualStudio.Workload.VCTools;installRecommended --add Microsoft.VisualStudio.Component.VC.Llvm.Clang --add Microsoft.VisualStudio.Component.VC.Llvm.ClangToolset" +``` + ### Step 1: Obtain model weights and tokenizer from Kaggle Visit [the Gemma model page on @@ -107,6 +117,7 @@ runtime, create a build directory and generate the build files using `cmake` from the top-level project directory. 
For the 8-bit switched floating point weights (sfp), run cmake with no options: +#### Unix-like Platforms ```sh cmake -B build ``` @@ -126,17 +137,18 @@ your weights, you can enter the `build/` directory and run `make` to build the `./gemma` executable: ```sh -cd build -make -j [number of parallel threads to use] gemma +# Configure `build` directory +cmake --preset make + +# Build project using make +cmake --build --preset make -j [number of parallel threads to use] ``` Replace `[number of parallel threads to use]` with a number - the number of -cores available on your system is a reasonable heuristic. - -For example, `make -j4 gemma` will build using 4 threads. If this is successful, -you should now have a `gemma` executable in the `build/` directory. If the -`nproc` command is available, you can use `make -j$(nproc) gemma` as a -reasonable default for the number of threads. +cores available on your system is a reasonable heuristic. For example, +`make -j4 gemma` will build using 4 threads. If the `nproc` command is +available, you can use `make -j$(nproc) gemma` as a reasonable default +for the number of threads. If you aren't sure of the right value for the `-j` flag, you can simply run `make gemma` instead and it should still build the `./gemma` executable. @@ -145,6 +157,20 @@ If you aren't sure of the right value for the `-j` flag, you can simply run > On Windows Subsystem for Linux (WSL) users should set the number of > parallel threads to 1. Using a larger number may result in errors. +If the build is successful, you should now have a `gemma` executable in the `build/` directory. + +#### Windows + +```sh +# Configure `build` directory +cmake --preset windows + +# Build project using Visual Studio Build Tools +cmake --build --preset windows -j [number of parallel threads to use] +``` + +If the build is successful, you should now have a `gemma.exe` executable in the `build/` directory. 
+ ### Step 4: Run You can now run `gemma` from inside the `build/` directory. diff --git a/compression/blob_store.cc b/compression/blob_store.cc index 8d6c1d0..550c727 100644 --- a/compression/blob_store.cc +++ b/compression/blob_store.cc @@ -16,11 +16,16 @@ // copybara:import_next_line:gemma_cpp #include "compression/blob_store.h" -#include // open #include #include // SEEK_END - unistd isn't enough for IDE. #include // O_RDONLY -#include // read, close +#include // open +#if HWY_OS_WIN +#include // read, write, close +#include +#else +#include // read, write, close +#endif #include #include @@ -30,6 +35,54 @@ #include "hwy/contrib/thread_pool/thread_pool.h" #include "hwy/detect_compiler_arch.h" +namespace { +#if HWY_OS_WIN + +// pread is not supported on Windows +static int64_t pread(int fd, void* buf, uint64_t size, uint64_t offset) { + HANDLE file = reinterpret_cast(_get_osfhandle(fd)); + if (file == INVALID_HANDLE_VALUE) { + return -1; + } + + OVERLAPPED overlapped = {0}; + overlapped.Offset = offset & 0xFFFFFFFF; + overlapped.OffsetHigh = (offset >> 32) & 0xFFFFFFFF; + + DWORD bytes_read; + if (!ReadFile(file, buf, size, &bytes_read, &overlapped)) { + if (GetLastError() != ERROR_HANDLE_EOF) { + return -1; + } + } + + return bytes_read; +} + +// pwrite is not supported on Windows +static int64_t pwrite(int fd, const void* buf, uint64_t size, uint64_t offset) { + HANDLE file = reinterpret_cast(_get_osfhandle(fd)); + if (file == INVALID_HANDLE_VALUE) { + return -1; + } + + OVERLAPPED overlapped = {0}; + overlapped.Offset = offset & 0xFFFFFFFF; + overlapped.OffsetHigh = (offset >> 32) & 0xFFFFFFFF; + + DWORD bytes_written; + if (!WriteFile(file, buf, size, &bytes_written, &overlapped)) { + if (GetLastError() != ERROR_HANDLE_EOF) { + return -1; + } + } + + return bytes_written; +} + +#endif +} + namespace gcpp { hwy::uint128_t MakeKey(const char* string) { @@ -64,19 +117,30 @@ static void EnqueueChunkRequests(uint64_t offset, uint64_t size, uint8_t* data, } } + 
struct IO { // Returns size in bytes or 0. static uint64_t FileSize(const char* filename) { int fd = open(filename, O_RDONLY); - if (fd >= 0) { - const off_t size = lseek(fd, 0, SEEK_END); - HWY_ASSERT(close(fd) != -1); - if (size != static_cast(-1)) { - return static_cast(size); - } + if (fd < 0) { + return 0; } - return 0; +#if HWY_OS_WIN + const int64_t size = _lseeki64(fd, 0, SEEK_END); + HWY_ASSERT(close(fd) != -1); + if (size < 0) { + return 0; + } +#else + const off_t size = lseek(fd, 0, SEEK_END); + HWY_ASSERT(close(fd) != -1); + if (size == static_cast(-1)) { + return 0; + } +#endif + + return static_cast(size); } static bool Read(int fd, uint64_t offset, uint64_t size, void* to) { @@ -252,7 +316,14 @@ class BlobStore { #pragma pack(pop) BlobError BlobReader::Open(const char* filename) { +#if HWY_OS_WIN + DWORD flags = FILE_ATTRIBUTE_NORMAL | FILE_FLAG_SEQUENTIAL_SCAN; + HANDLE file = CreateFileA(filename, GENERIC_READ, FILE_SHARE_READ, nullptr, OPEN_EXISTING, flags, nullptr); + if (file == INVALID_HANDLE_VALUE) return __LINE__; + fd_ = _open_osfhandle(reinterpret_cast(file), _O_RDONLY); +#else fd_ = open(filename, O_RDONLY); +#endif if (fd_ < 0) return __LINE__; #if _POSIX_C_SOURCE >= 200112L @@ -330,7 +401,14 @@ BlobError BlobWriter::WriteAll(hwy::ThreadPool& pool, keys_.data(), blobs_.data(), keys_.size()); // Create/replace existing file. 
+#if HWY_OS_WIN + DWORD flags = FILE_ATTRIBUTE_NORMAL; + HANDLE file = CreateFileA(filename, GENERIC_WRITE, 0, nullptr, CREATE_ALWAYS, flags, nullptr); + if (file == INVALID_HANDLE_VALUE) return __LINE__; + const int fd = _open_osfhandle(reinterpret_cast(file), _O_WRONLY); +#else const int fd = open(filename, O_CREAT | O_RDWR | O_TRUNC, 0644); +#endif if (fd < 0) return __LINE__; std::atomic_flag err = ATOMIC_FLAG_INIT; @@ -341,6 +419,7 @@ BlobError BlobWriter::WriteAll(hwy::ThreadPool& pool, err.test_and_set(); } }); + HWY_ASSERT(close(fd) != -1); if (err.test_and_set()) return __LINE__; return 0; } diff --git a/util/app.h b/util/app.h index 966fa41..bd665a4 100644 --- a/util/app.h +++ b/util/app.h @@ -18,7 +18,9 @@ #ifndef THIRD_PARTY_GEMMA_CPP_UTIL_APP_H_ #define THIRD_PARTY_GEMMA_CPP_UTIL_APP_H_ +#if HWY_OS_LINUX #include +#endif #include #include // std::clamp From 696597383cabbd7e78c5e581e6425b452f267ab1 Mon Sep 17 00:00:00 2001 From: Silvio Traversaro Date: Sat, 24 Feb 2024 20:41:04 -0800 Subject: [PATCH 06/26] Copybara import of the project: -- 19694e1f2e62d1c26a69309d727f2dbc5d9ada14 by Silvio Traversaro : Do not pass explicitly -O2 flag to compiler in Release build COPYBARA_INTEGRATE_REVIEW=https://github.com/google/gemma.cpp/pull/3 from traversaro:patch-1 19694e1f2e62d1c26a69309d727f2dbc5d9ada14 PiperOrigin-RevId: 610096914 --- .github/workflows/build.yml | 12 ++--- BUILD.bazel | 51 +++++++++++++++++++ CMakeLists.txt | 8 +-- CMakePresets.json | 59 ---------------------- README.md | 42 +++------------- compression/blob_store.cc | 97 ++++--------------------------------- run.cc | 5 -- util/app.h | 2 - 8 files changed, 76 insertions(+), 200 deletions(-) delete mode 100644 CMakePresets.json diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index da63c1c..929e140 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,11 +1,11 @@ -name: build +name: Build -# Trigger on push, pull request, or via manual dispatch. 
-on: [push, pull_request, workflow_dispatch] +# Trigger on push or via manual dispatch. +on: [push, workflow_dispatch] jobs: build: - runs-on: ${{ matrix.os }} + runs-on: ${{matrix.os}} name: ${{ matrix.os }} ${{ matrix.type }} timeout-minutes: 30 @@ -13,10 +13,10 @@ jobs: fail-fast: false matrix: type: ['Release'] - os: ['ubuntu-latest', 'macos-latest'] + os: ['ubuntu-latest'] concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os }}-${{ matrix.type }} + group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true steps: diff --git a/BUILD.bazel b/BUILD.bazel index 18dad30..190690b 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -114,3 +114,54 @@ cc_binary( "//:thread_pool", ], ) + +# copybara:strip_begin +cc_binary( + name = "run_csv", + srcs = [ + "run_csv.cc", + ], + deps = [ + ":app", + ":args", + ":gemma_lib", + "//compression:compress", + # copybara:import_next_line:hwy + "//:hwy", + # copybara:import_next_line:hwy + "//:nanobenchmark", + # copybara:import_next_line:hwy + "//:profiler", + # copybara:import_next_line:hwy + "//:thread_pool", + "//third_party/riegeli/bytes:file_reader", + "//third_party/riegeli/bytes:file_writer", + "//third_party/riegeli/csv:csv_reader", + "//third_party/riegeli/csv:csv_writer", + ], +) + +gensignature( + name = "gemma_sign", + srcs = [":gemma"], +) + +cc_test( + name = "benchmarks", + size = "large", + srcs = [ + "benchmarks.cc", + ], + tags = ["notap"], + deps = [ + ":app", + ":gemma_lib", + "//third_party/benchmark", + # copybara:import_next_line:hwy + "//:hwy", + # copybara:import_next_line:hwy + "//:thread_pool", + ], +) + +# copybara:strip_end diff --git a/CMakeLists.txt b/CMakeLists.txt index c7828cc..722e408 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,7 +24,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG da250571a45826b21eebbddc1e50d0c1137dee5f) FetchContent_MakeAvailable(highway) -## Note: 
absl needs to be installed by sentencepiece. This will only happen if +## Note: absl meeds tp be installed by sentencepiece. This will only happen if ## cmake is invoked with -DSPM_ENABLE_SHARED=OFF and -DSPM_ABSL_PROVIDER=module FetchContent_Declare(sentencepiece GIT_REPOSITORY https://github.com/google/sentencepiece GIT_TAG 53de76561cfc149d3c01037f0595669ad32a5e7c) FetchContent_MakeAvailable(sentencepiece) @@ -49,7 +49,7 @@ endif() # Allowable types for WEIGHT_TYPE: # float - slow, not recommended -# hwy::bfloat16_t - bfloat16 as implemented by https://github.com/google/highway +# hwy::bfloat16_t - bfloat16 as impemented by https://github.com/google/highway # SfpStream - 8-bit switched floating point (recommended) # NuqStream - experimental, work-in-progress option(WEIGHT_TYPE "Set weight type" "") @@ -67,8 +67,6 @@ target_link_libraries(gemma hwy hwy_contrib sentencepiece) target_include_directories(gemma PRIVATE ./) FetchContent_GetProperties(sentencepiece) target_include_directories(gemma PRIVATE ${sentencepiece_SOURCE_DIR}) -target_compile_definitions(gemma PRIVATE $<$:_CRT_SECURE_NO_WARNINGS NOMINMAX>) -target_compile_options(gemma PRIVATE $<$:-Wno-deprecated-declarations>) ## Library Target @@ -78,5 +76,3 @@ set_target_properties(libgemma PROPERTIES PREFIX "") target_include_directories(libgemma PUBLIC ./) target_link_libraries(libgemma hwy hwy_contrib sentencepiece) target_include_directories(libgemma PRIVATE ${sentencepiece_SOURCE_DIR}) -target_compile_definitions(libgemma PRIVATE $<$:_CRT_SECURE_NO_WARNINGS NOMINMAX>) -target_compile_options(libgemma PRIVATE $<$:-Wno-deprecated-declarations>) diff --git a/CMakePresets.json b/CMakePresets.json deleted file mode 100644 index 5fe13c8..0000000 --- a/CMakePresets.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "version": 3, - "cmakeMinimumRequired": { - "major": 3, - "minor": 11, - "patch": 0 - }, - "configurePresets": [ - { - "name": "__defaults__", - "hidden": true, - "binaryDir": "${sourceDir}/build" - }, - { - 
"name": "make", - "inherits": "__defaults__", - "displayName": "Make", - "description": "Unix Makefiles", - "generator": "Unix Makefiles", - "binaryDir": "${sourceDir}/build" - }, - { - "name": "windows", - "inherits": "__defaults__", - "displayName": "Windows", - "description": "Visual Studio 2022 with Clang/LLVM frontend", - "generator": "Visual Studio 17 2022", - "toolset": "ClangCL", - "condition": { - "type": "equals", - "lhs": "${hostSystemName}", - "rhs": "Windows" - } - } - ], - "buildPresets": [ - { - "name": "__defaults__", - "hidden": true, - "targets": [ - "gemma", - "libgemma" - ] - }, - { - "name": "make", - "inherits": "__defaults__", - "displayName": "Unix Makefiles", - "configurePreset": "make" - }, - { - "name": "windows", - "inherits": "__defaults__", - "displayName": "Windows", - "configuration": "Release", - "configurePreset": "windows" - } - ] - } diff --git a/README.md b/README.md index ff1011b..e278833 100644 --- a/README.md +++ b/README.md @@ -55,16 +55,6 @@ Before starting, you should have installed: least C++17. - `tar` for extracting archives from Kaggle. -Building natively on Windows requires the Visual Studio 2012 Build Tools with the -optional Clang/LLVM C++ frontend (`clang-cl`). This can be installed from the -command line with -[`winget`](https://learn.microsoft.com/en-us/windows/package-manager/winget/): - -```sh -winget install --id Kitware.CMake -winget install --id Microsoft.VisualStudio.2022.BuildTools --force --override "--passive --wait --add Microsoft.VisualStudio.Workload.VCTools;installRecommended --add Microsoft.VisualStudio.Component.VC.Llvm.Clang --add Microsoft.VisualStudio.Component.VC.Llvm.ClangToolset" -``` - ### Step 1: Obtain model weights and tokenizer from Kaggle Visit [the Gemma model page on @@ -117,7 +107,6 @@ runtime, create a build directory and generate the build files using `cmake` from the top-level project directory. 
For the 8-bit switched floating point weights (sfp), run cmake with no options: -#### Unix-like Platforms ```sh cmake -B build ``` @@ -137,18 +126,17 @@ your weights, you can enter the `build/` directory and run `make` to build the `./gemma` executable: ```sh -# Configure `build` directory -cmake --preset make - -# Build project using make -cmake --build --preset make -j [number of parallel threads to use] +cd build +make -j [number of parallel threads to use] gemma ``` Replace `[number of parallel threads to use]` with a number - the number of -cores available on your system is a reasonable heuristic. For example, -`make -j4 gemma` will build using 4 threads. If the `nproc` command is -available, you can use `make -j$(nproc) gemma` as a reasonable default -for the number of threads. +cores available on your system is a reasonable heuristic. + +For example, `make -j4 gemma` will build using 4 threads. If this is successful, +you should now have a `gemma` executable in the `build/` directory. If the +`nproc` command is available, you can use `make -j$(nproc) gemma` as a +reasonable default for the number of threads. If you aren't sure of the right value for the `-j` flag, you can simply run `make gemma` instead and it should still build the `./gemma` executable. @@ -157,20 +145,6 @@ If you aren't sure of the right value for the `-j` flag, you can simply run > On Windows Subsystem for Linux (WSL) users should set the number of > parallel threads to 1. Using a larger number may result in errors. -If the build is successful, you should now have a `gemma` executable in the `build/` directory. - -#### Windows - -```sh -# Configure `build` directory -cmake --preset windows - -# Build project using Visual Studio Build Tools -cmake --build --preset windows -j [number of parallel threads to use] -``` - -If the build is successful, you should now have a `gemma.exe` executable in the `build/` directory. 
- ### Step 4: Run You can now run `gemma` from inside the `build/` directory. diff --git a/compression/blob_store.cc b/compression/blob_store.cc index 550c727..8d6c1d0 100644 --- a/compression/blob_store.cc +++ b/compression/blob_store.cc @@ -16,16 +16,11 @@ // copybara:import_next_line:gemma_cpp #include "compression/blob_store.h" +#include // open #include #include // SEEK_END - unistd isn't enough for IDE. #include // O_RDONLY -#include // open -#if HWY_OS_WIN -#include // read, write, close -#include -#else -#include // read, write, close -#endif +#include // read, close #include #include @@ -35,54 +30,6 @@ #include "hwy/contrib/thread_pool/thread_pool.h" #include "hwy/detect_compiler_arch.h" -namespace { -#if HWY_OS_WIN - -// pread is not supported on Windows -static int64_t pread(int fd, void* buf, uint64_t size, uint64_t offset) { - HANDLE file = reinterpret_cast(_get_osfhandle(fd)); - if (file == INVALID_HANDLE_VALUE) { - return -1; - } - - OVERLAPPED overlapped = {0}; - overlapped.Offset = offset & 0xFFFFFFFF; - overlapped.OffsetHigh = (offset >> 32) & 0xFFFFFFFF; - - DWORD bytes_read; - if (!ReadFile(file, buf, size, &bytes_read, &overlapped)) { - if (GetLastError() != ERROR_HANDLE_EOF) { - return -1; - } - } - - return bytes_read; -} - -// pwrite is not supported on Windows -static int64_t pwrite(int fd, const void* buf, uint64_t size, uint64_t offset) { - HANDLE file = reinterpret_cast(_get_osfhandle(fd)); - if (file == INVALID_HANDLE_VALUE) { - return -1; - } - - OVERLAPPED overlapped = {0}; - overlapped.Offset = offset & 0xFFFFFFFF; - overlapped.OffsetHigh = (offset >> 32) & 0xFFFFFFFF; - - DWORD bytes_written; - if (!WriteFile(file, buf, size, &bytes_written, &overlapped)) { - if (GetLastError() != ERROR_HANDLE_EOF) { - return -1; - } - } - - return bytes_written; -} - -#endif -} - namespace gcpp { hwy::uint128_t MakeKey(const char* string) { @@ -117,30 +64,19 @@ static void EnqueueChunkRequests(uint64_t offset, uint64_t size, uint8_t* data, } } - 
struct IO { // Returns size in bytes or 0. static uint64_t FileSize(const char* filename) { int fd = open(filename, O_RDONLY); - if (fd < 0) { - return 0; + if (fd >= 0) { + const off_t size = lseek(fd, 0, SEEK_END); + HWY_ASSERT(close(fd) != -1); + if (size != static_cast(-1)) { + return static_cast(size); + } } -#if HWY_OS_WIN - const int64_t size = _lseeki64(fd, 0, SEEK_END); - HWY_ASSERT(close(fd) != -1); - if (size < 0) { - return 0; - } -#else - const off_t size = lseek(fd, 0, SEEK_END); - HWY_ASSERT(close(fd) != -1); - if (size == static_cast(-1)) { - return 0; - } -#endif - - return static_cast(size); + return 0; } static bool Read(int fd, uint64_t offset, uint64_t size, void* to) { @@ -316,14 +252,7 @@ class BlobStore { #pragma pack(pop) BlobError BlobReader::Open(const char* filename) { -#if HWY_OS_WIN - DWORD flags = FILE_ATTRIBUTE_NORMAL | FILE_FLAG_SEQUENTIAL_SCAN; - HANDLE file = CreateFileA(filename, GENERIC_READ, FILE_SHARE_READ, nullptr, OPEN_EXISTING, flags, nullptr); - if (file == INVALID_HANDLE_VALUE) return __LINE__; - fd_ = _open_osfhandle(reinterpret_cast(file), _O_RDONLY); -#else fd_ = open(filename, O_RDONLY); -#endif if (fd_ < 0) return __LINE__; #if _POSIX_C_SOURCE >= 200112L @@ -401,14 +330,7 @@ BlobError BlobWriter::WriteAll(hwy::ThreadPool& pool, keys_.data(), blobs_.data(), keys_.size()); // Create/replace existing file. 
-#if HWY_OS_WIN - DWORD flags = FILE_ATTRIBUTE_NORMAL; - HANDLE file = CreateFileA(filename, GENERIC_WRITE, 0, nullptr, CREATE_ALWAYS, flags, nullptr); - if (file == INVALID_HANDLE_VALUE) return __LINE__; - const int fd = _open_osfhandle(reinterpret_cast(file), _O_WRONLY); -#else const int fd = open(filename, O_CREAT | O_RDWR | O_TRUNC, 0644); -#endif if (fd < 0) return __LINE__; std::atomic_flag err = ATOMIC_FLAG_INIT; @@ -419,7 +341,6 @@ BlobError BlobWriter::WriteAll(hwy::ThreadPool& pool, err.test_and_set(); } }); - HWY_ASSERT(close(fd) != -1); if (err.test_and_set()) return __LINE__; return 0; } diff --git a/run.cc b/run.cc index 96ba316..87d8445 100644 --- a/run.cc +++ b/run.cc @@ -144,11 +144,6 @@ void ReplGemma(gcpp::Gemma& model, hwy::ThreadPool& pool, return; } - if (prompt_string == "%c" || prompt_string == "%C") { - abs_pos = 0; - continue; - } - if (model.model_training == ModelTraining::GEMMA_IT) { // For instruction-tuned models: add control tokens. prompt_string = "user\n" + prompt_string + diff --git a/util/app.h b/util/app.h index bd665a4..966fa41 100644 --- a/util/app.h +++ b/util/app.h @@ -18,9 +18,7 @@ #ifndef THIRD_PARTY_GEMMA_CPP_UTIL_APP_H_ #define THIRD_PARTY_GEMMA_CPP_UTIL_APP_H_ -#if HWY_OS_LINUX #include -#endif #include #include // std::clamp From 1243be71c4aa692aa9e828e0aade04a5c5285bf6 Mon Sep 17 00:00:00 2001 From: Dan Zheng Date: Sun, 25 Feb 2024 04:33:39 -0800 Subject: [PATCH 07/26] Copybara import of the project: -- e0179bad839b808265948e0141feba0844264a9d by Dan Zheng : Rename BUILD to BUILD.bazel. This fixes an error on macOS due to `build` and `BUILD` having conflicting names. -- 74b27074e10b7fcca2cac42aaae3637bea39d11b by Dan Zheng : Enable macos-latest in GitHub Actions CI. -- c08de58e6a58f685d84c9112ca2e74d354ecee77 by Dan Zheng : Fix concurrency key in GitHub Actions. Use matrix configuration in concurrency key. 
COPYBARA_INTEGRATE_REVIEW=https://github.com/google/gemma.cpp/pull/36 from dan-zheng:rename-build-bzl b4b978f02bee169ed83737af12714d1b66e3625d PiperOrigin-RevId: 610156681 --- .github/workflows/build.yml | 8 +++--- BUILD.bazel | 51 ------------------------------------- 2 files changed, 4 insertions(+), 55 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 929e140..b0d4b6e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,11 +1,11 @@ -name: Build +name: build # Trigger on push or via manual dispatch. on: [push, workflow_dispatch] jobs: build: - runs-on: ${{matrix.os}} + runs-on: ${{ matrix.os }} name: ${{ matrix.os }} ${{ matrix.type }} timeout-minutes: 30 @@ -13,10 +13,10 @@ jobs: fail-fast: false matrix: type: ['Release'] - os: ['ubuntu-latest'] + os: ['ubuntu-latest', 'macos-latest'] concurrency: - group: ${{ github.workflow }}-${{ github.ref }} + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os }}-${{ matrix.type }} cancel-in-progress: true steps: diff --git a/BUILD.bazel b/BUILD.bazel index 190690b..18dad30 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -114,54 +114,3 @@ cc_binary( "//:thread_pool", ], ) - -# copybara:strip_begin -cc_binary( - name = "run_csv", - srcs = [ - "run_csv.cc", - ], - deps = [ - ":app", - ":args", - ":gemma_lib", - "//compression:compress", - # copybara:import_next_line:hwy - "//:hwy", - # copybara:import_next_line:hwy - "//:nanobenchmark", - # copybara:import_next_line:hwy - "//:profiler", - # copybara:import_next_line:hwy - "//:thread_pool", - "//third_party/riegeli/bytes:file_reader", - "//third_party/riegeli/bytes:file_writer", - "//third_party/riegeli/csv:csv_reader", - "//third_party/riegeli/csv:csv_writer", - ], -) - -gensignature( - name = "gemma_sign", - srcs = [":gemma"], -) - -cc_test( - name = "benchmarks", - size = "large", - srcs = [ - "benchmarks.cc", - ], - tags = ["notap"], - deps = [ - ":app", - ":gemma_lib", - 
"//third_party/benchmark", - # copybara:import_next_line:hwy - "//:hwy", - # copybara:import_next_line:hwy - "//:thread_pool", - ], -) - -# copybara:strip_end From 6a3085828f123737dfb929571329067ed49f789e Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Sun, 25 Feb 2024 19:08:50 -0800 Subject: [PATCH 08/26] Fixes #37, lambda issue: missing HWY_ATTR, and cannot capture SVE in/out vectors. PiperOrigin-RevId: 610260610 --- ops.h | 55 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 18 deletions(-) diff --git a/ops.h b/ops.h index db2ae4f..7619b44 100644 --- a/ops.h +++ b/ops.h @@ -214,7 +214,8 @@ static HWY_NOINLINE HWY_MAYBE_UNUSED void Gelu(float* HWY_RESTRICT x, size_t size) { namespace hn = hwy::HWY_NAMESPACE; using D = hn::ScalableTag; - hn::Transform(D(), x, size, [](D d, hn::Vec v) { return Gelu(d, v); }); + hn::Transform(D(), x, size, + [](D d, hn::Vec v) HWY_ATTR { return Gelu(d, v); }); } // out[i] = BF(mul[i] * Gelu(gelu_in[i])) @@ -567,22 +568,41 @@ static HWY_NOINLINE void Softmax(float* HWY_RESTRICT x, size_t size, namespace hn = hwy::HWY_NAMESPACE; using D = hn::ScalableTag; const D d; - using V = hn::Vec; + const size_t N = hn::Lanes(d); - // Find max so we can subtract it below. - const V vmin = hn::Set(d, hwy::LowestValue()); - V max = vmin; - hn::Foreach(d, x, mask_pos, vmin, - [&max](D d, V v) { max = hn::Max(max, v); }); - max = hn::MaxOfLanes(d, max); // broadcast + // Find max so we can subtract it below. Avoid hn::Foreach because SVE vectors + // cannot be lambda-captured. + // TODO(janwas): could be replaced with an hn::Accumulate algo. 
+ const hn::Vec vmin = hn::Set(d, hwy::LowestValue()); + hn::Vec vmax = vmin; + size_t idx = 0; + if (mask_pos >= N) { + for (; idx <= mask_pos - N; idx += N) { + vmax = hn::Max(vmax, LoadU(d, x + idx)); + } + } + vmax = hn::Max(vmax, LoadNOr(vmin, d, x + idx, mask_pos - idx)); + vmax = hn::MaxOfLanes(d, vmax); // broadcast // Subtract max (avoid precision loss for large exponents) and exponentiate. - V sum = hn::Zero(d); - hn::Transform(d, x, mask_pos, [&sum, max](D d, V v) { - const V out = hn::Exp(d, hn::Sub(v, max)); + // Also avoid hn::Transform because the additional `sum` output vector cannot + // be captured by a lambda. + hn::Vec sum = hn::Zero(d); + idx = 0; + if (mask_pos >= N) { + for (; idx <= mask_pos - N; idx += N) { + const hn::Vec out = hn::Exp(d, hn::Sub(hn::LoadU(d, x + idx), vmax)); + sum = hn::Add(sum, out); + hn::StoreU(out, d, x + idx); + } + } + if (mask_pos > idx) { + const size_t remaining = mask_pos - idx; + const hn::Vec out = + hn::Exp(d, hn::Sub(hn::LoadN(d, x + idx, remaining), vmax)); sum = hn::Add(sum, out); - return out; - }); + hn::StoreN(out, d, x + idx, remaining); + } // Normalize to probability distribution const float mul = 1.0f / hn::ReduceSum(d, sum); @@ -601,13 +621,12 @@ static HWY_NOINLINE void LogitsSoftCap(const float cap, float* HWY_RESTRICT x, namespace hn = hwy::HWY_NAMESPACE; using D = hn::ScalableTag; const D d; - using V = hn::Vec; - const V inv_cap = hn::Set(d, 1.0f / cap); - const V vcap = hn::Set(d, cap); + const float inv_cap = 1.0f / cap; - hn::Transform(d, x, size, [vcap, inv_cap](D d, hn::Vec v) { - return hn::Mul(vcap, hn::Tanh(d, hn::Mul(inv_cap, v))); + hn::Transform(d, x, size, [cap, inv_cap](D d, hn::Vec v) HWY_ATTR { + return hn::Mul(hn::Set(d, cap), + hn::Tanh(d, hn::Mul(v, hn::Set(d, inv_cap)))); }); } From 4c155bd3df70e45837a50a3d7496733ba47e000f Mon Sep 17 00:00:00 2001 From: Dan Zheng Date: Sun, 25 Feb 2024 19:31:27 -0800 Subject: [PATCH 09/26] Restore reverted changes. 
Sync to https://github.com/google/gemma.cpp/commit/84444c93a44f484442fda2523dde7e77dbd3a53c. PiperOrigin-RevId: 610263918 --- .github/workflows/build.yml | 4 +- CMakeLists.txt | 8 ++- CMakePresets.json | 59 ++++++++++++++++++++++ README.md | 42 +++++++++++++--- compression/blob_store.cc | 97 +++++++++++++++++++++++++++++++++---- run.cc | 5 ++ util/app.h | 2 + 7 files changed, 196 insertions(+), 21 deletions(-) create mode 100644 CMakePresets.json diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b0d4b6e..da63c1c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,7 +1,7 @@ name: build -# Trigger on push or via manual dispatch. -on: [push, workflow_dispatch] +# Trigger on push, pull request, or via manual dispatch. +on: [push, pull_request, workflow_dispatch] jobs: build: diff --git a/CMakeLists.txt b/CMakeLists.txt index 722e408..c7828cc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,7 +24,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG da250571a45826b21eebbddc1e50d0c1137dee5f) FetchContent_MakeAvailable(highway) -## Note: absl meeds tp be installed by sentencepiece. This will only happen if +## Note: absl needs to be installed by sentencepiece. 
This will only happen if ## cmake is invoked with -DSPM_ENABLE_SHARED=OFF and -DSPM_ABSL_PROVIDER=module FetchContent_Declare(sentencepiece GIT_REPOSITORY https://github.com/google/sentencepiece GIT_TAG 53de76561cfc149d3c01037f0595669ad32a5e7c) FetchContent_MakeAvailable(sentencepiece) @@ -49,7 +49,7 @@ endif() # Allowable types for WEIGHT_TYPE: # float - slow, not recommended -# hwy::bfloat16_t - bfloat16 as impemented by https://github.com/google/highway +# hwy::bfloat16_t - bfloat16 as implemented by https://github.com/google/highway # SfpStream - 8-bit switched floating point (recommended) # NuqStream - experimental, work-in-progress option(WEIGHT_TYPE "Set weight type" "") @@ -67,6 +67,8 @@ target_link_libraries(gemma hwy hwy_contrib sentencepiece) target_include_directories(gemma PRIVATE ./) FetchContent_GetProperties(sentencepiece) target_include_directories(gemma PRIVATE ${sentencepiece_SOURCE_DIR}) +target_compile_definitions(gemma PRIVATE $<$:_CRT_SECURE_NO_WARNINGS NOMINMAX>) +target_compile_options(gemma PRIVATE $<$:-Wno-deprecated-declarations>) ## Library Target @@ -76,3 +78,5 @@ set_target_properties(libgemma PROPERTIES PREFIX "") target_include_directories(libgemma PUBLIC ./) target_link_libraries(libgemma hwy hwy_contrib sentencepiece) target_include_directories(libgemma PRIVATE ${sentencepiece_SOURCE_DIR}) +target_compile_definitions(libgemma PRIVATE $<$:_CRT_SECURE_NO_WARNINGS NOMINMAX>) +target_compile_options(libgemma PRIVATE $<$:-Wno-deprecated-declarations>) diff --git a/CMakePresets.json b/CMakePresets.json new file mode 100644 index 0000000..5fe13c8 --- /dev/null +++ b/CMakePresets.json @@ -0,0 +1,59 @@ +{ + "version": 3, + "cmakeMinimumRequired": { + "major": 3, + "minor": 11, + "patch": 0 + }, + "configurePresets": [ + { + "name": "__defaults__", + "hidden": true, + "binaryDir": "${sourceDir}/build" + }, + { + "name": "make", + "inherits": "__defaults__", + "displayName": "Make", + "description": "Unix Makefiles", + "generator": "Unix 
Makefiles", + "binaryDir": "${sourceDir}/build" + }, + { + "name": "windows", + "inherits": "__defaults__", + "displayName": "Windows", + "description": "Visual Studio 2022 with Clang/LLVM frontend", + "generator": "Visual Studio 17 2022", + "toolset": "ClangCL", + "condition": { + "type": "equals", + "lhs": "${hostSystemName}", + "rhs": "Windows" + } + } + ], + "buildPresets": [ + { + "name": "__defaults__", + "hidden": true, + "targets": [ + "gemma", + "libgemma" + ] + }, + { + "name": "make", + "inherits": "__defaults__", + "displayName": "Unix Makefiles", + "configurePreset": "make" + }, + { + "name": "windows", + "inherits": "__defaults__", + "displayName": "Windows", + "configuration": "Release", + "configurePreset": "windows" + } + ] + } diff --git a/README.md b/README.md index e278833..d31bbaf 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,16 @@ Before starting, you should have installed: least C++17. - `tar` for extracting archives from Kaggle. +Building natively on Windows requires the Visual Studio 2012 Build Tools with the +optional Clang/LLVM C++ frontend (`clang-cl`). This can be installed from the +command line with +[`winget`](https://learn.microsoft.com/en-us/windows/package-manager/winget/): + +```sh +winget install --id Kitware.CMake +winget install --id Microsoft.VisualStudio.2022.BuildTools --force --override "--passive --wait --add Microsoft.VisualStudio.Workload.VCTools;installRecommended --add Microsoft.VisualStudio.Component.VC.Llvm.Clang --add Microsoft.VisualStudio.Component.VC.Llvm.ClangToolset" +``` + ### Step 1: Obtain model weights and tokenizer from Kaggle Visit [the Gemma model page on @@ -107,6 +117,7 @@ runtime, create a build directory and generate the build files using `cmake` from the top-level project directory. 
For the 8-bit switched floating point weights (sfp), run cmake with no options: +#### Unix-like Platforms ```sh cmake -B build ``` @@ -126,17 +137,18 @@ your weights, you can enter the `build/` directory and run `make` to build the `./gemma` executable: ```sh -cd build -make -j [number of parallel threads to use] gemma +# Configure `build` directory +cmake --preset make + +# Build project using make +cmake --build --preset make -j [number of parallel threads to use] ``` Replace `[number of parallel threads to use]` with a number - the number of -cores available on your system is a reasonable heuristic. - -For example, `make -j4 gemma` will build using 4 threads. If this is successful, -you should now have a `gemma` executable in the `build/` directory. If the -`nproc` command is available, you can use `make -j$(nproc) gemma` as a -reasonable default for the number of threads. +cores available on your system is a reasonable heuristic. For example, +`make -j4 gemma` will build using 4 threads. If the `nproc` command is +available, you can use `make -j$(nproc) gemma` as a reasonable default +for the number of threads. If you aren't sure of the right value for the `-j` flag, you can simply run `make gemma` instead and it should still build the `./gemma` executable. @@ -145,6 +157,20 @@ If you aren't sure of the right value for the `-j` flag, you can simply run > On Windows Subsystem for Linux (WSL) users should set the number of > parallel threads to 1. Using a larger number may result in errors. +If the build is successful, you should now have a `gemma` executable in the `build/` directory. + +#### Windows + +```sh +# Configure `build` directory +cmake --preset windows + +# Build project using Visual Studio Build Tools +cmake --build --preset windows -j [number of parallel threads to use] +``` + +If the build is successful, you should now have a `gemma.exe` executable in the `build/` directory. 
+ ### Step 4: Run You can now run `gemma` from inside the `build/` directory. diff --git a/compression/blob_store.cc b/compression/blob_store.cc index 8d6c1d0..550c727 100644 --- a/compression/blob_store.cc +++ b/compression/blob_store.cc @@ -16,11 +16,16 @@ // copybara:import_next_line:gemma_cpp #include "compression/blob_store.h" -#include // open #include #include // SEEK_END - unistd isn't enough for IDE. #include // O_RDONLY -#include // read, close +#include // open +#if HWY_OS_WIN +#include // read, write, close +#include +#else +#include // read, write, close +#endif #include #include @@ -30,6 +35,54 @@ #include "hwy/contrib/thread_pool/thread_pool.h" #include "hwy/detect_compiler_arch.h" +namespace { +#if HWY_OS_WIN + +// pread is not supported on Windows +static int64_t pread(int fd, void* buf, uint64_t size, uint64_t offset) { + HANDLE file = reinterpret_cast(_get_osfhandle(fd)); + if (file == INVALID_HANDLE_VALUE) { + return -1; + } + + OVERLAPPED overlapped = {0}; + overlapped.Offset = offset & 0xFFFFFFFF; + overlapped.OffsetHigh = (offset >> 32) & 0xFFFFFFFF; + + DWORD bytes_read; + if (!ReadFile(file, buf, size, &bytes_read, &overlapped)) { + if (GetLastError() != ERROR_HANDLE_EOF) { + return -1; + } + } + + return bytes_read; +} + +// pwrite is not supported on Windows +static int64_t pwrite(int fd, const void* buf, uint64_t size, uint64_t offset) { + HANDLE file = reinterpret_cast(_get_osfhandle(fd)); + if (file == INVALID_HANDLE_VALUE) { + return -1; + } + + OVERLAPPED overlapped = {0}; + overlapped.Offset = offset & 0xFFFFFFFF; + overlapped.OffsetHigh = (offset >> 32) & 0xFFFFFFFF; + + DWORD bytes_written; + if (!WriteFile(file, buf, size, &bytes_written, &overlapped)) { + if (GetLastError() != ERROR_HANDLE_EOF) { + return -1; + } + } + + return bytes_written; +} + +#endif +} + namespace gcpp { hwy::uint128_t MakeKey(const char* string) { @@ -64,19 +117,30 @@ static void EnqueueChunkRequests(uint64_t offset, uint64_t size, uint8_t* data, } } + 
struct IO { // Returns size in bytes or 0. static uint64_t FileSize(const char* filename) { int fd = open(filename, O_RDONLY); - if (fd >= 0) { - const off_t size = lseek(fd, 0, SEEK_END); - HWY_ASSERT(close(fd) != -1); - if (size != static_cast(-1)) { - return static_cast(size); - } + if (fd < 0) { + return 0; } - return 0; +#if HWY_OS_WIN + const int64_t size = _lseeki64(fd, 0, SEEK_END); + HWY_ASSERT(close(fd) != -1); + if (size < 0) { + return 0; + } +#else + const off_t size = lseek(fd, 0, SEEK_END); + HWY_ASSERT(close(fd) != -1); + if (size == static_cast(-1)) { + return 0; + } +#endif + + return static_cast(size); } static bool Read(int fd, uint64_t offset, uint64_t size, void* to) { @@ -252,7 +316,14 @@ class BlobStore { #pragma pack(pop) BlobError BlobReader::Open(const char* filename) { +#if HWY_OS_WIN + DWORD flags = FILE_ATTRIBUTE_NORMAL | FILE_FLAG_SEQUENTIAL_SCAN; + HANDLE file = CreateFileA(filename, GENERIC_READ, FILE_SHARE_READ, nullptr, OPEN_EXISTING, flags, nullptr); + if (file == INVALID_HANDLE_VALUE) return __LINE__; + fd_ = _open_osfhandle(reinterpret_cast(file), _O_RDONLY); +#else fd_ = open(filename, O_RDONLY); +#endif if (fd_ < 0) return __LINE__; #if _POSIX_C_SOURCE >= 200112L @@ -330,7 +401,14 @@ BlobError BlobWriter::WriteAll(hwy::ThreadPool& pool, keys_.data(), blobs_.data(), keys_.size()); // Create/replace existing file. 
+#if HWY_OS_WIN + DWORD flags = FILE_ATTRIBUTE_NORMAL; + HANDLE file = CreateFileA(filename, GENERIC_WRITE, 0, nullptr, CREATE_ALWAYS, flags, nullptr); + if (file == INVALID_HANDLE_VALUE) return __LINE__; + const int fd = _open_osfhandle(reinterpret_cast(file), _O_WRONLY); +#else const int fd = open(filename, O_CREAT | O_RDWR | O_TRUNC, 0644); +#endif if (fd < 0) return __LINE__; std::atomic_flag err = ATOMIC_FLAG_INIT; @@ -341,6 +419,7 @@ BlobError BlobWriter::WriteAll(hwy::ThreadPool& pool, err.test_and_set(); } }); + HWY_ASSERT(close(fd) != -1); if (err.test_and_set()) return __LINE__; return 0; } diff --git a/run.cc b/run.cc index 87d8445..96ba316 100644 --- a/run.cc +++ b/run.cc @@ -144,6 +144,11 @@ void ReplGemma(gcpp::Gemma& model, hwy::ThreadPool& pool, return; } + if (prompt_string == "%c" || prompt_string == "%C") { + abs_pos = 0; + continue; + } + if (model.model_training == ModelTraining::GEMMA_IT) { // For instruction-tuned models: add control tokens. prompt_string = "user\n" + prompt_string + diff --git a/util/app.h b/util/app.h index 966fa41..bd665a4 100644 --- a/util/app.h +++ b/util/app.h @@ -18,7 +18,9 @@ #ifndef THIRD_PARTY_GEMMA_CPP_UTIL_APP_H_ #define THIRD_PARTY_GEMMA_CPP_UTIL_APP_H_ +#if HWY_OS_LINUX #include +#endif #include #include // std::clamp From 4e2efbcbd89bd634a995f8265c80677334410f01 Mon Sep 17 00:00:00 2001 From: Kewde Date: Mon, 26 Feb 2024 08:30:21 -0800 Subject: [PATCH 10/26] Copybara import of the project: -- f4f2ff3c1a13fce546112d329419b211eb2be8b1 by kewde : fix: add -fPIC to libgemma COPYBARA_INTEGRATE_REVIEW=https://github.com/google/gemma.cpp/pull/42 from kewde:kewde/enable-fpic f4f2ff3c1a13fce546112d329419b211eb2be8b1 PiperOrigin-RevId: 610416597 --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index c7828cc..308e258 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -75,6 +75,7 @@ target_compile_options(gemma PRIVATE $<$:-Wno-deprecated-de add_library(libgemma 
${SOURCES}) set_property(TARGET libgemma PROPERTY CXX_STANDARD 17) set_target_properties(libgemma PROPERTIES PREFIX "") +set_property(TARGET libgemma PROPERTY POSITION_INDEPENDENT_CODE ON) target_include_directories(libgemma PUBLIC ./) target_link_libraries(libgemma hwy hwy_contrib sentencepiece) target_include_directories(libgemma PRIVATE ${sentencepiece_SOURCE_DIR}) From 7ab968c957a74b5b0c312212089fc878a310c245 Mon Sep 17 00:00:00 2001 From: Naoki Kishida Date: Mon, 26 Feb 2024 08:38:49 -0800 Subject: [PATCH 11/26] Copybara import of the project: -- 26b541b666a5860ced67a3df7630b6364eedd8cb by kishida : add information for the reseting conversation COPYBARA_INTEGRATE_REVIEW=https://github.com/google/gemma.cpp/pull/40 from kishida:add_info_for_reset_conv 26b541b666a5860ced67a3df7630b6364eedd8cb PiperOrigin-RevId: 610418671 --- README.md | 2 +- run.cc | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d31bbaf..5932726 100644 --- a/README.md +++ b/README.md @@ -273,7 +273,7 @@ max_tokens : 3072 max_generated_tokens : 2048 *Usage* - Enter an instruction and press enter (%Q quits). + Enter an instruction and press enter (%C reset conversation, %Q quits). *Examples* - Write an email to grandma thanking her for the cookies. 
diff --git a/run.cc b/run.cc index 96ba316..f83ead9 100644 --- a/run.cc +++ b/run.cc @@ -221,7 +221,8 @@ void Run(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) { const std::string instructions = "*Usage*\n" - " Enter an instruction and press enter (%Q quits).\n\n" + " Enter an instruction and press enter (%C reset conversation, " + "%Q quits).\n\n" "*Examples*\n" " - Write an email to grandma thanking her for the cookies.\n" " - What are some historical attractions to visit around " From 7aeade5c9d026eece614503fd704dcfc5ad0d625 Mon Sep 17 00:00:00 2001 From: David Coles Date: Mon, 26 Feb 2024 10:22:24 -0800 Subject: [PATCH 12/26] Copybara import of the project: -- c64b6fd3a44b385e1502d2057bd8709edaebaa58 by David Coles : Include Windows in GitHub Actions build This also preserves the `gemma` binary as a build artefact should folks want to grab a pre-built binary. Dropped the use of the lukka/cmake actions due to conflicts with `--preset`. This isn't that bad as we were mostly overriding the default behaviour anyway. It also shaves ~2 min off the build since the GitHub builders already have CMake pre-installed. 
COPYBARA_INTEGRATE_REVIEW=https://github.com/google/gemma.cpp/pull/38 from dcoles:windows-build c64b6fd3a44b385e1502d2057bd8709edaebaa58 PiperOrigin-RevId: 610449220 --- .github/workflows/build.yml | 49 +++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index da63c1c..82b9152 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -6,17 +6,25 @@ on: [push, pull_request, workflow_dispatch] jobs: build: runs-on: ${{ matrix.os }} - name: ${{ matrix.os }} ${{ matrix.type }} + name: ${{ matrix.os }} (${{ matrix.preset }}) ${{ matrix.build_type }} timeout-minutes: 30 strategy: fail-fast: false matrix: - type: ['Release'] - os: ['ubuntu-latest', 'macos-latest'] + os: ['ubuntu-latest', 'macos-latest', 'windows-latest'] + build_type: ['Release'] + preset: ['make', 'windows'] + exclude: + - os: ubuntu-latest + preset: windows + - os: macos-latest + preset: windows + - os: windows-latest + preset: make concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os }}-${{ matrix.type }} + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os }}-${{ matrix.preset }}-${{ matrix.build_type }} cancel-in-progress: true steps: @@ -26,20 +34,23 @@ jobs: - name: ccache uses: hendrikmuhs/ccache-action@v1.2 - # Install CMake - - uses: lukka/get-cmake@latest + - name: Configure CMake + run: > + cmake --preset ${{ matrix.preset }} + -S ${{ github.workspace }} -B ${{ github.workspace }}/build + -D CMAKE_BUILD_TYPE=${{ matrix.build_type }} + -D CMAKE_C_COMPILER_LAUNCHER=ccache + -D CMAKE_CXX_COMPILER_LAUNCHER=ccache - # Build via CMake - # Reference: https://github.com/lukka/run-cmake/blob/v3/action.yml - - name: Build via cmake - uses: lukka/run-cmake@v3 + - name: Build + run: cmake --build ${{ github.workspace }}/build --preset ${{ matrix.preset }} --config ${{ matrix.build_type }} + + - name: Archive production artifacts + uses: 
actions/upload-artifact@v4 with: - cmakeListsOrSettingsJson: CMakeListsTxtAdvanced - cmakeAppendedArgs: > - -D CMAKE_C_COMPILER_LAUNCHER=ccache - -D CMAKE_CXX_COMPILER_LAUNCHER=ccache - buildWithCMake: true - # Explicitly list build targets here. - # Building "all" includes test executables and takes much longer. - buildWithCMakeArgs: "-- gemma" - buildDirectory: '${{ github.workspace }}/build' + name: gemma-${{ matrix.os }}-${{ matrix.preset }}-${{ matrix.build_type }} + path: | + ${{ github.workspace }}/build/${{ matrix.build_type }}/gemma.exe + ${{ github.workspace }}/build/${{ matrix.build_type }}/libgemma.lib + ${{ github.workspace }}/build/gemma + ${{ github.workspace }}/build/libgemma.a From 129e66ada2b4e461bdf28b88b70cd2465cb213e4 Mon Sep 17 00:00:00 2001 From: austinvhuang Date: Mon, 26 Feb 2024 17:05:32 -0500 Subject: [PATCH 13/26] Reduce KV cache preallocation to 4096 and make it comptime configurable, add rm build note in readme, add note on comptime options in DEVELOPERS, make multiturn=0 the default --- DEVELOPERS.md | 18 ++++++++++++++++ README.md | 8 +++++-- configs.h | 21 +++++++++++------- gemma.h | 59 ++++++++++++++++++++++++--------------------------- 4 files changed, 65 insertions(+), 41 deletions(-) diff --git a/DEVELOPERS.md b/DEVELOPERS.md index d06b0f8..bdc02c0 100644 --- a/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -70,3 +70,21 @@ The implementation code is roughly split into 4 layers, from high to low level: 4. Backend (`highway`) - Low-level hardware interface (SIMD in the case of highway) supporting the implementations in (3). + +## Compile-Time Flags (Advanced) + +There are several compile-time flags to be aware of (note these may or may not +be exposed to the build system): + +- `GEMMA_WEIGHT_T` : Sets the level of compression for weights (surfaced as + WEIGHT_TYPE in CMakeLists.txt). 
Currently this should be set to `SfpStream` + (default, if no flag is specified) for 8-bit SFP, or `hwy::bfloat16_t` to + enable for higher-fidelity (but slower) bfloat16 support. This is defined in + `gemma.h`. +- `GEMMA_MAX_SEQ_LEN` : Sets maximum sequence length to preallocate for the KV + Cache. The default is 4096 tokens but can be overridden. This is not exposed + through `CMakeLists.txt` yet. + +In the medium term both of these will likely be deprecated in favor of handling +options at runtime - allowing for multiple weight compression schemes in a single +build and dynamically resizes the KV cache as needed. diff --git a/README.md b/README.md index 5932726..8db6862 100644 --- a/README.md +++ b/README.md @@ -114,8 +114,12 @@ convenient directory location (e.g. the `build/` directory in this repo). The build system uses [CMake](https://cmake.org/). To build the gemma inference runtime, create a build directory and generate the build files using `cmake` -from the top-level project directory. For the 8-bit switched floating point -weights (sfp), run cmake with no options: +from the top-level project directory. Note if you previous ran `cmake` and are +re-running with a different setting, be sure to clean out the `build/` directory +with `rm -rf build/*` (warning this will delete any other files in the `build/` +directory. 
+ +For the 8-bit switched floating point weights (sfp), run cmake with no options: #### Unix-like Platforms ```sh diff --git a/configs.h b/configs.h index ebe6220..4be5f75 100644 --- a/configs.h +++ b/configs.h @@ -18,21 +18,26 @@ #ifndef THIRD_PARTY_GEMMA_CPP_CONFIGS_H_ #define THIRD_PARTY_GEMMA_CPP_CONFIGS_H_ +// Allow changing pre-allocated kv cache size as a compiler flag +#ifndef GEMMA_MAX_SEQLEN +#define GEMMA_MAX_SEQLEN 4096 +#endif // !GEMMA_MAX_SEQLEN + #include namespace gcpp { -static constexpr size_t kSeqLen = 7168; +static constexpr size_t kSeqLen = GEMMA_MAX_SEQLEN; struct ConfigGemma7B { static constexpr int kSeqLen = gcpp::kSeqLen; static constexpr int kVocabSize = 256128; static constexpr int kLayers = 28; static constexpr int kModelDim = 3072; - static constexpr int kFFHiddenDim = 16 * 3072 / 2; // = 24576 + static constexpr int kFFHiddenDim = 16 * 3072 / 2; // = 24576 static constexpr int kHeads = 16; - static constexpr int kKVHeads = 16; // standard MHA, no GQA or MQA - static constexpr int kQKVDim = 256; // query size == key size == value size + static constexpr int kKVHeads = 16; // standard MHA + static constexpr int kQKVDim = 256; // query size == key size == value size static constexpr int kTopK = 1; }; @@ -41,13 +46,13 @@ struct ConfigGemma2B { static constexpr int kVocabSize = 256128; static constexpr int kLayers = 18; static constexpr int kModelDim = 2048; - static constexpr int kFFHiddenDim = 16 * 2048 / 2; // = 16384 + static constexpr int kFFHiddenDim = 16 * 2048 / 2; // = 16384 static constexpr int kHeads = 8; static constexpr int kKVHeads = 8; // TODO(austinvhuang): add MQA support - static constexpr int kQKVDim = 256; // query size == key size == value size + static constexpr int kQKVDim = 256; // query size == key size == value size static constexpr int kTopK = 1; }; -} // namespace gcpp +} // namespace gcpp -#endif // THIRD_PARTY_GEMMA_CPP_CONFIGS_H_ +#endif // THIRD_PARTY_GEMMA_CPP_CONFIGS_H_ diff --git a/gemma.h b/gemma.h index 
5dc9f62..f2a130e 100644 --- a/gemma.h +++ b/gemma.h @@ -25,14 +25,14 @@ #include // copybara:import_next_line:gemma_cpp -#include "compression/compress.h" // SfpStream/NuqStream +#include "compression/compress.h" // SfpStream/NuqStream // copybara:import_next_line:gemma_cpp -#include "configs.h" // kSeqLen +#include "configs.h" // kSeqLen // copybara:import_next_line:gemma_cpp -#include "util/args.h" // ArgsBase #include "hwy/aligned_allocator.h" -#include "hwy/base.h" // hwy::bfloat16_t +#include "hwy/base.h" // hwy::bfloat16_t #include "hwy/contrib/thread_pool/thread_pool.h" +#include "util/args.h" // ArgsBase // copybara:import_next_line:sentencepiece #include "src/sentencepiece_processor.h" @@ -42,7 +42,7 @@ namespace gcpp { // float, hwy::bfloat16_t, SfpStream, NuqStream #ifndef GEMMA_WEIGHT_T #define GEMMA_WEIGHT_T SfpStream -#endif // !GEMMA_WEIGHT_T +#endif // !GEMMA_WEIGHT_T using WeightT = GEMMA_WEIGHT_T; using EmbedderInputT = hwy::bfloat16_t; @@ -51,9 +51,9 @@ constexpr bool kSystemPrompt = false; struct KVCache { hwy::AlignedFreeUniquePtr - key_cache; // batch_size * kSeqLen * kLayers * kKVHeads * kQKVDim + key_cache; // batch_size * kSeqLen * kLayers * kKVHeads * kQKVDim hwy::AlignedFreeUniquePtr - value_cache; // batch_size * kSeqLen * kLayers * kKVHeads * kQKVDim + value_cache; // batch_size * kSeqLen * kLayers * kKVHeads * kQKVDim }; // Model variants: see configs.h for details. 
@@ -61,9 +61,9 @@ enum class Model { GEMMA_2B, GEMMA_7B }; enum class ModelTraining { GEMMA_IT, GEMMA_PT }; struct LoaderArgs : public ArgsBase { - LoaderArgs(int argc, char* argv[]) { InitAndParse(argc, argv); } + LoaderArgs(int argc, char *argv[]) { InitAndParse(argc, argv); } - static std::string ToLower(const std::string& text) { + static std::string ToLower(const std::string &text) { std::string result = text; std::transform(begin(result), end(result), begin(result), [](unsigned char c) { return std::tolower(c); }); @@ -89,7 +89,7 @@ struct LoaderArgs : public ArgsBase { } // Returns error string or nullptr if OK. - const char* Validate() const { + const char *Validate() const { const std::string model_type_lc = ToLower(model_type); if (model_type_lc != "2b-pt" && model_type_lc != "7b-pt" && model_type_lc != "2b-it" && model_type_lc != "7b-it") { @@ -111,12 +111,11 @@ struct LoaderArgs : public ArgsBase { } Path tokenizer; - Path model; // uncompressed weights OR - Path cache; // compressed weights + Path model; // uncompressed weights OR + Path cache; // compressed weights std::string model_type; - template - void ForEach(const Visitor& visitor) { + template void ForEach(const Visitor &visitor) { visitor(tokenizer, "tokenizer", Path(), "Path name of tokenizer model file. (required)"); visitor( @@ -139,10 +138,10 @@ struct LoaderArgs : public ArgsBase { struct GemmaInterface; struct Gemma { - Gemma(const LoaderArgs& args, hwy::ThreadPool& pool); - ~Gemma(); // must be defined after GemmaInterface's dtor is defined. + Gemma(const LoaderArgs &args, hwy::ThreadPool &pool); + ~Gemma(); // must be defined after GemmaInterface's dtor is defined. 
- const sentencepiece::SentencePieceProcessor& Tokenizer() const; + const sentencepiece::SentencePieceProcessor &Tokenizer() const; std::unique_ptr impl_; gcpp::ModelTraining model_training; @@ -154,7 +153,7 @@ using StreamFunc = std::function; using AcceptFunc = std::function; struct InferenceArgs : public ArgsBase { - InferenceArgs(int argc, char* argv[]) { InitAndParse(argc, argv); } + InferenceArgs(int argc, char *argv[]) { InitAndParse(argc, argv); } size_t max_tokens; size_t max_generated_tokens; @@ -164,7 +163,7 @@ struct InferenceArgs : public ArgsBase { bool multiturn; // Returns error string or nullptr if OK. - const char* Validate() const { + const char *Validate() const { if (max_tokens > gcpp::kSeqLen) { return "max_tokens is larger than the maximum sequence length (see " "configs.h)."; @@ -176,8 +175,7 @@ struct InferenceArgs : public ArgsBase { return nullptr; } - template - void ForEach(const Visitor& visitor) { + template void ForEach(const Visitor &visitor) { visitor(max_tokens, "max_tokens", size_t{3072}, "Maximum number of tokens in prompt + generation."); visitor(max_generated_tokens, "max_generated_tokens", size_t{2048}, @@ -186,22 +184,21 @@ struct InferenceArgs : public ArgsBase { visitor(temperature, "temperature", 1.0f, "Temperature for top-K", 2); visitor(deterministic, "deterministic", false, "Make top-k sampling deterministic", 2); - visitor(multiturn, "multiturn", true, + visitor(multiturn, "multiturn", false, "Multiturn mode (if 0, this clears the KV cache after every " - "interaction without quitting)", - 2); + "interaction without quitting)\n Default = 0 (conversation resets every turn)"); } }; -void GenerateGemma(Gemma& gemma, const InferenceArgs& args, - const std::vector& prompt, size_t start_pos, - hwy::ThreadPool& pool, hwy::ThreadPool& inner_pool, - const StreamFunc& stream_token, - const AcceptFunc& accept_token, std::mt19937& g, +void GenerateGemma(Gemma &gemma, const InferenceArgs &args, + const std::vector &prompt, size_t 
start_pos, + hwy::ThreadPool &pool, hwy::ThreadPool &inner_pool, + const StreamFunc &stream_token, + const AcceptFunc &accept_token, std::mt19937 &g, int verbosity); constexpr int EOS_ID = 1; -} // namespace gcpp +} // namespace gcpp -#endif // THIRD_PARTY_GEMMA_CPP_GEMMA_H_ +#endif // THIRD_PARTY_GEMMA_CPP_GEMMA_H_ From 8db89304bdc21949911c9e8996c03d3a623e7a6d Mon Sep 17 00:00:00 2001 From: Dan Zheng Date: Mon, 26 Feb 2024 12:54:39 -0800 Subject: [PATCH 14/26] No public description PiperOrigin-RevId: 610498969 --- DEVELOPERS.md | 18 ---------------- README.md | 8 ++----- configs.h | 21 +++++++----------- gemma.h | 59 +++++++++++++++++++++++++++------------------------ 4 files changed, 41 insertions(+), 65 deletions(-) diff --git a/DEVELOPERS.md b/DEVELOPERS.md index bdc02c0..d06b0f8 100644 --- a/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -70,21 +70,3 @@ The implementation code is roughly split into 4 layers, from high to low level: 4. Backend (`highway`) - Low-level hardware interface (SIMD in the case of highway) supporting the implementations in (3). - -## Compile-Time Flags (Advanced) - -There are several compile-time flags to be aware of (note these may or may not -be exposed to the build system): - -- `GEMMA_WEIGHT_T` : Sets the level of compression for weights (surfaced as - WEIGHT_TYPE in CMakeLists.txt). Currently this should be set to `SfpStream` - (default, if no flag is specified) for 8-bit SFP, or `hwy::bfloat16_t` to - enable for higher-fidelity (but slower) bfloat16 support. This is defined in - `gemma.h`. -- `GEMMA_MAX_SEQ_LEN` : Sets maximum sequence length to preallocate for the KV - Cache. The default is 4096 tokens but can be overridden. This is not exposed - through `CMakeLists.txt` yet. - -In the medium term both of these will likely be deprecated in favor of handling -options at runtime - allowing for multiple weight compression schemes in a single -build and dynamically resizes the KV cache as needed. 
diff --git a/README.md b/README.md index 8db6862..5932726 100644 --- a/README.md +++ b/README.md @@ -114,12 +114,8 @@ convenient directory location (e.g. the `build/` directory in this repo). The build system uses [CMake](https://cmake.org/). To build the gemma inference runtime, create a build directory and generate the build files using `cmake` -from the top-level project directory. Note if you previous ran `cmake` and are -re-running with a different setting, be sure to clean out the `build/` directory -with `rm -rf build/*` (warning this will delete any other files in the `build/` -directory. - -For the 8-bit switched floating point weights (sfp), run cmake with no options: +from the top-level project directory. For the 8-bit switched floating point +weights (sfp), run cmake with no options: #### Unix-like Platforms ```sh diff --git a/configs.h b/configs.h index 4be5f75..ebe6220 100644 --- a/configs.h +++ b/configs.h @@ -18,26 +18,21 @@ #ifndef THIRD_PARTY_GEMMA_CPP_CONFIGS_H_ #define THIRD_PARTY_GEMMA_CPP_CONFIGS_H_ -// Allow changing pre-allocated kv cache size as a compiler flag -#ifndef GEMMA_MAX_SEQLEN -#define GEMMA_MAX_SEQLEN 4096 -#endif // !GEMMA_MAX_SEQLEN - #include namespace gcpp { -static constexpr size_t kSeqLen = GEMMA_MAX_SEQLEN; +static constexpr size_t kSeqLen = 7168; struct ConfigGemma7B { static constexpr int kSeqLen = gcpp::kSeqLen; static constexpr int kVocabSize = 256128; static constexpr int kLayers = 28; static constexpr int kModelDim = 3072; - static constexpr int kFFHiddenDim = 16 * 3072 / 2; // = 24576 + static constexpr int kFFHiddenDim = 16 * 3072 / 2; // = 24576 static constexpr int kHeads = 16; - static constexpr int kKVHeads = 16; // standard MHA - static constexpr int kQKVDim = 256; // query size == key size == value size + static constexpr int kKVHeads = 16; // standard MHA, no GQA or MQA + static constexpr int kQKVDim = 256; // query size == key size == value size static constexpr int kTopK = 1; }; @@ -46,13 +41,13 @@ struct 
ConfigGemma2B { static constexpr int kVocabSize = 256128; static constexpr int kLayers = 18; static constexpr int kModelDim = 2048; - static constexpr int kFFHiddenDim = 16 * 2048 / 2; // = 16384 + static constexpr int kFFHiddenDim = 16 * 2048 / 2; // = 16384 static constexpr int kHeads = 8; static constexpr int kKVHeads = 8; // TODO(austinvhuang): add MQA support - static constexpr int kQKVDim = 256; // query size == key size == value size + static constexpr int kQKVDim = 256; // query size == key size == value size static constexpr int kTopK = 1; }; -} // namespace gcpp +} // namespace gcpp -#endif // THIRD_PARTY_GEMMA_CPP_CONFIGS_H_ +#endif // THIRD_PARTY_GEMMA_CPP_CONFIGS_H_ diff --git a/gemma.h b/gemma.h index f2a130e..5dc9f62 100644 --- a/gemma.h +++ b/gemma.h @@ -25,14 +25,14 @@ #include // copybara:import_next_line:gemma_cpp -#include "compression/compress.h" // SfpStream/NuqStream +#include "compression/compress.h" // SfpStream/NuqStream // copybara:import_next_line:gemma_cpp -#include "configs.h" // kSeqLen +#include "configs.h" // kSeqLen // copybara:import_next_line:gemma_cpp +#include "util/args.h" // ArgsBase #include "hwy/aligned_allocator.h" -#include "hwy/base.h" // hwy::bfloat16_t +#include "hwy/base.h" // hwy::bfloat16_t #include "hwy/contrib/thread_pool/thread_pool.h" -#include "util/args.h" // ArgsBase // copybara:import_next_line:sentencepiece #include "src/sentencepiece_processor.h" @@ -42,7 +42,7 @@ namespace gcpp { // float, hwy::bfloat16_t, SfpStream, NuqStream #ifndef GEMMA_WEIGHT_T #define GEMMA_WEIGHT_T SfpStream -#endif // !GEMMA_WEIGHT_T +#endif // !GEMMA_WEIGHT_T using WeightT = GEMMA_WEIGHT_T; using EmbedderInputT = hwy::bfloat16_t; @@ -51,9 +51,9 @@ constexpr bool kSystemPrompt = false; struct KVCache { hwy::AlignedFreeUniquePtr - key_cache; // batch_size * kSeqLen * kLayers * kKVHeads * kQKVDim + key_cache; // batch_size * kSeqLen * kLayers * kKVHeads * kQKVDim hwy::AlignedFreeUniquePtr - value_cache; // batch_size * kSeqLen * 
kLayers * kKVHeads * kQKVDim + value_cache; // batch_size * kSeqLen * kLayers * kKVHeads * kQKVDim }; // Model variants: see configs.h for details. @@ -61,9 +61,9 @@ enum class Model { GEMMA_2B, GEMMA_7B }; enum class ModelTraining { GEMMA_IT, GEMMA_PT }; struct LoaderArgs : public ArgsBase { - LoaderArgs(int argc, char *argv[]) { InitAndParse(argc, argv); } + LoaderArgs(int argc, char* argv[]) { InitAndParse(argc, argv); } - static std::string ToLower(const std::string &text) { + static std::string ToLower(const std::string& text) { std::string result = text; std::transform(begin(result), end(result), begin(result), [](unsigned char c) { return std::tolower(c); }); @@ -89,7 +89,7 @@ struct LoaderArgs : public ArgsBase { } // Returns error string or nullptr if OK. - const char *Validate() const { + const char* Validate() const { const std::string model_type_lc = ToLower(model_type); if (model_type_lc != "2b-pt" && model_type_lc != "7b-pt" && model_type_lc != "2b-it" && model_type_lc != "7b-it") { @@ -111,11 +111,12 @@ struct LoaderArgs : public ArgsBase { } Path tokenizer; - Path model; // uncompressed weights OR - Path cache; // compressed weights + Path model; // uncompressed weights OR + Path cache; // compressed weights std::string model_type; - template void ForEach(const Visitor &visitor) { + template + void ForEach(const Visitor& visitor) { visitor(tokenizer, "tokenizer", Path(), "Path name of tokenizer model file. (required)"); visitor( @@ -138,10 +139,10 @@ struct LoaderArgs : public ArgsBase { struct GemmaInterface; struct Gemma { - Gemma(const LoaderArgs &args, hwy::ThreadPool &pool); - ~Gemma(); // must be defined after GemmaInterface's dtor is defined. + Gemma(const LoaderArgs& args, hwy::ThreadPool& pool); + ~Gemma(); // must be defined after GemmaInterface's dtor is defined. 
- const sentencepiece::SentencePieceProcessor &Tokenizer() const; + const sentencepiece::SentencePieceProcessor& Tokenizer() const; std::unique_ptr impl_; gcpp::ModelTraining model_training; @@ -153,7 +154,7 @@ using StreamFunc = std::function; using AcceptFunc = std::function; struct InferenceArgs : public ArgsBase { - InferenceArgs(int argc, char *argv[]) { InitAndParse(argc, argv); } + InferenceArgs(int argc, char* argv[]) { InitAndParse(argc, argv); } size_t max_tokens; size_t max_generated_tokens; @@ -163,7 +164,7 @@ struct InferenceArgs : public ArgsBase { bool multiturn; // Returns error string or nullptr if OK. - const char *Validate() const { + const char* Validate() const { if (max_tokens > gcpp::kSeqLen) { return "max_tokens is larger than the maximum sequence length (see " "configs.h)."; @@ -175,7 +176,8 @@ struct InferenceArgs : public ArgsBase { return nullptr; } - template void ForEach(const Visitor &visitor) { + template + void ForEach(const Visitor& visitor) { visitor(max_tokens, "max_tokens", size_t{3072}, "Maximum number of tokens in prompt + generation."); visitor(max_generated_tokens, "max_generated_tokens", size_t{2048}, @@ -184,21 +186,22 @@ struct InferenceArgs : public ArgsBase { visitor(temperature, "temperature", 1.0f, "Temperature for top-K", 2); visitor(deterministic, "deterministic", false, "Make top-k sampling deterministic", 2); - visitor(multiturn, "multiturn", false, + visitor(multiturn, "multiturn", true, "Multiturn mode (if 0, this clears the KV cache after every " - "interaction without quitting)\n Default = 0 (conversation resets every turn)"); + "interaction without quitting)", + 2); } }; -void GenerateGemma(Gemma &gemma, const InferenceArgs &args, - const std::vector &prompt, size_t start_pos, - hwy::ThreadPool &pool, hwy::ThreadPool &inner_pool, - const StreamFunc &stream_token, - const AcceptFunc &accept_token, std::mt19937 &g, +void GenerateGemma(Gemma& gemma, const InferenceArgs& args, + const std::vector& prompt, size_t 
start_pos, + hwy::ThreadPool& pool, hwy::ThreadPool& inner_pool, + const StreamFunc& stream_token, + const AcceptFunc& accept_token, std::mt19937& g, int verbosity); constexpr int EOS_ID = 1; -} // namespace gcpp +} // namespace gcpp -#endif // THIRD_PARTY_GEMMA_CPP_GEMMA_H_ +#endif // THIRD_PARTY_GEMMA_CPP_GEMMA_H_ From afc354dcb19574fe1ddafaf7ffadb9292b3871ab Mon Sep 17 00:00:00 2001 From: Dan Zheng Date: Mon, 26 Feb 2024 19:04:33 -0800 Subject: [PATCH 15/26] Import from GitHub. PiperOrigin-RevId: 610595796 --- DEVELOPERS.md | 18 ++++++++++++++++++ README.md | 8 ++++++-- configs.h | 21 +++++++++++++-------- gemma.h | 51 ++++++++++++++++++++++++--------------------------- 4 files changed, 61 insertions(+), 37 deletions(-) diff --git a/DEVELOPERS.md b/DEVELOPERS.md index d06b0f8..bdc02c0 100644 --- a/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -70,3 +70,21 @@ The implementation code is roughly split into 4 layers, from high to low level: 4. Backend (`highway`) - Low-level hardware interface (SIMD in the case of highway) supporting the implementations in (3). + +## Compile-Time Flags (Advanced) + +There are several compile-time flags to be aware of (note these may or may not +be exposed to the build system): + +- `GEMMA_WEIGHT_T` : Sets the level of compression for weights (surfaced as + WEIGHT_TYPE in CMakeLists.txt). Currently this should be set to `SfpStream` + (default, if no flag is specified) for 8-bit SFP, or `hwy::bfloat16_t` to + enable for higher-fidelity (but slower) bfloat16 support. This is defined in + `gemma.h`. +- `GEMMA_MAX_SEQ_LEN` : Sets maximum sequence length to preallocate for the KV + Cache. The default is 4096 tokens but can be overridden. This is not exposed + through `CMakeLists.txt` yet. + +In the medium term both of these will likely be deprecated in favor of handling +options at runtime - allowing for multiple weight compression schemes in a single +build and dynamically resizes the KV cache as needed. 
diff --git a/README.md b/README.md index 5932726..8db6862 100644 --- a/README.md +++ b/README.md @@ -114,8 +114,12 @@ convenient directory location (e.g. the `build/` directory in this repo). The build system uses [CMake](https://cmake.org/). To build the gemma inference runtime, create a build directory and generate the build files using `cmake` -from the top-level project directory. For the 8-bit switched floating point -weights (sfp), run cmake with no options: +from the top-level project directory. Note if you previous ran `cmake` and are +re-running with a different setting, be sure to clean out the `build/` directory +with `rm -rf build/*` (warning this will delete any other files in the `build/` +directory. + +For the 8-bit switched floating point weights (sfp), run cmake with no options: #### Unix-like Platforms ```sh diff --git a/configs.h b/configs.h index ebe6220..4be5f75 100644 --- a/configs.h +++ b/configs.h @@ -18,21 +18,26 @@ #ifndef THIRD_PARTY_GEMMA_CPP_CONFIGS_H_ #define THIRD_PARTY_GEMMA_CPP_CONFIGS_H_ +// Allow changing pre-allocated kv cache size as a compiler flag +#ifndef GEMMA_MAX_SEQLEN +#define GEMMA_MAX_SEQLEN 4096 +#endif // !GEMMA_MAX_SEQLEN + #include namespace gcpp { -static constexpr size_t kSeqLen = 7168; +static constexpr size_t kSeqLen = GEMMA_MAX_SEQLEN; struct ConfigGemma7B { static constexpr int kSeqLen = gcpp::kSeqLen; static constexpr int kVocabSize = 256128; static constexpr int kLayers = 28; static constexpr int kModelDim = 3072; - static constexpr int kFFHiddenDim = 16 * 3072 / 2; // = 24576 + static constexpr int kFFHiddenDim = 16 * 3072 / 2; // = 24576 static constexpr int kHeads = 16; - static constexpr int kKVHeads = 16; // standard MHA, no GQA or MQA - static constexpr int kQKVDim = 256; // query size == key size == value size + static constexpr int kKVHeads = 16; // standard MHA + static constexpr int kQKVDim = 256; // query size == key size == value size static constexpr int kTopK = 1; }; @@ -41,13 +46,13 @@ struct 
ConfigGemma2B { static constexpr int kVocabSize = 256128; static constexpr int kLayers = 18; static constexpr int kModelDim = 2048; - static constexpr int kFFHiddenDim = 16 * 2048 / 2; // = 16384 + static constexpr int kFFHiddenDim = 16 * 2048 / 2; // = 16384 static constexpr int kHeads = 8; static constexpr int kKVHeads = 8; // TODO(austinvhuang): add MQA support - static constexpr int kQKVDim = 256; // query size == key size == value size + static constexpr int kQKVDim = 256; // query size == key size == value size static constexpr int kTopK = 1; }; -} // namespace gcpp +} // namespace gcpp -#endif // THIRD_PARTY_GEMMA_CPP_CONFIGS_H_ +#endif // THIRD_PARTY_GEMMA_CPP_CONFIGS_H_ diff --git a/gemma.h b/gemma.h index 5dc9f62..1e76a37 100644 --- a/gemma.h +++ b/gemma.h @@ -42,7 +42,7 @@ namespace gcpp { // float, hwy::bfloat16_t, SfpStream, NuqStream #ifndef GEMMA_WEIGHT_T #define GEMMA_WEIGHT_T SfpStream -#endif // !GEMMA_WEIGHT_T +#endif // !GEMMA_WEIGHT_T using WeightT = GEMMA_WEIGHT_T; using EmbedderInputT = hwy::bfloat16_t; @@ -51,9 +51,9 @@ constexpr bool kSystemPrompt = false; struct KVCache { hwy::AlignedFreeUniquePtr - key_cache; // batch_size * kSeqLen * kLayers * kKVHeads * kQKVDim + key_cache; // batch_size * kSeqLen * kLayers * kKVHeads * kQKVDim hwy::AlignedFreeUniquePtr - value_cache; // batch_size * kSeqLen * kLayers * kKVHeads * kQKVDim + value_cache; // batch_size * kSeqLen * kLayers * kKVHeads * kQKVDim }; // Model variants: see configs.h for details. 
@@ -61,9 +61,9 @@ enum class Model { GEMMA_2B, GEMMA_7B }; enum class ModelTraining { GEMMA_IT, GEMMA_PT }; struct LoaderArgs : public ArgsBase { - LoaderArgs(int argc, char* argv[]) { InitAndParse(argc, argv); } + LoaderArgs(int argc, char *argv[]) { InitAndParse(argc, argv); } - static std::string ToLower(const std::string& text) { + static std::string ToLower(const std::string &text) { std::string result = text; std::transform(begin(result), end(result), begin(result), [](unsigned char c) { return std::tolower(c); }); @@ -89,7 +89,7 @@ struct LoaderArgs : public ArgsBase { } // Returns error string or nullptr if OK. - const char* Validate() const { + const char *Validate() const { const std::string model_type_lc = ToLower(model_type); if (model_type_lc != "2b-pt" && model_type_lc != "7b-pt" && model_type_lc != "2b-it" && model_type_lc != "7b-it") { @@ -111,12 +111,11 @@ struct LoaderArgs : public ArgsBase { } Path tokenizer; - Path model; // uncompressed weights OR - Path cache; // compressed weights + Path model; // uncompressed weights OR + Path cache; // compressed weights std::string model_type; - template - void ForEach(const Visitor& visitor) { + template void ForEach(const Visitor &visitor) { visitor(tokenizer, "tokenizer", Path(), "Path name of tokenizer model file. (required)"); visitor( @@ -139,10 +138,10 @@ struct LoaderArgs : public ArgsBase { struct GemmaInterface; struct Gemma { - Gemma(const LoaderArgs& args, hwy::ThreadPool& pool); - ~Gemma(); // must be defined after GemmaInterface's dtor is defined. + Gemma(const LoaderArgs &args, hwy::ThreadPool &pool); + ~Gemma(); // must be defined after GemmaInterface's dtor is defined. 
- const sentencepiece::SentencePieceProcessor& Tokenizer() const; + const sentencepiece::SentencePieceProcessor &Tokenizer() const; std::unique_ptr impl_; gcpp::ModelTraining model_training; @@ -154,7 +153,7 @@ using StreamFunc = std::function; using AcceptFunc = std::function; struct InferenceArgs : public ArgsBase { - InferenceArgs(int argc, char* argv[]) { InitAndParse(argc, argv); } + InferenceArgs(int argc, char *argv[]) { InitAndParse(argc, argv); } size_t max_tokens; size_t max_generated_tokens; @@ -164,7 +163,7 @@ struct InferenceArgs : public ArgsBase { bool multiturn; // Returns error string or nullptr if OK. - const char* Validate() const { + const char *Validate() const { if (max_tokens > gcpp::kSeqLen) { return "max_tokens is larger than the maximum sequence length (see " "configs.h)."; @@ -176,8 +175,7 @@ struct InferenceArgs : public ArgsBase { return nullptr; } - template - void ForEach(const Visitor& visitor) { + template void ForEach(const Visitor &visitor) { visitor(max_tokens, "max_tokens", size_t{3072}, "Maximum number of tokens in prompt + generation."); visitor(max_generated_tokens, "max_generated_tokens", size_t{2048}, @@ -186,22 +184,21 @@ struct InferenceArgs : public ArgsBase { visitor(temperature, "temperature", 1.0f, "Temperature for top-K", 2); visitor(deterministic, "deterministic", false, "Make top-k sampling deterministic", 2); - visitor(multiturn, "multiturn", true, + visitor(multiturn, "multiturn", false, "Multiturn mode (if 0, this clears the KV cache after every " - "interaction without quitting)", - 2); + "interaction without quitting)\n Default = 0 (conversation resets every turn)"); } }; -void GenerateGemma(Gemma& gemma, const InferenceArgs& args, - const std::vector& prompt, size_t start_pos, - hwy::ThreadPool& pool, hwy::ThreadPool& inner_pool, - const StreamFunc& stream_token, - const AcceptFunc& accept_token, std::mt19937& g, +void GenerateGemma(Gemma &gemma, const InferenceArgs &args, + const std::vector &prompt, size_t 
start_pos, + hwy::ThreadPool &pool, hwy::ThreadPool &inner_pool, + const StreamFunc &stream_token, + const AcceptFunc &accept_token, std::mt19937 &g, int verbosity); constexpr int EOS_ID = 1; -} // namespace gcpp +} // namespace gcpp -#endif // THIRD_PARTY_GEMMA_CPP_GEMMA_H_ +#endif // THIRD_PARTY_GEMMA_CPP_GEMMA_H_ From b3fecef45dbc4d04aa53658347f06de5449aefef Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Mon, 26 Feb 2024 22:31:03 -0800 Subject: [PATCH 16/26] Warning fix: sign cast PiperOrigin-RevId: 610635789 --- compression/stats.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compression/stats.cc b/compression/stats.cc index 2013422..8e66119 100644 --- a/compression/stats.cc +++ b/compression/stats.cc @@ -114,7 +114,7 @@ std::string Stats::ToString(int exclude) const { pos += ret; } - HWY_ASSERT(pos < sizeof(buf)); + HWY_ASSERT(pos < static_cast(sizeof(buf))); return buf; } From 179ecf9e7852afbb984de6b13f410559ad464c26 Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Mon, 26 Feb 2024 22:45:39 -0800 Subject: [PATCH 17/26] Warn instead of assert for setaffinity. Fixes #49 PiperOrigin-RevId: 610638517 --- util/app.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/util/app.h b/util/app.h index bd665a4..5cd316d 100644 --- a/util/app.h +++ b/util/app.h @@ -20,8 +20,11 @@ #if HWY_OS_LINUX #include + +#include // IDE does not recognize errno.h as providing errno. #endif #include +#include #include // std::clamp #include // NOLINT> @@ -38,7 +41,13 @@ static inline void PinThreadToCore(size_t cpu_index) { cpu_set_t cset; // bit array CPU_ZERO(&cset); // clear all CPU_SET(cpu_index, &cset); // set bit indicating which processor to run on. - HWY_ASSERT(0 == sched_setaffinity(0, sizeof(cset), &cset)); + const int err = sched_setaffinity(0, sizeof(cset), &cset); + if (err != 0) { + fprintf(stderr, + "sched_setaffinity returned %d, errno %d. 
Can happen if running in " + "a container; this warning is safe to ignore.\n", + err, errno); + } #else (void)cpu_index; #endif From 9cdc9223bce51a88de74022f33666309556f14c6 Mon Sep 17 00:00:00 2001 From: austinvhuang Date: Tue, 27 Feb 2024 14:22:02 -0500 Subject: [PATCH 18/26] clean up formatting after 129e66ada2b4e461bdf28b88b70cd2465cb213e4, add .clang-format defaults, minor updates to DEVELOPERS doc --- .clang-format | 235 ++++++++++++++++++++++++++++++++++++++++++++++++++ DEVELOPERS.md | 12 +++ configs.h | 18 ++-- gemma.h | 51 +++++------ 4 files changed, 282 insertions(+), 34 deletions(-) create mode 100644 .clang-format diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..c8f8dba --- /dev/null +++ b/.clang-format @@ -0,0 +1,235 @@ +--- +Language: Cpp +AccessModifierOffset: -2 +AlignAfterOpenBracket: Align +AlignArrayOfStructures: None +AlignConsecutiveAssignments: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + PadOperators: true +AlignConsecutiveBitFields: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + PadOperators: false +AlignConsecutiveDeclarations: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + PadOperators: false +AlignConsecutiveMacros: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + PadOperators: false +AlignConsecutiveShortCaseStatements: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCaseColons: false +AlignEscapedNewlines: Right +AlignOperands: Align +AlignTrailingComments: + Kind: Always + OverEmptyLines: 0 +AllowAllArgumentsOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortEnumsOnASingleLine: true +AllowShortFunctionsOnASingleLine: All +AllowShortIfStatementsOnASingleLine: Never +AllowShortLambdasOnASingleLine: 
All +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: MultiLine +AttributeMacros: + - __capability +BinPackArguments: true +BinPackParameters: true +BitFieldColonSpacing: Both +BraceWrapping: + AfterCaseLabel: false + AfterClass: false + AfterControlStatement: Never + AfterEnum: false + AfterExternBlock: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + BeforeCatch: false + BeforeElse: false + BeforeLambdaBody: false + BeforeWhile: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true +BreakAfterAttributes: Never +BreakAfterJavaFieldAnnotations: false +BreakArrays: true +BreakBeforeBinaryOperators: None +BreakBeforeConceptDeclarations: Always +BreakBeforeBraces: Attach +BreakBeforeInlineASMColon: OnlyMultiline +BreakBeforeTernaryOperators: true +BreakConstructorInitializers: BeforeColon +BreakInheritanceList: BeforeColon +BreakStringLiterals: true +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +DisableFormat: false +EmptyLineAfterAccessModifier: Never +EmptyLineBeforeAccessModifier: LogicalBlock +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IfMacros: + - KJ_IF_MAYBE +IncludeBlocks: Preserve +IncludeCategories: + - Regex: '^"(llvm|llvm-c|clang|clang-c)/' + Priority: 2 + SortPriority: 0 + CaseSensitive: false + - Regex: '^(<|"(gtest|gmock|isl|json)/)' + Priority: 3 + SortPriority: 0 + CaseSensitive: false + - Regex: '.*' + Priority: 1 + SortPriority: 0 + CaseSensitive: false +IncludeIsMainRegex: '(Test)?$' +IncludeIsMainSourceRegex: '' +IndentAccessModifiers: false 
+IndentCaseBlocks: false +IndentCaseLabels: false +IndentExternBlock: AfterExternBlock +IndentGotoLabels: true +IndentPPDirectives: None +IndentRequiresClause: true +IndentWidth: 2 +IndentWrappedFunctionNames: false +InsertBraces: false +InsertNewlineAtEOF: false +InsertTrailingCommas: None +IntegerLiteralSeparator: + Binary: 0 + BinaryMinDigits: 0 + Decimal: 0 + DecimalMinDigits: 0 + Hex: 0 + HexMinDigits: 0 +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: true +KeepEmptyLinesAtEOF: false +LambdaBodyIndentation: Signature +LineEnding: DeriveLF +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Auto +ObjCBlockIndentWidth: 2 +ObjCBreakBeforeNestedBlockParam: true +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PackConstructorInitializers: BinPack +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakOpenParenthesis: 0 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyIndentedWhitespace: 0 +PenaltyReturnTypeOnItsOwnLine: 60 +PointerAlignment: Left +PPIndentWidth: -1 +QualifierAlignment: Leave +ReferenceAlignment: Left +ReflowComments: true +RemoveBracesLLVM: false +RemoveParentheses: Leave +RemoveSemicolon: false +RequiresClausePosition: OwnLine +RequiresExpressionIndentation: OuterScope +SeparateDefinitionBlocks: Leave +ShortNamespaceLines: 1 +SortIncludes: CaseSensitive +SortJavaStaticImport: Before +SortUsingDeclarations: LexicographicNumeric +SpaceAfterCStyleCast: false +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceAroundPointerQualifiers: Default +SpaceBeforeAssignmentOperators: true +SpaceBeforeCaseColon: false +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeJsonColon: false +SpaceBeforeParens: 
ControlStatements +SpaceBeforeParensOptions: + AfterControlStatements: true + AfterForeachMacros: true + AfterFunctionDefinitionName: false + AfterFunctionDeclarationName: false + AfterIfMacros: true + AfterOverloadedOperator: false + AfterRequiresInClause: false + AfterRequiresInExpression: false + BeforeNonEmptyParentheses: false +SpaceBeforeRangeBasedForLoopColon: true +SpaceBeforeSquareBrackets: false +SpaceInEmptyBlock: false +SpacesBeforeTrailingComments: 2 +SpacesInAngles: Never +SpacesInContainerLiterals: true +SpacesInLineCommentPrefix: + Minimum: 1 + Maximum: -1 +SpacesInParens: Never +SpacesInParensOptions: + InCStyleCasts: false + InConditionalStatements: false + InEmptyParentheses: false + Other: false +SpacesInSquareBrackets: false +Standard: Latest +StatementAttributeLikeMacros: + - Q_EMIT +StatementMacros: + - Q_UNUSED + - QT_REQUIRE_VERSION +TabWidth: 8 +UseTab: Never +VerilogBreakBetweenInstancePorts: true +WhitespaceSensitiveMacros: + - BOOST_PP_STRINGIZE + - CF_SWIFT_NAME + - NS_SWIFT_NAME + - PP_STRINGIZE + - STRINGIZE +... + diff --git a/DEVELOPERS.md b/DEVELOPERS.md index bdc02c0..7aad9d8 100644 --- a/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -71,6 +71,18 @@ The implementation code is roughly split into 4 layers, from high to low level: 4. Backend (`highway`) - Low-level hardware interface (SIMD in the case of highway) supporting the implementations in (3). +Besides these layers, supporting utilities are: + +- `compression/` - model compression operations. the 8-bit switched floating + point model conversion is here. +- `util/` - command line argument handling and any other utilities. + +## Style and Formatting + +A `.clang-format` configuration is provided with our defaults, please run source +files through `clang-format` (or a formatter that produces equivalent behavior) +before finalizing PR for submission. 
+ ## Compile-Time Flags (Advanced) There are several compile-time flags to be aware of (note these may or may not diff --git a/configs.h b/configs.h index 4be5f75..bf25596 100644 --- a/configs.h +++ b/configs.h @@ -21,7 +21,7 @@ // Allow changing pre-allocated kv cache size as a compiler flag #ifndef GEMMA_MAX_SEQLEN #define GEMMA_MAX_SEQLEN 4096 -#endif // !GEMMA_MAX_SEQLEN +#endif // !GEMMA_MAX_SEQLEN #include @@ -34,10 +34,10 @@ struct ConfigGemma7B { static constexpr int kVocabSize = 256128; static constexpr int kLayers = 28; static constexpr int kModelDim = 3072; - static constexpr int kFFHiddenDim = 16 * 3072 / 2; // = 24576 + static constexpr int kFFHiddenDim = 16 * 3072 / 2; // = 24576 static constexpr int kHeads = 16; - static constexpr int kKVHeads = 16; // standard MHA - static constexpr int kQKVDim = 256; // query size == key size == value size + static constexpr int kKVHeads = 16; // standard MHA + static constexpr int kQKVDim = 256; // query size == key size == value size static constexpr int kTopK = 1; }; @@ -46,13 +46,13 @@ struct ConfigGemma2B { static constexpr int kVocabSize = 256128; static constexpr int kLayers = 18; static constexpr int kModelDim = 2048; - static constexpr int kFFHiddenDim = 16 * 2048 / 2; // = 16384 + static constexpr int kFFHiddenDim = 16 * 2048 / 2; // = 16384 static constexpr int kHeads = 8; - static constexpr int kKVHeads = 8; // TODO(austinvhuang): add MQA support - static constexpr int kQKVDim = 256; // query size == key size == value size + static constexpr int kKVHeads = 8; // TODO(austinvhuang): add MQA support + static constexpr int kQKVDim = 256; // query size == key size == value size static constexpr int kTopK = 1; }; -} // namespace gcpp +} // namespace gcpp -#endif // THIRD_PARTY_GEMMA_CPP_CONFIGS_H_ +#endif // THIRD_PARTY_GEMMA_CPP_CONFIGS_H_ diff --git a/gemma.h b/gemma.h index 1e76a37..12c2a77 100644 --- a/gemma.h +++ b/gemma.h @@ -27,12 +27,12 @@ // copybara:import_next_line:gemma_cpp #include 
"compression/compress.h" // SfpStream/NuqStream // copybara:import_next_line:gemma_cpp -#include "configs.h" // kSeqLen +#include "configs.h" // kSeqLen // copybara:import_next_line:gemma_cpp -#include "util/args.h" // ArgsBase #include "hwy/aligned_allocator.h" #include "hwy/base.h" // hwy::bfloat16_t #include "hwy/contrib/thread_pool/thread_pool.h" +#include "util/args.h" // ArgsBase // copybara:import_next_line:sentencepiece #include "src/sentencepiece_processor.h" @@ -42,7 +42,7 @@ namespace gcpp { // float, hwy::bfloat16_t, SfpStream, NuqStream #ifndef GEMMA_WEIGHT_T #define GEMMA_WEIGHT_T SfpStream -#endif // !GEMMA_WEIGHT_T +#endif // !GEMMA_WEIGHT_T using WeightT = GEMMA_WEIGHT_T; using EmbedderInputT = hwy::bfloat16_t; @@ -51,9 +51,9 @@ constexpr bool kSystemPrompt = false; struct KVCache { hwy::AlignedFreeUniquePtr - key_cache; // batch_size * kSeqLen * kLayers * kKVHeads * kQKVDim + key_cache; // batch_size * kSeqLen * kLayers * kKVHeads * kQKVDim hwy::AlignedFreeUniquePtr - value_cache; // batch_size * kSeqLen * kLayers * kKVHeads * kQKVDim + value_cache; // batch_size * kSeqLen * kLayers * kKVHeads * kQKVDim }; // Model variants: see configs.h for details. @@ -61,9 +61,9 @@ enum class Model { GEMMA_2B, GEMMA_7B }; enum class ModelTraining { GEMMA_IT, GEMMA_PT }; struct LoaderArgs : public ArgsBase { - LoaderArgs(int argc, char *argv[]) { InitAndParse(argc, argv); } + LoaderArgs(int argc, char* argv[]) { InitAndParse(argc, argv); } - static std::string ToLower(const std::string &text) { + static std::string ToLower(const std::string& text) { std::string result = text; std::transform(begin(result), end(result), begin(result), [](unsigned char c) { return std::tolower(c); }); @@ -89,7 +89,7 @@ struct LoaderArgs : public ArgsBase { } // Returns error string or nullptr if OK. 
- const char *Validate() const { + const char* Validate() const { const std::string model_type_lc = ToLower(model_type); if (model_type_lc != "2b-pt" && model_type_lc != "7b-pt" && model_type_lc != "2b-it" && model_type_lc != "7b-it") { @@ -111,11 +111,11 @@ struct LoaderArgs : public ArgsBase { } Path tokenizer; - Path model; // uncompressed weights OR - Path cache; // compressed weights + Path model; // uncompressed weights OR + Path cache; // compressed weights std::string model_type; - template void ForEach(const Visitor &visitor) { + template void ForEach(const Visitor& visitor) { visitor(tokenizer, "tokenizer", Path(), "Path name of tokenizer model file. (required)"); visitor( @@ -138,10 +138,10 @@ struct LoaderArgs : public ArgsBase { struct GemmaInterface; struct Gemma { - Gemma(const LoaderArgs &args, hwy::ThreadPool &pool); - ~Gemma(); // must be defined after GemmaInterface's dtor is defined. + Gemma(const LoaderArgs& args, hwy::ThreadPool& pool); + ~Gemma(); // must be defined after GemmaInterface's dtor is defined. - const sentencepiece::SentencePieceProcessor &Tokenizer() const; + const sentencepiece::SentencePieceProcessor& Tokenizer() const; std::unique_ptr impl_; gcpp::ModelTraining model_training; @@ -153,7 +153,7 @@ using StreamFunc = std::function; using AcceptFunc = std::function; struct InferenceArgs : public ArgsBase { - InferenceArgs(int argc, char *argv[]) { InitAndParse(argc, argv); } + InferenceArgs(int argc, char* argv[]) { InitAndParse(argc, argv); } size_t max_tokens; size_t max_generated_tokens; @@ -163,7 +163,7 @@ struct InferenceArgs : public ArgsBase { bool multiturn; // Returns error string or nullptr if OK. 
- const char *Validate() const { + const char* Validate() const { if (max_tokens > gcpp::kSeqLen) { return "max_tokens is larger than the maximum sequence length (see " "configs.h)."; @@ -175,7 +175,7 @@ struct InferenceArgs : public ArgsBase { return nullptr; } - template void ForEach(const Visitor &visitor) { + template void ForEach(const Visitor& visitor) { visitor(max_tokens, "max_tokens", size_t{3072}, "Maximum number of tokens in prompt + generation."); visitor(max_generated_tokens, "max_generated_tokens", size_t{2048}, @@ -186,19 +186,20 @@ struct InferenceArgs : public ArgsBase { "Make top-k sampling deterministic", 2); visitor(multiturn, "multiturn", false, "Multiturn mode (if 0, this clears the KV cache after every " - "interaction without quitting)\n Default = 0 (conversation resets every turn)"); + "interaction without quitting)\n Default = 0 (conversation " + "resets every turn)"); } }; -void GenerateGemma(Gemma &gemma, const InferenceArgs &args, - const std::vector &prompt, size_t start_pos, - hwy::ThreadPool &pool, hwy::ThreadPool &inner_pool, - const StreamFunc &stream_token, - const AcceptFunc &accept_token, std::mt19937 &g, +void GenerateGemma(Gemma& gemma, const InferenceArgs& args, + const std::vector& prompt, size_t start_pos, + hwy::ThreadPool& pool, hwy::ThreadPool& inner_pool, + const StreamFunc& stream_token, + const AcceptFunc& accept_token, std::mt19937& g, int verbosity); constexpr int EOS_ID = 1; -} // namespace gcpp +} // namespace gcpp -#endif // THIRD_PARTY_GEMMA_CPP_GEMMA_H_ +#endif // THIRD_PARTY_GEMMA_CPP_GEMMA_H_ From 874deee3028716aa5c89c6e8d903fb10904ef1dc Mon Sep 17 00:00:00 2001 From: Dan Zheng Date: Tue, 27 Feb 2024 11:32:33 -0800 Subject: [PATCH 19/26] Update DEVELOPERS.md --- DEVELOPERS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DEVELOPERS.md b/DEVELOPERS.md index 7aad9d8..557670a 100644 --- a/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -73,7 +73,7 @@ The implementation code is roughly split into 4 
layers, from high to low level: Besides these layers, supporting utilities are: -- `compression/` - model compression operations. the 8-bit switched floating +- `compression/` - model compression operations. The 8-bit switched floating point model conversion is here. - `util/` - command line argument handling and any other utilities. From f70d2de16f8acf7cad78036aa9a24e1e6c441b59 Mon Sep 17 00:00:00 2001 From: austinvhuang Date: Tue, 27 Feb 2024 15:44:03 -0500 Subject: [PATCH 20/26] use `style=Google` - dumped for .clang-format, gemma.h updated --- .clang-format | 81 ++++++++++++++++++++++++++++++++++++--------------- gemma.h | 6 ++-- 2 files changed, 62 insertions(+), 25 deletions(-) diff --git a/.clang-format b/.clang-format index c8f8dba..523dc01 100644 --- a/.clang-format +++ b/.clang-format @@ -1,6 +1,7 @@ --- Language: Cpp -AccessModifierOffset: -2 +# BasedOnStyle: Google +AccessModifierOffset: -1 AlignAfterOpenBracket: Align AlignArrayOfStructures: None AlignConsecutiveAssignments: @@ -32,7 +33,7 @@ AlignConsecutiveShortCaseStatements: AcrossEmptyLines: false AcrossComments: false AlignCaseColons: false -AlignEscapedNewlines: Right +AlignEscapedNewlines: Left AlignOperands: Align AlignTrailingComments: Kind: Always @@ -43,13 +44,13 @@ AllowShortBlocksOnASingleLine: Never AllowShortCaseLabelsOnASingleLine: false AllowShortEnumsOnASingleLine: true AllowShortFunctionsOnASingleLine: All -AllowShortIfStatementsOnASingleLine: Never +AllowShortIfStatementsOnASingleLine: WithoutElse AllowShortLambdasOnASingleLine: All -AllowShortLoopsOnASingleLine: false +AllowShortLoopsOnASingleLine: true AlwaysBreakAfterDefinitionReturnType: None AlwaysBreakAfterReturnType: None -AlwaysBreakBeforeMultilineStrings: false -AlwaysBreakTemplateDeclarations: MultiLine +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: Yes AttributeMacros: - __capability BinPackArguments: true @@ -91,7 +92,7 @@ CompactNamespaces: false ConstructorInitializerIndentWidth: 4 
ContinuationIndentWidth: 4 Cpp11BracedListStyle: true -DerivePointerAlignment: false +DerivePointerAlignment: true DisableFormat: false EmptyLineAfterAccessModifier: Never EmptyLineBeforeAccessModifier: LogicalBlock @@ -103,25 +104,29 @@ ForEachMacros: - BOOST_FOREACH IfMacros: - KJ_IF_MAYBE -IncludeBlocks: Preserve +IncludeBlocks: Regroup IncludeCategories: - - Regex: '^"(llvm|llvm-c|clang|clang-c)/' + - Regex: '^' Priority: 2 SortPriority: 0 CaseSensitive: false - - Regex: '^(<|"(gtest|gmock|isl|json)/)' - Priority: 3 - SortPriority: 0 - CaseSensitive: false - - Regex: '.*' + - Regex: '^<.*\.h>' Priority: 1 SortPriority: 0 CaseSensitive: false -IncludeIsMainRegex: '(Test)?$' + - Regex: '^<.*' + Priority: 2 + SortPriority: 0 + CaseSensitive: false + - Regex: '.*' + Priority: 3 + SortPriority: 0 + CaseSensitive: false +IncludeIsMainRegex: '([-_](test|unittest))?$' IncludeIsMainSourceRegex: '' IndentAccessModifiers: false IndentCaseBlocks: false -IndentCaseLabels: false +IndentCaseLabels: true IndentExternBlock: AfterExternBlock IndentGotoLabels: true IndentPPDirectives: None @@ -140,7 +145,7 @@ IntegerLiteralSeparator: HexMinDigits: 0 JavaScriptQuotes: Leave JavaScriptWrapImports: true -KeepEmptyLinesAtTheStartOfBlocks: true +KeepEmptyLinesAtTheStartOfBlocks: false KeepEmptyLinesAtEOF: false LambdaBodyIndentation: Signature LineEnding: DeriveLF @@ -148,14 +153,14 @@ MacroBlockBegin: '' MacroBlockEnd: '' MaxEmptyLinesToKeep: 1 NamespaceIndentation: None -ObjCBinPackProtocolList: Auto +ObjCBinPackProtocolList: Never ObjCBlockIndentWidth: 2 ObjCBreakBeforeNestedBlockParam: true ObjCSpaceAfterProperty: false ObjCSpaceBeforeProtocolList: true -PackConstructorInitializers: BinPack +PackConstructorInitializers: NextLine PenaltyBreakAssignment: 2 -PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakBeforeFirstCallParameter: 1 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 PenaltyBreakOpenParenthesis: 0 @@ -163,11 +168,41 @@ PenaltyBreakString: 1000 
PenaltyBreakTemplateDeclaration: 10 PenaltyExcessCharacter: 1000000 PenaltyIndentedWhitespace: 0 -PenaltyReturnTypeOnItsOwnLine: 60 +PenaltyReturnTypeOnItsOwnLine: 200 PointerAlignment: Left PPIndentWidth: -1 QualifierAlignment: Leave -ReferenceAlignment: Left +RawStringFormats: + - Language: Cpp + Delimiters: + - cc + - CC + - cpp + - Cpp + - CPP + - 'c++' + - 'C++' + CanonicalDelimiter: '' + BasedOnStyle: google + - Language: TextProto + Delimiters: + - pb + - PB + - proto + - PROTO + EnclosingFunctions: + - EqualsProto + - EquivToProto + - PARSE_PARTIAL_TEXT_PROTO + - PARSE_TEST_PROTO + - PARSE_TEXT_PROTO + - ParseTextOrDie + - ParseTextProtoOrDie + - ParseTestProto + - ParsePartialTestProto + CanonicalDelimiter: pb + BasedOnStyle: google +ReferenceAlignment: Pointer ReflowComments: true RemoveBracesLLVM: false RemoveParentheses: Leave @@ -216,7 +251,7 @@ SpacesInParensOptions: InEmptyParentheses: false Other: false SpacesInSquareBrackets: false -Standard: Latest +Standard: Auto StatementAttributeLikeMacros: - Q_EMIT StatementMacros: diff --git a/gemma.h b/gemma.h index 12c2a77..a218878 100644 --- a/gemma.h +++ b/gemma.h @@ -115,7 +115,8 @@ struct LoaderArgs : public ArgsBase { Path cache; // compressed weights std::string model_type; - template void ForEach(const Visitor& visitor) { + template + void ForEach(const Visitor& visitor) { visitor(tokenizer, "tokenizer", Path(), "Path name of tokenizer model file. 
(required)"); visitor( @@ -175,7 +176,8 @@ struct InferenceArgs : public ArgsBase { return nullptr; } - template void ForEach(const Visitor& visitor) { + template + void ForEach(const Visitor& visitor) { visitor(max_tokens, "max_tokens", size_t{3072}, "Maximum number of tokens in prompt + generation."); visitor(max_generated_tokens, "max_generated_tokens", size_t{2048}, From 8f3bd63bf74805851147a22cd3b5fbeacd8b5fc4 Mon Sep 17 00:00:00 2001 From: austinvhuang Date: Tue, 27 Feb 2024 17:11:15 -0500 Subject: [PATCH 21/26] Fix copybara include path substitutions errors (which break the google3 build) arising from clang-format linter automation --- .clang-format | 3 +-- DEVELOPERS.md | 8 ++++++++ gemma.h | 2 +- util/make_clang_format_config.sh | 4 ++++ 4 files changed, 14 insertions(+), 3 deletions(-) create mode 100755 util/make_clang_format_config.sh diff --git a/.clang-format b/.clang-format index 523dc01..3465c13 100644 --- a/.clang-format +++ b/.clang-format @@ -1,6 +1,5 @@ --- Language: Cpp -# BasedOnStyle: Google AccessModifierOffset: -1 AlignAfterOpenBracket: Align AlignArrayOfStructures: None @@ -211,7 +210,7 @@ RequiresClausePosition: OwnLine RequiresExpressionIndentation: OuterScope SeparateDefinitionBlocks: Leave ShortNamespaceLines: 1 -SortIncludes: CaseSensitive +SortIncludes: Never SortJavaStaticImport: Before SortUsingDeclarations: LexicographicNumeric SpaceAfterCStyleCast: false diff --git a/DEVELOPERS.md b/DEVELOPERS.md index 557670a..8e09ee8 100644 --- a/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -83,6 +83,14 @@ A `.clang-format` configuration is provided with our defaults, please run source files through `clang-format` (or a formatter that produces equivalent behavior) before finalizing PR for submission. +The `.clang-format` is the google style (as of feb 27 2024), except with +`SortIncludes` set to `false` to avoid breaking copybara path substitutions +which rely on adjacent comments. 
+ +For transparency, `.clang-format` can be reproduced using the +`make_clang_format_config.sh` script in `utils/` run with `clang-format` version +17.0.6. + ## Compile-Time Flags (Advanced) There are several compile-time flags to be aware of (note these may or may not diff --git a/gemma.h b/gemma.h index a218878..2d5e713 100644 --- a/gemma.h +++ b/gemma.h @@ -29,10 +29,10 @@ // copybara:import_next_line:gemma_cpp #include "configs.h" // kSeqLen // copybara:import_next_line:gemma_cpp +#include "util/args.h" // ArgsBase #include "hwy/aligned_allocator.h" #include "hwy/base.h" // hwy::bfloat16_t #include "hwy/contrib/thread_pool/thread_pool.h" -#include "util/args.h" // ArgsBase // copybara:import_next_line:sentencepiece #include "src/sentencepiece_processor.h" diff --git a/util/make_clang_format_config.sh b/util/make_clang_format_config.sh new file mode 100755 index 0000000..5261e2c --- /dev/null +++ b/util/make_clang_format_config.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +# Reproduces .clang-format file. 
+clang-format -style="{BasedOnStyle: Google, SortIncludes: false}" -dump-config > .clang-format From d37f9c36042152037c5a99dffea47b448f2d876b Mon Sep 17 00:00:00 2001 From: austinvhuang Date: Tue, 27 Feb 2024 21:23:33 -0500 Subject: [PATCH 22/26] re-enable SortIncludes to conform to vanilla Google style, add comment lines to #includes in gemma.h as barriers to block destructive sorting, update doc + remove shell script --- .clang-format | 270 +------------------------------ DEVELOPERS.md | 8 - gemma.h | 4 + util/make_clang_format_config.sh | 4 - 4 files changed, 5 insertions(+), 281 deletions(-) delete mode 100755 util/make_clang_format_config.sh diff --git a/.clang-format b/.clang-format index 3465c13..f6cb8ad 100644 --- a/.clang-format +++ b/.clang-format @@ -1,269 +1 @@ ---- -Language: Cpp -AccessModifierOffset: -1 -AlignAfterOpenBracket: Align -AlignArrayOfStructures: None -AlignConsecutiveAssignments: - Enabled: false - AcrossEmptyLines: false - AcrossComments: false - AlignCompound: false - PadOperators: true -AlignConsecutiveBitFields: - Enabled: false - AcrossEmptyLines: false - AcrossComments: false - AlignCompound: false - PadOperators: false -AlignConsecutiveDeclarations: - Enabled: false - AcrossEmptyLines: false - AcrossComments: false - AlignCompound: false - PadOperators: false -AlignConsecutiveMacros: - Enabled: false - AcrossEmptyLines: false - AcrossComments: false - AlignCompound: false - PadOperators: false -AlignConsecutiveShortCaseStatements: - Enabled: false - AcrossEmptyLines: false - AcrossComments: false - AlignCaseColons: false -AlignEscapedNewlines: Left -AlignOperands: Align -AlignTrailingComments: - Kind: Always - OverEmptyLines: 0 -AllowAllArgumentsOnNextLine: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowShortBlocksOnASingleLine: Never -AllowShortCaseLabelsOnASingleLine: false -AllowShortEnumsOnASingleLine: true -AllowShortFunctionsOnASingleLine: All -AllowShortIfStatementsOnASingleLine: WithoutElse 
-AllowShortLambdasOnASingleLine: All -AllowShortLoopsOnASingleLine: true -AlwaysBreakAfterDefinitionReturnType: None -AlwaysBreakAfterReturnType: None -AlwaysBreakBeforeMultilineStrings: true -AlwaysBreakTemplateDeclarations: Yes -AttributeMacros: - - __capability -BinPackArguments: true -BinPackParameters: true -BitFieldColonSpacing: Both -BraceWrapping: - AfterCaseLabel: false - AfterClass: false - AfterControlStatement: Never - AfterEnum: false - AfterExternBlock: false - AfterFunction: false - AfterNamespace: false - AfterObjCDeclaration: false - AfterStruct: false - AfterUnion: false - BeforeCatch: false - BeforeElse: false - BeforeLambdaBody: false - BeforeWhile: false - IndentBraces: false - SplitEmptyFunction: true - SplitEmptyRecord: true - SplitEmptyNamespace: true -BreakAfterAttributes: Never -BreakAfterJavaFieldAnnotations: false -BreakArrays: true -BreakBeforeBinaryOperators: None -BreakBeforeConceptDeclarations: Always -BreakBeforeBraces: Attach -BreakBeforeInlineASMColon: OnlyMultiline -BreakBeforeTernaryOperators: true -BreakConstructorInitializers: BeforeColon -BreakInheritanceList: BeforeColon -BreakStringLiterals: true -ColumnLimit: 80 -CommentPragmas: '^ IWYU pragma:' -CompactNamespaces: false -ConstructorInitializerIndentWidth: 4 -ContinuationIndentWidth: 4 -Cpp11BracedListStyle: true -DerivePointerAlignment: true -DisableFormat: false -EmptyLineAfterAccessModifier: Never -EmptyLineBeforeAccessModifier: LogicalBlock -ExperimentalAutoDetectBinPacking: false -FixNamespaceComments: true -ForEachMacros: - - foreach - - Q_FOREACH - - BOOST_FOREACH -IfMacros: - - KJ_IF_MAYBE -IncludeBlocks: Regroup -IncludeCategories: - - Regex: '^' - Priority: 2 - SortPriority: 0 - CaseSensitive: false - - Regex: '^<.*\.h>' - Priority: 1 - SortPriority: 0 - CaseSensitive: false - - Regex: '^<.*' - Priority: 2 - SortPriority: 0 - CaseSensitive: false - - Regex: '.*' - Priority: 3 - SortPriority: 0 - CaseSensitive: false -IncludeIsMainRegex: '([-_](test|unittest))?$' 
-IncludeIsMainSourceRegex: '' -IndentAccessModifiers: false -IndentCaseBlocks: false -IndentCaseLabels: true -IndentExternBlock: AfterExternBlock -IndentGotoLabels: true -IndentPPDirectives: None -IndentRequiresClause: true -IndentWidth: 2 -IndentWrappedFunctionNames: false -InsertBraces: false -InsertNewlineAtEOF: false -InsertTrailingCommas: None -IntegerLiteralSeparator: - Binary: 0 - BinaryMinDigits: 0 - Decimal: 0 - DecimalMinDigits: 0 - Hex: 0 - HexMinDigits: 0 -JavaScriptQuotes: Leave -JavaScriptWrapImports: true -KeepEmptyLinesAtTheStartOfBlocks: false -KeepEmptyLinesAtEOF: false -LambdaBodyIndentation: Signature -LineEnding: DeriveLF -MacroBlockBegin: '' -MacroBlockEnd: '' -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None -ObjCBinPackProtocolList: Never -ObjCBlockIndentWidth: 2 -ObjCBreakBeforeNestedBlockParam: true -ObjCSpaceAfterProperty: false -ObjCSpaceBeforeProtocolList: true -PackConstructorInitializers: NextLine -PenaltyBreakAssignment: 2 -PenaltyBreakBeforeFirstCallParameter: 1 -PenaltyBreakComment: 300 -PenaltyBreakFirstLessLess: 120 -PenaltyBreakOpenParenthesis: 0 -PenaltyBreakString: 1000 -PenaltyBreakTemplateDeclaration: 10 -PenaltyExcessCharacter: 1000000 -PenaltyIndentedWhitespace: 0 -PenaltyReturnTypeOnItsOwnLine: 200 -PointerAlignment: Left -PPIndentWidth: -1 -QualifierAlignment: Leave -RawStringFormats: - - Language: Cpp - Delimiters: - - cc - - CC - - cpp - - Cpp - - CPP - - 'c++' - - 'C++' - CanonicalDelimiter: '' - BasedOnStyle: google - - Language: TextProto - Delimiters: - - pb - - PB - - proto - - PROTO - EnclosingFunctions: - - EqualsProto - - EquivToProto - - PARSE_PARTIAL_TEXT_PROTO - - PARSE_TEST_PROTO - - PARSE_TEXT_PROTO - - ParseTextOrDie - - ParseTextProtoOrDie - - ParseTestProto - - ParsePartialTestProto - CanonicalDelimiter: pb - BasedOnStyle: google -ReferenceAlignment: Pointer -ReflowComments: true -RemoveBracesLLVM: false -RemoveParentheses: Leave -RemoveSemicolon: false -RequiresClausePosition: OwnLine 
-RequiresExpressionIndentation: OuterScope -SeparateDefinitionBlocks: Leave -ShortNamespaceLines: 1 -SortIncludes: Never -SortJavaStaticImport: Before -SortUsingDeclarations: LexicographicNumeric -SpaceAfterCStyleCast: false -SpaceAfterLogicalNot: false -SpaceAfterTemplateKeyword: true -SpaceAroundPointerQualifiers: Default -SpaceBeforeAssignmentOperators: true -SpaceBeforeCaseColon: false -SpaceBeforeCpp11BracedList: false -SpaceBeforeCtorInitializerColon: true -SpaceBeforeInheritanceColon: true -SpaceBeforeJsonColon: false -SpaceBeforeParens: ControlStatements -SpaceBeforeParensOptions: - AfterControlStatements: true - AfterForeachMacros: true - AfterFunctionDefinitionName: false - AfterFunctionDeclarationName: false - AfterIfMacros: true - AfterOverloadedOperator: false - AfterRequiresInClause: false - AfterRequiresInExpression: false - BeforeNonEmptyParentheses: false -SpaceBeforeRangeBasedForLoopColon: true -SpaceBeforeSquareBrackets: false -SpaceInEmptyBlock: false -SpacesBeforeTrailingComments: 2 -SpacesInAngles: Never -SpacesInContainerLiterals: true -SpacesInLineCommentPrefix: - Minimum: 1 - Maximum: -1 -SpacesInParens: Never -SpacesInParensOptions: - InCStyleCasts: false - InConditionalStatements: false - InEmptyParentheses: false - Other: false -SpacesInSquareBrackets: false -Standard: Auto -StatementAttributeLikeMacros: - - Q_EMIT -StatementMacros: - - Q_UNUSED - - QT_REQUIRE_VERSION -TabWidth: 8 -UseTab: Never -VerilogBreakBetweenInstancePorts: true -WhitespaceSensitiveMacros: - - BOOST_PP_STRINGIZE - - CF_SWIFT_NAME - - NS_SWIFT_NAME - - PP_STRINGIZE - - STRINGIZE -... - +BasedOnStyle: Google diff --git a/DEVELOPERS.md b/DEVELOPERS.md index 8e09ee8..557670a 100644 --- a/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -83,14 +83,6 @@ A `.clang-format` configuration is provided with our defaults, please run source files through `clang-format` (or a formatter that produces equivalent behavior) before finalizing PR for submission. 
-The `.clang-format` is the google style (as of feb 27 2024), except with -`SortIncludes` set to `false` to avoid breaking copybara path substitutions -which rely on adjacent comments. - -For transparency, `.clang-format` can be reproduced using the -`make_clang_format_config.sh` script in `utils/` run with `clang-format` version -17.0.6. - ## Compile-Time Flags (Advanced) There are several compile-time flags to be aware of (note these may or may not diff --git a/gemma.h b/gemma.h index 2d5e713..1ff98c1 100644 --- a/gemma.h +++ b/gemma.h @@ -26,15 +26,19 @@ // copybara:import_next_line:gemma_cpp #include "compression/compress.h" // SfpStream/NuqStream +// copybara:end // copybara:import_next_line:gemma_cpp #include "configs.h" // kSeqLen +// copybara:end // copybara:import_next_line:gemma_cpp #include "util/args.h" // ArgsBase +// copybara:end #include "hwy/aligned_allocator.h" #include "hwy/base.h" // hwy::bfloat16_t #include "hwy/contrib/thread_pool/thread_pool.h" // copybara:import_next_line:sentencepiece #include "src/sentencepiece_processor.h" +// copybara:end namespace gcpp { diff --git a/util/make_clang_format_config.sh b/util/make_clang_format_config.sh deleted file mode 100755 index 5261e2c..0000000 --- a/util/make_clang_format_config.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -# Reproduces .clang-format file. 
-clang-format -style="{BasedOnStyle: Google, SortIncludes: false}" -dump-config > .clang-format From 060c8862ddb4e63a862ecae6e9ea8cbec91c7cec Mon Sep 17 00:00:00 2001 From: austinvhuang Date: Tue, 27 Feb 2024 21:36:43 -0500 Subject: [PATCH 23/26] whitespace cleanup --- DEVELOPERS.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/DEVELOPERS.md b/DEVELOPERS.md index 557670a..f670c49 100644 --- a/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -73,7 +73,7 @@ The implementation code is roughly split into 4 layers, from high to low level: Besides these layers, supporting utilities are: -- `compression/` - model compression operations. The 8-bit switched floating +- `compression/` - model compression operations. The 8-bit switched floating point model conversion is here. - `util/` - command line argument handling and any other utilities. @@ -85,17 +85,17 @@ before finalizing PR for submission. ## Compile-Time Flags (Advanced) -There are several compile-time flags to be aware of (note these may or may not +There are several compile-time flags to be aware of (note these may or may not be exposed to the build system): -- `GEMMA_WEIGHT_T` : Sets the level of compression for weights (surfaced as - WEIGHT_TYPE in CMakeLists.txt). Currently this should be set to `SfpStream` - (default, if no flag is specified) for 8-bit SFP, or `hwy::bfloat16_t` to +- `GEMMA_WEIGHT_T` : Sets the level of compression for weights (surfaced as + WEIGHT_TYPE in CMakeLists.txt). Currently this should be set to `SfpStream` + (default, if no flag is specified) for 8-bit SFP, or `hwy::bfloat16_t` to enable for higher-fidelity (but slower) bfloat16 support. This is defined in `gemma.h`. - `GEMMA_MAX_SEQ_LEN` : Sets maximum sequence length to preallocate for the KV Cache. The default is 4096 tokens but can be overridden. This is not exposed - through `CMakeLists.txt` yet. + through `CMakeLists.txt` yet. 
In the medium term both of these will likely be deprecated in favor of handling options at runtime - allowing for multiple weight compression schemes in a single From 272f17ddb3dfae0d45381262a44cd079564171d4 Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Wed, 28 Feb 2024 05:53:52 -0800 Subject: [PATCH 24/26] Warning fixes: unused member, cast, unused function PiperOrigin-RevId: 611074887 --- compression/distortion.h | 2 ++ gemma.cc | 20 +++++++++++--------- util/args.h | 2 +- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/compression/distortion.h b/compression/distortion.h index 8c0742a..5fd778f 100644 --- a/compression/distortion.h +++ b/compression/distortion.h @@ -25,6 +25,8 @@ namespace gcpp { class DistortionStats { public: void Notify(float original, float distorted) { + (void)padding_; // prevent unused member warning + const double l1 = hwy::ScalarAbs(original - distorted); if (l1 > max_l1_) { diff --git a/gemma.cc b/gemma.cc index 70777ac..4775f89 100644 --- a/gemma.cc +++ b/gemma.cc @@ -633,30 +633,32 @@ void ForEachTensor(const Weights* weights, c_weights.c_final_norm_scale); char name[16]; - for (size_t layer_idx = 0; layer_idx < TConfig::kLayers; ++layer_idx) { - Layer* layer = weights ? &weights->layers[layer_idx] : nullptr; - CompressedLayer* c_layer = c_weights.CLayer(layer_idx); + for (int layer_idx = 0; layer_idx < static_cast(TConfig::kLayers); + ++layer_idx) { + const size_t idx = static_cast(layer_idx); + Layer* layer = weights ? &weights->layers[idx] : nullptr; + CompressedLayer* c_layer = c_weights.CLayer(idx); - snprintf(name, sizeof(name), "pre_ff_ns_%lu", layer_idx); + snprintf(name, sizeof(name), "pre_ff_ns_%d", layer_idx); func(name, layer ? layer->pre_ffw_norm_scale.data() : nullptr, c_layer->c_pre_ffw_norm_scale); - snprintf(name, sizeof(name), "gating_ein_%lu", layer_idx); + snprintf(name, sizeof(name), "gating_ein_%d", layer_idx); func(name, layer ? 
layer->gating_einsum_w.data() : nullptr, c_layer->c_gating_einsum_w); - snprintf(name, sizeof(name), "linear_w_%lu", layer_idx); + snprintf(name, sizeof(name), "linear_w_%d", layer_idx); func(name, layer ? layer->linear_w.data() : nullptr, c_layer->c_linear_w); - snprintf(name, sizeof(name), "qkv_ein_%lu", layer_idx); + snprintf(name, sizeof(name), "qkv_ein_%d", layer_idx); func(name, layer ? layer->qkv_einsum_w.data() : nullptr, c_layer->c_qkv_einsum_w); - snprintf(name, sizeof(name), "att_ein_%lu", layer_idx); + snprintf(name, sizeof(name), "att_ein_%d", layer_idx); func(name, layer ? layer->attn_vec_einsum_w.data() : nullptr, c_layer->c_attn_vec_einsum_w); - snprintf(name, sizeof(name), "pre_att_ns_%lu", layer_idx); + snprintf(name, sizeof(name), "pre_att_ns_%d", layer_idx); func(name, layer ? layer->pre_attention_norm_scale.data() : nullptr, c_layer->c_pre_attention_norm_scale); } diff --git a/util/args.h b/util/args.h index ce03ef2..b9ab985 100644 --- a/util/args.h +++ b/util/args.h @@ -204,7 +204,7 @@ class ArgsBase { } }; -static bool HasHelp(int argc, char* argv[]) { +static inline HWY_MAYBE_UNUSED bool HasHelp(int argc, char* argv[]) { // TODO(austinvhuang): handle case insensitivity if (argc == 1) { // no arguments - print help From 0ea7b993def742f79c6f4b584ac4b3b127d8edd8 Mon Sep 17 00:00:00 2001 From: austinvhuang Date: Wed, 28 Feb 2024 15:18:40 -0500 Subject: [PATCH 25/26] remove --log fixing https://github.com/google/gemma.cpp/issues/59, improve command line args help, add copybara #include sort guards in more source files, add README sections on running faster and related projects --- README.md | 30 ++++++++++++++++++++++-- gemma.h | 17 +++++++------- run.cc | 69 ++++++++++++++++++++++++++++++++++++------------------ util/app.h | 19 ++++++++------- 4 files changed, 93 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index 8db6862..331d96f 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,7 @@ weights enable faster inference. 
In general, we recommend starting with the | `7b-pt` | 7 billion parameter pre-trained model, bfloat16 | | `7b-pt-sfp` | 7 billion parameter pre-trained model, 8-bit switched floating point | -> [!NOTE] +> [!NOTE] > **Important**: We strongly recommend starting off with the `2b-it-sfp` model to > get up and running. @@ -116,7 +116,7 @@ The build system uses [CMake](https://cmake.org/). To build the gemma inference runtime, create a build directory and generate the build files using `cmake` from the top-level project directory. Note if you previous ran `cmake` and are re-running with a different setting, be sure to clean out the `build/` directory -with `rm -rf build/*` (warning this will delete any other files in the `build/` +with `rm -rf build/*` (warning this will delete any other files in the `build/` directory. For the 8-bit switched floating point weights (sfp), run cmake with no options: @@ -242,6 +242,21 @@ We're working on a python script to convert a standard model format to `.sbs`, and hope have it available in the next week or so. Follow [this issue](https://github.com/google/gemma.cpp/issues/11) for updates. +**What are some easy ways to make the model run faster?** + +1. Make sure you are using the 8-bit switched floating point `-sfp` models. +2. If you're on a laptop, make sure power mode is set to maximize performance +and saving mode is **off**. For most laptops, the power saving modes get +activated automatically if the computer is not plugged in. +3. Close other unused cpu-intensive applications. +4. On macs, anecdotally we observe a "warm-up" ramp-up in speed as performance +cores get engaged. +5. Experiment with the `--num_threads` argument value. Depending on the device, +larger numbers don't always mean better performance. + +We're also working on algorithmic and optimization approaches for faster +inference, stay tuned. + ## Usage `gemma` has different usage modes, controlled by the verbosity flag. 
@@ -415,6 +430,17 @@ make -j [number of parallel threads to use] libgemma If this is successful, you should now have a `libgemma` library file in the `build/` directory. On Unix platforms, the filename is `libgemma.a`. +## Independent Projects Using gemma.cpp + +Some independent projects using gemma.cpp: + +- [gemma-cpp-python - Python bindings](https://github.com/namtranase/gemma-cpp-python) +- [lua-cgemma - Lua bindings](https://github.com/ufownl/lua-cgemma) +- [Godot engine demo project](https://github.com/Rliop913/Gemma-godot-demo-project) + +If you would like to have your project included, feel free to get in touch or +submit a PR with a `README.md` edit. + ## Acknowledgements and Contacts gemma.cpp was started in fall 2023 by [Austin Huang](mailto:austinvhuang@google.com) diff --git a/gemma.h b/gemma.h index 1ff98c1..7195bc9 100644 --- a/gemma.h +++ b/gemma.h @@ -122,21 +122,22 @@ struct LoaderArgs : public ArgsBase { template void ForEach(const Visitor& visitor) { visitor(tokenizer, "tokenizer", Path(), - "Path name of tokenizer model file. (required)"); + "Path name of tokenizer model file.\n Required argument."); visitor( cache, "compressed_weights", Path(), "Path name of compressed weights file, regenerated from `--weights` " "file if " - "the compressed weights file does not exist. (required)"); + "the compressed weights file does not exist.\n Required argument."); visitor(model_type, "model", std::string(), - "Model type - can be 2b-it (2B parameters, instruction-tuned), " - "2b-pt (2B parameters, pretrained), 7b-it (7B parameters, " - "instruction-tuned), or 7b-pt (7B parameters, pretrained). " - "(required)"); + "Model type\n 2b-it (2B parameters, instruction-tuned)\n " + "2b-pt (2B parameters, pretrained)\n 7b-it (7B parameters, " + "instruction-tuned)\n 7b-pt (7B parameters, pretrained)\n" + " Required argument."); visitor(model, "weights", Path(), "Path name of model weights (.sbs) file. 
Only required if " "compressed_weights file is not present and needs to be " - "regenerated. Otherwise, not needed"); + "regenerated. This parameter is only required for compressing " + "new model weight exports, otherwise it is not needed."); } }; @@ -192,7 +193,7 @@ struct InferenceArgs : public ArgsBase { "Make top-k sampling deterministic", 2); visitor(multiturn, "multiturn", false, "Multiturn mode (if 0, this clears the KV cache after every " - "interaction without quitting)\n Default = 0 (conversation " + "interaction without quitting)\n Default : 0 (conversation " "resets every turn)"); } }; diff --git a/run.cc b/run.cc index 2d9a15e..507979d 100644 --- a/run.cc +++ b/run.cc @@ -24,12 +24,16 @@ // copybara:import_next_line:gemma_cpp #include "compression/compress.h" +// copybara:end // copybara:import_next_line:gemma_cpp -#include "gemma.h" // Gemma +#include "gemma.h" // Gemma +// copybara:end // copybara:import_next_line:gemma_cpp #include "util/app.h" +// copybara:end // copybara:import_next_line:gemma_cpp #include "util/args.h" // HasHelp +// copybara:end #include "hwy/base.h" #include "hwy/contrib/thread_pool/thread_pool.h" #include "hwy/highway.h" @@ -39,20 +43,13 @@ namespace gcpp { -void ShowHelp(gcpp::LoaderArgs& loader, gcpp::InferenceArgs& inference, - gcpp::AppArgs& app) { - fprintf(stderr, - "\ngemma.cpp\n---------\n\nTo run gemma.cpp, you need to " - "specify 3 required model loading arguments: --tokenizer, " - "--compressed_weights, " - "and --model.\n\nModel Loading Arguments\n\n"); - loader.Help(); - fprintf(stderr, "\nInference Arguments\n\n"); - inference.Help(); - fprintf(stderr, "\nApplication Arguments\n\n"); - app.Help(); - fprintf(stderr, "\n\n"); -} +static constexpr std::string_view kAsciiArtBanner = + " __ _ ___ _ __ ___ _ __ ___ __ _ ___ _ __ _ __\n" + " / _` |/ _ \ '_ ` _ \| '_ ` _ \ / _` | / __| '_ \| '_ \\n" + "| (_| | __/ | | | | | | | | | | (_| || (__| |_) | |_) |\n" + " \__, |\___|_| |_| |_|_| |_| |_|\__,_(_)___| 
.__/| .__/\n" + " __/ | | | | |\n" + " |___/ |_| |_|"; void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) { loader.Print(app.verbosity); @@ -69,7 +66,8 @@ void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) { << std::thread::hardware_concurrency() << std::endl << "Instruction set : " << hwy::TargetName(hwy::DispatchedTarget()) << " (" - << hwy::VectorBytes() * 8 << " bits)" << "\n" + << hwy::VectorBytes() * 8 << " bits)" + << "\n" << "Weight Type : " << gcpp::TypeName(gcpp::WeightT()) << "\n" << "EmbedderInput Type : " @@ -77,11 +75,31 @@ void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) { } } +void ShowHelp(gcpp::LoaderArgs& loader, gcpp::InferenceArgs& inference, + gcpp::AppArgs& app) { + std::cerr + << kAsciiArtBanner + << "\n\ngemma.cpp : a lightweight, standalone C++ inference engine\n" + "==========================================================\n\n" + "To run gemma.cpp, you need to " + "specify 3 required model loading arguments:\n --tokenizer\n " + "--compressed_weights\n" + " --model.\n"; + std::cerr << "\n*Example Usage*\n\n./gemma --tokenizer tokenizer.spm " + "--compressed_weights 2b-it-sfp.sbs --model 2b-it\n"; + std::cerr << "\n*Model Loading Arguments*\n\n"; + loader.Help(); + std::cerr << "\n*Inference Arguments*\n\n"; + inference.Help(); + std::cerr << "\n*Application Arguments*\n\n"; + app.Help(); + std::cerr << "\n"; +} + void ReplGemma(gcpp::Gemma& model, hwy::ThreadPool& pool, hwy::ThreadPool& inner_pool, const InferenceArgs& args, int verbosity, const gcpp::AcceptFunc& accept_token, - std::string &eot_line -) { + std::string& eot_line) { PROFILER_ZONE("Gen.misc"); int abs_pos = 0; // absolute token index over all turns int current_pos = 0; // token index within the current turn @@ -234,8 +252,12 @@ void Run(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) { const std::string instructions = "*Usage*\n" - " Enter an instruction and press enter (%C reset 
conversation, " - "%Q quits).\n\n" + " Enter an instruction and press enter (%C resets conversation, " + "%Q quits).\n" + + (inference.multiturn == 0 + ? std::string(" Since multiturn is set to 0, conversation will " + "automatically reset every turn.\n\n") + : "\n") + "*Examples*\n" " - Write an email to grandma thanking her for the cookies.\n" " - What are some historical attractions to visit around " @@ -244,13 +266,14 @@ void Run(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) { " - Write a standup comedy bit about GPU programming.\n"; std::cout << "\033[2J\033[1;1H" // clear screen - << banner_ascii_art << "\n\n"; + << kAsciiArtBanner << "\n\n"; ShowConfig(loader, inference, app); std::cout << "\n" << instructions << "\n"; } - ReplGemma(model, pool, inner_pool, inference, app.verbosity, - /*accept_token=*/[](int) { return true; }, app.eot_line); + ReplGemma( + model, pool, inner_pool, inference, app.verbosity, + /*accept_token=*/[](int) { return true; }, app.eot_line); } } // namespace gcpp diff --git a/util/app.h b/util/app.h index f66a6cd..7f926a5 100644 --- a/util/app.h +++ b/util/app.h @@ -31,6 +31,7 @@ // copybara:import_next_line:gemma_cpp #include "util/args.h" +// copybara:end #include "hwy/base.h" // HWY_ASSERT namespace gcpp { @@ -77,7 +78,6 @@ class AppArgs : public ArgsBase { template void ForEach(const Visitor& visitor) { - visitor(log, "log", Path{"/tmp/log.txt"}, "Logging file", 2); visitor(verbosity, "verbosity", 1, "Show verbose developer information\n 0 = only print generation " "output\n 1 = standard user-facing terminal ui\n 2 = show " @@ -85,15 +85,16 @@ class AppArgs : public ArgsBase { 2); visitor(num_threads, "num_threads", kDefaultNumThreads, // see ChooseNumThreads - "Number of threads to use. Default value is set based on an " - "estimate of " - "how many concurrent threads are supported.", - 2); - visitor(eot_line, "eot_line", std::string(""), - "End of turn line. 
" - "When you specify this, the prompt will be all lines " - "before the line where only the given string appears.", + "Number of threads to use.\n Default = Estimate of the " + "number of suupported concurrent threads.", 2); + visitor( + eot_line, "eot_line", std::string(""), + "End of turn line. " + "When you specify this, the prompt will be all lines " + "before the line where only the given string appears.\n Default = " + "When a newline is encountered, that signals the end of the turn.", + 2); } }; From b6aaf6bbb8a7f0b99330df8710765123ab330766 Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Wed, 28 Feb 2024 15:29:45 -0800 Subject: [PATCH 26/26] Fix for Android's 32-bit off_t. Fixes #62 PiperOrigin-RevId: 611249534 --- compression/blob_store.cc | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/compression/blob_store.cc b/compression/blob_store.cc index 550c727..e088fc6 100644 --- a/compression/blob_store.cc +++ b/compression/blob_store.cc @@ -13,6 +13,19 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Request POSIX 2008, including `pread()` and `posix_fadvise()`. +#if !defined(_XOPEN_SOURCE) || _XOPEN_SOURCE < 700 +#undef _XOPEN_SOURCE +#define _XOPEN_SOURCE 700 +#endif +#if !defined(_POSIX_C_SOURCE) || _POSIX_C_SOURCE < 200809 +#define _POSIX_C_SOURCE 200809 +#endif + +// Make `off_t` 64-bit even on 32-bit systems. Works for Android >= r15c. 
+#undef _FILE_OFFSET_BITS +#define _FILE_OFFSET_BITS 64 + // copybara:import_next_line:gemma_cpp #include "compression/blob_store.h" @@ -81,7 +94,7 @@ static int64_t pwrite(int fd, const void* buf, uint64_t size, uint64_t offset) { } #endif -} +} // namespace namespace gcpp { @@ -133,6 +146,7 @@ struct IO { return 0; } #else + static_assert(sizeof(off_t) == 8, "64-bit off_t required"); const off_t size = lseek(fd, 0, SEEK_END); HWY_ASSERT(close(fd) != -1); if (size == static_cast(-1)) { @@ -318,7 +332,8 @@ class BlobStore { BlobError BlobReader::Open(const char* filename) { #if HWY_OS_WIN DWORD flags = FILE_ATTRIBUTE_NORMAL | FILE_FLAG_SEQUENTIAL_SCAN; - HANDLE file = CreateFileA(filename, GENERIC_READ, FILE_SHARE_READ, nullptr, OPEN_EXISTING, flags, nullptr); + HANDLE file = CreateFileA(filename, GENERIC_READ, FILE_SHARE_READ, nullptr, + OPEN_EXISTING, flags, nullptr); if (file == INVALID_HANDLE_VALUE) return __LINE__; fd_ = _open_osfhandle(reinterpret_cast(file), _O_RDONLY); #else @@ -326,7 +341,7 @@ BlobError BlobReader::Open(const char* filename) { #endif if (fd_ < 0) return __LINE__; -#if _POSIX_C_SOURCE >= 200112L +#if HWY_OS_LINUX // Doubles the readahead window, which seems slightly faster when cached. (void)posix_fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL); #endif @@ -403,7 +418,8 @@ BlobError BlobWriter::WriteAll(hwy::ThreadPool& pool, // Create/replace existing file. #if HWY_OS_WIN DWORD flags = FILE_ATTRIBUTE_NORMAL; - HANDLE file = CreateFileA(filename, GENERIC_WRITE, 0, nullptr, CREATE_ALWAYS, flags, nullptr); + HANDLE file = CreateFileA(filename, GENERIC_WRITE, 0, nullptr, CREATE_ALWAYS, + flags, nullptr); if (file == INVALID_HANDLE_VALUE) return __LINE__; const int fd = _open_osfhandle(reinterpret_cast(file), _O_WRONLY); #else