Add C API and C# interop files

This change adds a basic C API that allows access to Gemma functionality from other programming languages. The functionality is exposed via a shared library (DLL on Windows), with C++ interfaces and a basic C# interop wrapper included.

To build the DLL, use the `windows-dll` preset, which includes the C and C++ sources as follows:
```
cmake --preset windows-dll
cmake --build --config Release --preset windows-dll -j 4
```
This should generate a `gemma.dll` in `<build-dir>/Release`.

To build for a non-Windows OS, equivalent shared-library build configuration (linker flags and symbol visibility) will need to be added to produce the target platform's shared-library format (e.g. `.so` or `.dylib`).

PiperOrigin-RevId: 750246272
This commit is contained in:
The gemma.cpp Authors 2025-04-22 10:35:12 -07:00 committed by Copybara-Service
parent f20da328de
commit ba10c88a94
11 changed files with 1327 additions and 8 deletions

View File

@ -428,6 +428,40 @@ cc_library(
], ],
) )
# Shared-library flavor of the Gemma API: bundles the C bindings
# (c_api / context) that expose Gemma functionality to other languages
# (see the C# wrapper in GemmaInterop.cs).
cc_library(
name = "gemma_shared_lib",
srcs = [
"gemma/bindings/c_api.cc",
"gemma/bindings/context.cc",
],
hdrs = [
"gemma/bindings/c_api.h",
"gemma/bindings/context.h",
],
exec_properties = {
# Avoid linker OOMs when building with sanitizer instrumentation.
"mem": "28g",
},
deps = [
":allocator",
":basics",
":benchmark_helper",
":common",
":gemma_args",
":gemma_lib",
":kv_cache",
":mat",
":ops",
":threading",
":threading_context",
":tokenizer",
":weights",
"//compression:shared",
"//paligemma:image",
"@highway//:hwy",
],
)
cc_library( cc_library(
name = "cross_entropy", name = "cross_entropy",
srcs = ["evals/cross_entropy.cc"], srcs = ["evals/cross_entropy.cc"],
@ -465,6 +499,7 @@ cc_library(
":gemma_lib", ":gemma_lib",
":ops", ":ops",
":threading_context", ":threading_context",
":tokenizer",
"@google_benchmark//:benchmark", "@google_benchmark//:benchmark",
"//compression:compress", "//compression:compress",
"@highway//:hwy", "@highway//:hwy",
@ -522,6 +557,7 @@ cc_binary(
":gemma_lib", ":gemma_lib",
":ops", ":ops",
":threading_context", ":threading_context",
":tokenizer",
"//compression:shared", "//compression:shared",
"//paligemma:image", "//paligemma:image",
"@highway//:hwy", "@highway//:hwy",

View File

@ -39,6 +39,7 @@ set(BENCHMARK_ENABLE_GTEST_TESTS OFF)
FetchContent_Declare(benchmark GIT_REPOSITORY https://github.com/google/benchmark.git GIT_TAG v1.8.2 EXCLUDE_FROM_ALL) FetchContent_Declare(benchmark GIT_REPOSITORY https://github.com/google/benchmark.git GIT_TAG v1.8.2 EXCLUDE_FROM_ALL)
FetchContent_MakeAvailable(benchmark) FetchContent_MakeAvailable(benchmark)
# Base source files
set(SOURCES set(SOURCES
compression/blob_store.cc compression/blob_store.cc
compression/blob_store.h compression/blob_store.h
@ -115,6 +116,17 @@ set(SOURCES
util/topology.h util/topology.h
) )
# The C API binding sources are compiled only into the DLL build; the static
# library / executable builds do not need them.
if(BUILD_GEMMA_DLL)
  foreach(binding_file IN ITEMS context.h context.cc c_api.h c_api.cc)
    list(APPEND SOURCES gemma/bindings/${binding_file})
  endforeach()
  message(STATUS "Including C API files for DLL build")
endif()
if(NOT CMAKE_BUILD_TYPE) if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "Release") set(CMAKE_BUILD_TYPE "Release")
endif() endif()
@ -134,6 +146,33 @@ target_compile_definitions(libgemma PRIVATE $<$<PLATFORM_ID:Windows>:_CRT_SECURE
target_compile_options(libgemma PRIVATE $<$<PLATFORM_ID:Windows>:-Wno-deprecated-declarations>) target_compile_options(libgemma PRIVATE $<$<PLATFORM_ID:Windows>:-Wno-deprecated-declarations>)
install(TARGETS libgemma DESTINATION lib) install(TARGETS libgemma DESTINATION lib)
# Shared library target for C# interop (gemma.dll / gemma.so).
# Only built when BUILD_GEMMA_DLL is ON (e.g. via the windows-dll preset);
# ${SOURCES} then also contains gemma/bindings/{context,c_api}.{h,cc}.
if(BUILD_GEMMA_DLL)
  add_library(gemma_shared SHARED ${SOURCES})
  # Empty PREFIX so the artifact is "gemma.dll"/"gemma.so", matching the
  # module name used by the C# [DllImport("gemma")] declarations.
  set_target_properties(gemma_shared PROPERTIES
    PREFIX ""
    OUTPUT_NAME "gemma"
    CXX_STANDARD 17
    CXX_STANDARD_REQUIRED ON
    POSITION_INDEPENDENT_CODE ON
  )
  # Consumers need the project root (for "gemma/..." headers) and the
  # sentencepiece sources fetched via FetchContent.
  target_include_directories(gemma_shared PUBLIC
    ${CMAKE_CURRENT_SOURCE_DIR}
    ${sentencepiece_SOURCE_DIR}
  )
  # Whole-archive linking keeps all symbols from these static libraries in
  # the shared library instead of letting the linker drop unreferenced ones.
  target_link_libraries(gemma_shared PRIVATE
    $<LINK_LIBRARY:WHOLE_ARCHIVE,hwy>
    $<LINK_LIBRARY:WHOLE_ARCHIVE,hwy_contrib>
    $<LINK_LIBRARY:WHOLE_ARCHIVE,sentencepiece-static>
  )
  # GEMMA_EXPORTS switches GEMMA_API to __declspec(dllexport); see
  # gemma/bindings/c_api.h.
  target_compile_definitions(gemma_shared
    PRIVATE
    GEMMA_EXPORTS
    $<$<PLATFORM_ID:Windows>:_CRT_SECURE_NO_WARNINGS NOMINMAX>
  )
  target_compile_options(gemma_shared PRIVATE $<$<PLATFORM_ID:Windows>:-Wno-deprecated-declarations>)
  install(TARGETS gemma_shared DESTINATION lib)
  # Fix: the C API header lives under gemma/bindings/, not gemma/ (it would
  # previously fail to install because gemma/c_api.h does not exist).
  install(FILES gemma/bindings/c_api.h DESTINATION include/gemma)
  # NOTE(review): assuming the C# wrapper also lives under gemma/bindings/ --
  # confirm the actual location of GemmaInterop.cs in the tree.
  install(FILES gemma/bindings/GemmaInterop.cs DESTINATION include/gemma)
endif()
# Executable Target # Executable Target
add_executable(gemma gemma/run.cc) add_executable(gemma gemma/run.cc)

View File

@ -31,6 +31,24 @@
"lhs": "${hostSystemName}", "lhs": "${hostSystemName}",
"rhs": "Windows" "rhs": "Windows"
} }
},
{
"name": "windows-dll",
"inherits": "__defaults__",
"displayName": "Windows DLL",
"description": "Visual Studio 2022 with Clang/LLVM frontend (DLL build)",
"generator": "Visual Studio 17 2022",
"toolset": "ClangCL",
"condition": {
"type": "equals",
"lhs": "${hostSystemName}",
"rhs": "Windows"
},
"cacheVariables": {
"BUILD_SHARED_LIBS": "OFF",
"CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS": "ON",
"BUILD_GEMMA_DLL": "ON"
}
} }
], ],
"buildPresets": [ "buildPresets": [
@ -54,6 +72,15 @@
"displayName": "Windows", "displayName": "Windows",
"configuration": "Release", "configuration": "Release",
"configurePreset": "windows" "configurePreset": "windows"
},
{
"name": "windows-dll",
"displayName": "Windows DLL",
"configuration": "Release",
"configurePreset": "windows-dll",
"targets": [
"gemma_shared"
]
} }
] ]
} }

View File

@ -25,6 +25,7 @@
#include "gemma/gemma.h" #include "gemma/gemma.h"
#include "gemma/gemma_args.h" #include "gemma/gemma_args.h"
#include "gemma/tokenizer.h" // WrapAndTokenize
#include "ops/matmul.h" #include "ops/matmul.h"
#include "util/threading_context.h" #include "util/threading_context.h"
#include "hwy/base.h" #include "hwy/base.h"
@ -54,8 +55,9 @@ class GemmaEnv {
size_t MaxGeneratedTokens() const { size_t MaxGeneratedTokens() const {
return runtime_config_.max_generated_tokens; return runtime_config_.max_generated_tokens;
} }
void SetMaxGeneratedTokens(size_t max_generated_tokens) { void SetMaxGeneratedTokens(int max_generated_tokens) {
runtime_config_.max_generated_tokens = max_generated_tokens; runtime_config_.max_generated_tokens =
static_cast<size_t>(max_generated_tokens);
} }
std::vector<int> Tokenize(const std::string& input) const { std::vector<int> Tokenize(const std::string& input) const {

View File

@ -0,0 +1,426 @@
using System;
using System.Diagnostics;
using System.Runtime.InteropServices;
using System.Text;
namespace GemmaCpp
{
/// <summary>Thrown when a native gemma.dll call fails or the context is invalid.</summary>
public class GemmaException : Exception
{
public GemmaException(string message) : base(message) { }
}
/// <summary>
/// Managed wrapper around the native gemma shared library's C API. Owns an
/// opaque native GemmaContext handle and releases it via Dispose/finalizer.
/// NOTE(review): not safe for concurrent calls on one instance --
/// _callbackHandle is shared by Generate and GenerateMultimodal.
/// </summary>
public class Gemma : IDisposable
{
// Opaque native handle from GemmaCreate; IntPtr.Zero once destroyed.
private IntPtr _context;
private bool _disposed;
// Optional: Allow setting DLL path.
// NOTE(review): must be assigned before the first use of this class -- the
// static constructor below reads it when the type is initialized.
public static string DllPath { get; set; } = "gemma.dll";
[DllImport("kernel32.dll", CharSet = CharSet.Unicode, SetLastError = true)]
private static extern IntPtr LoadLibrary(string lpFileName);
static Gemma()
{
// Load DLL from specified path so later [DllImport("gemma")] binds to it.
if (LoadLibrary(DllPath) == IntPtr.Zero)
{
throw new DllNotFoundException($"Failed to load {DllPath}. Error: {Marshal.GetLastWin32Error()}");
}
}
// Creates a native context; returns IntPtr.Zero on failure (see c_api.cc).
[DllImport("gemma", CallingConvention = CallingConvention.Cdecl)]
private static extern IntPtr GemmaCreate(
[MarshalAs(UnmanagedType.LPUTF8Str)] string tokenizerPath,
[MarshalAs(UnmanagedType.LPUTF8Str)] string modelType,
[MarshalAs(UnmanagedType.LPUTF8Str)] string weightsPath,
[MarshalAs(UnmanagedType.LPUTF8Str)] string weightType,
int maxLength);
[DllImport("gemma", CallingConvention = CallingConvention.Cdecl)]
private static extern void GemmaDestroy(IntPtr context);
// Delegate type for token callbacks; return false to stop generation.
public delegate bool TokenCallback(string token);
// Keep delegate alive for duration of calls (prevents GC of the thunk
// while native code may still invoke it).
private GCHandle _callbackHandle;
[UnmanagedFunctionPointer(CallingConvention.Cdecl)]
private delegate bool GemmaTokenCallback(
[MarshalAs(UnmanagedType.LPUTF8Str)] string text,
IntPtr userData);
// Returns the generated text length in bytes, or negative on error.
[DllImport("gemma", CallingConvention = CallingConvention.Cdecl)]
private static extern int GemmaGenerate(
IntPtr context,
[MarshalAs(UnmanagedType.LPUTF8Str)] string prompt,
[Out] byte[] output,
int maxLength,
GemmaTokenCallback callback,
IntPtr userData);
// NOTE(review): output is marshaled as a StringBuilder here while
// GemmaGenerate uses byte[]; StringBuilder capacity is in chars but the
// native side receives max_length as a byte budget -- verify multi-byte
// UTF-8 output cannot overflow the buffer.
[DllImport("gemma", CallingConvention = CallingConvention.Cdecl)]
private static extern int GemmaGenerateMultimodal(
IntPtr context,
[MarshalAs(UnmanagedType.LPUTF8Str)] string prompt,
IntPtr image_data, // Renamed param to match C API
int image_width, // Added dimension
int image_height, // Added dimension
[MarshalAs(UnmanagedType.LPUTF8Str)] StringBuilder output, // Output should be StringBuilder for multimodal
int maxLength,
GemmaTokenCallback callback,
IntPtr userData);
[DllImport("gemma", CallingConvention = CallingConvention.Cdecl)]
private static extern int GemmaCountTokens(
IntPtr context,
[MarshalAs(UnmanagedType.LPUTF8Str)] string text);
// Configuration function imports
// NOTE(review): GemmaSetMaxGeneratedTokens and GemmaSetPrefillTbatchSize are
// imported but no public wrapper method exposes them below.
[DllImport("gemma", CallingConvention = CallingConvention.Cdecl)]
private static extern void GemmaSetMaxGeneratedTokens(IntPtr context, int value);
[DllImport("gemma", CallingConvention = CallingConvention.Cdecl)]
private static extern void GemmaSetMultiturn(IntPtr context, int value);
[DllImport("gemma", CallingConvention = CallingConvention.Cdecl)]
private static extern void GemmaSetTemperature(IntPtr context, float value);
[DllImport("gemma", CallingConvention = CallingConvention.Cdecl)]
private static extern void GemmaSetTopK(IntPtr context, int value);
[DllImport("gemma", CallingConvention = CallingConvention.Cdecl)]
private static extern void GemmaSetDeterministic(IntPtr context, int value);
[DllImport("gemma", CallingConvention = CallingConvention.Cdecl)]
private static extern void GemmaSetPrefillTbatchSize(IntPtr context, int value);
[DllImport("gemma", CallingConvention = CallingConvention.Cdecl, EntryPoint = "GemmaResetConversation")]
private static extern void GemmaResetConversation(IntPtr context);
// Conversation management function imports (each returns 1 on success, 0 on failure)
[DllImport("gemma", CallingConvention = CallingConvention.Cdecl, EntryPoint = "GemmaCreateConversation")]
private static extern int GemmaCreateConversation(
IntPtr context,
[MarshalAs(UnmanagedType.LPUTF8Str)] string conversationName);
[DllImport("gemma", CallingConvention = CallingConvention.Cdecl, EntryPoint = "GemmaSwitchConversation")]
private static extern int GemmaSwitchConversation(
IntPtr context,
[MarshalAs(UnmanagedType.LPUTF8Str)] string conversationName);
[DllImport("gemma", CallingConvention = CallingConvention.Cdecl, EntryPoint = "GemmaDeleteConversation")]
private static extern int GemmaDeleteConversation(
IntPtr context,
[MarshalAs(UnmanagedType.LPUTF8Str)] string conversationName);
[DllImport("gemma", CallingConvention = CallingConvention.Cdecl, EntryPoint = "GemmaHasConversation")]
private static extern int GemmaHasConversation(
IntPtr context,
[MarshalAs(UnmanagedType.LPUTF8Str)] string conversationName);
// Native callback delegate type for library debug logging
[UnmanagedFunctionPointer(CallingConvention.Cdecl)]
private delegate void GemmaLogCallback(
[MarshalAs(UnmanagedType.LPUTF8Str)] string message,
IntPtr userData);
[DllImport("gemma", CallingConvention = CallingConvention.Cdecl)]
private static extern void GemmaSetLogCallback(
IntPtr context,
GemmaLogCallback callback,
IntPtr userData);
// Keeps the logging delegate alive while registered with native code.
private GCHandle _logCallbackHandle;
private bool _loggingEnabled = false;
/// <summary>
/// Creates a Gemma instance backed by a native context.
/// Throws GemmaException if the native context cannot be created.
/// </summary>
public Gemma(string tokenizerPath, string modelType, string weightsPath, string weightType, int maxLength = 8192)
{
_context = GemmaCreate(tokenizerPath, modelType, weightsPath, weightType, maxLength);
if (_context == IntPtr.Zero)
{
throw new GemmaException("Failed to create Gemma context");
}
}
/// <summary>Enables or disables forwarding native debug logs to Debug.WriteLine.</summary>
public void EnableLogging(bool enable = true)
{
if (enable && !_loggingEnabled)
{
GemmaLogCallback logCallback = (message, _) =>
{
Debug.WriteLine($"Gemma: {message}");
};
// Pin the delegate so the native side can call it after this method returns.
_logCallbackHandle = GCHandle.Alloc(logCallback);
GemmaSetLogCallback(_context, logCallback, IntPtr.Zero);
_loggingEnabled = true;
}
else if (!enable && _loggingEnabled)
{
if (_logCallbackHandle.IsAllocated)
_logCallbackHandle.Free();
GemmaSetLogCallback(_context, null, IntPtr.Zero);
_loggingEnabled = false;
}
}
// Configuration methods
/// <summary>Enables/disables multiturn mode (keeping KV-cache state between turns).</summary>
public void SetMultiturn(bool enable)
{
if (_disposed)
throw new ObjectDisposedException(nameof(Gemma));
if (_context == IntPtr.Zero)
throw new GemmaException("Gemma context is invalid");
GemmaSetMultiturn(_context, enable ? 1 : 0);
Debug.WriteLine($"Gemma: Set multiturn to {(enable ? "enabled" : "disabled")}");
}
/// <summary>Sets the sampling temperature.</summary>
public void SetTemperature(float temperature)
{
if (_disposed)
throw new ObjectDisposedException(nameof(Gemma));
if (_context == IntPtr.Zero)
throw new GemmaException("Gemma context is invalid");
GemmaSetTemperature(_context, temperature);
Debug.WriteLine($"Gemma: Set temperature to {temperature}");
}
/// <summary>Sets the top-k sampling parameter.</summary>
public void SetTopK(int topK)
{
if (_disposed)
throw new ObjectDisposedException(nameof(Gemma));
if (_context == IntPtr.Zero)
throw new GemmaException("Gemma context is invalid");
GemmaSetTopK(_context, topK);
Debug.WriteLine($"Gemma: Set topK to {topK}");
}
/// <summary>Enables/disables deterministic sampling.</summary>
public void SetDeterministic(bool deterministic)
{
if (_disposed)
throw new ObjectDisposedException(nameof(Gemma));
if (_context == IntPtr.Zero)
throw new GemmaException("Gemma context is invalid");
GemmaSetDeterministic(_context, deterministic ? 1 : 0);
Debug.WriteLine($"Gemma: Set deterministic to {(deterministic ? "true" : "false")}");
}
/// <summary>Resets the currently-active conversation's state.</summary>
public void ResetConversation()
{
if (_disposed)
throw new ObjectDisposedException(nameof(Gemma));
if (_context == IntPtr.Zero)
throw new GemmaException("Gemma context is invalid");
GemmaResetConversation(_context); // Call P/Invoke method
Debug.WriteLine("Gemma: Reset active conversation");
}
// Conversation management methods
/// <summary>Creates a named conversation; returns false if the native call fails.</summary>
public bool CreateConversation(string conversationName)
{
if (_disposed)
throw new ObjectDisposedException(nameof(Gemma));
if (_context == IntPtr.Zero)
throw new GemmaException("Gemma context is invalid");
bool result = GemmaCreateConversation(_context, conversationName) != 0; // Call P/Invoke method
Debug.WriteLine($"Gemma: Create conversation '{conversationName}' - {(result ? "succeeded" : "failed")}");
return result;
}
/// <summary>Switches the active conversation; returns false if the native call fails.</summary>
public bool SwitchConversation(string conversationName)
{
if (_disposed)
throw new ObjectDisposedException(nameof(Gemma));
if (_context == IntPtr.Zero)
throw new GemmaException("Gemma context is invalid");
bool result = GemmaSwitchConversation(_context, conversationName) != 0; // Call P/Invoke method
Debug.WriteLine($"Gemma: Switch to conversation '{conversationName}' - {(result ? "succeeded" : "failed")}");
return result;
}
/// <summary>Deletes a named conversation; returns false if the native call fails.</summary>
public bool DeleteConversation(string conversationName)
{
if (_disposed)
throw new ObjectDisposedException(nameof(Gemma));
if (_context == IntPtr.Zero)
throw new GemmaException("Gemma context is invalid");
bool result = GemmaDeleteConversation(_context, conversationName) != 0; // Call P/Invoke method
Debug.WriteLine($"Gemma: Delete conversation '{conversationName}' - {(result ? "succeeded" : "failed")}");
return result;
}
/// <summary>Returns true if a conversation with the given name exists.</summary>
public bool HasConversation(string conversationName)
{
if (_disposed)
throw new ObjectDisposedException(nameof(Gemma));
if (_context == IntPtr.Zero)
throw new GemmaException("Gemma context is invalid");
bool result = GemmaHasConversation(_context, conversationName) != 0; // Call P/Invoke method
Debug.WriteLine($"Gemma: Has conversation '{conversationName}' - {result}");
return result;
}
/// <summary>Tokenizes the prompt and returns the token count (negative on native error).</summary>
public int CountTokens(string prompt)
{
if (_disposed)
throw new ObjectDisposedException(nameof(Gemma));
if (_context == IntPtr.Zero)
throw new GemmaException("Gemma context is invalid");
int count = GemmaCountTokens(_context, prompt);
return count;
}
/// <summary>Generates a response to the prompt without per-token streaming.</summary>
public string Generate(string prompt, int maxLength = 4096)
{
return Generate(prompt, null, maxLength);
}
/// <summary>
/// Generates a response, optionally streaming tokens to <paramref name="callback"/>.
/// The callback may return false to stop generation early.
/// </summary>
public string Generate(string prompt, TokenCallback callback, int maxLength = 4096)
{
if (_disposed)
throw new ObjectDisposedException(nameof(Gemma));
if (_context == IntPtr.Zero)
throw new GemmaException("Gemma context is invalid");
var outputBuffer = new byte[maxLength * 4]; // Allow for worst case UTF-8 size
GemmaTokenCallback nativeCallback = null;
// Track token count for debugging
int tokenCount = 0;
if (callback != null)
{
nativeCallback = (text, _) =>
{
tokenCount++;
// Log token for debugging
Debug.WriteLine($"Token {tokenCount}: '{text}'");
// Pass token to user callback
return callback(text);
};
_callbackHandle = GCHandle.Alloc(nativeCallback);
}
try
{
int length = GemmaGenerate(_context, prompt, outputBuffer, maxLength,
nativeCallback, IntPtr.Zero);
if (length < 0)
throw new GemmaException("Generation failed");
Debug.WriteLine($"Generation complete: {tokenCount} tokens processed, result length: {length}");
// Convert the byte buffer to a string using UTF-8 encoding
string result = Encoding.UTF8.GetString(outputBuffer, 0, length);
return result;
}
finally
{
if (_callbackHandle.IsAllocated)
_callbackHandle.Free();
}
}
/// <summary>Generates a response to a prompt plus an RGB float image (no streaming).</summary>
public string GenerateMultimodal(string prompt, float[] imageData, int imageWidth, int imageHeight, int maxLength = 4096)
{
// Pass width and height to the overloaded method
return GenerateMultimodal(prompt, imageData, imageWidth, imageHeight, null, maxLength);
}
/// <summary>
/// Generates a response to a prompt plus an RGB float image, optionally
/// streaming tokens to <paramref name="callback"/>. imageData must hold at
/// least width*height*3 floats.
/// </summary>
public string GenerateMultimodal(string prompt, float[] imageData, int imageWidth, int imageHeight, TokenCallback callback, int maxLength = 4096)
{
if (_disposed)
throw new ObjectDisposedException(nameof(Gemma));
if (_context == IntPtr.Zero)
throw new GemmaException("Gemma context is invalid");
if (imageData == null || imageData.Length == 0)
throw new ArgumentException("Image data cannot be null or empty", nameof(imageData));
if (imageWidth <= 0 || imageHeight <= 0)
throw new ArgumentException("Image dimensions must be positive");
if (imageData.Length < imageWidth * imageHeight * 3)
throw new ArgumentException("Image data array is too small for the specified dimensions");
var output = new StringBuilder(maxLength);
GemmaTokenCallback nativeCallback = null;
if (callback != null)
{
nativeCallback = (text, _) => callback(text);
_callbackHandle = GCHandle.Alloc(nativeCallback);
}
// Pin the image data so it doesn't move during the native call
GCHandle imageHandle = GCHandle.Alloc(imageData, GCHandleType.Pinned);
try
{
IntPtr imagePtr = imageHandle.AddrOfPinnedObject();
// Pass image dimensions to the native call
int length = GemmaGenerateMultimodal(_context, prompt, imagePtr, imageWidth, imageHeight, output, maxLength,
nativeCallback, IntPtr.Zero);
if (length < 0)
throw new GemmaException("Multimodal generation failed");
return output.ToString();
}
finally
{
imageHandle.Free();
if (_callbackHandle.IsAllocated)
_callbackHandle.Free();
}
}
/// <summary>
/// Releases the native context and any registered log callback handle.
/// NOTE(review): standard Dispose pattern would also call
/// GC.SuppressFinalize(this) here -- consider adding it.
/// </summary>
public void Dispose()
{
if (!_disposed)
{
if (_context != IntPtr.Zero)
{
GemmaDestroy(_context);
_context = IntPtr.Zero;
}
if (_logCallbackHandle.IsAllocated)
_logCallbackHandle.Free();
_disposed = true;
}
}
// Finalizer safety net: frees the native context if Dispose was never called.
~Gemma()
{
Dispose();
}
}
}

128
gemma/bindings/c_api.cc Normal file
View File

@ -0,0 +1,128 @@
// Copyright 2025 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef GEMMA_EXPORTS
#define GEMMA_EXPORTS
#endif
#include "gemma/bindings/c_api.h"
extern "C" {
// Creates a GemmaContext from model files. Returns nullptr on any failure
// (all exceptions are swallowed so none cross the C ABI boundary).
GEMMA_API GemmaContext* GemmaCreate(const char* tokenizer_path,
const char* model_type,
const char* weights_path,
const char* weight_type, int max_length) {
try {
GemmaContext* ctx = GemmaContext::Create(
tokenizer_path, model_type, weights_path, weight_type, max_length);
return ctx;
} catch (...) {
return nullptr;
}
}
// Destroys a context created by GemmaCreate. Safe to call with nullptr.
GEMMA_API void GemmaDestroy(GemmaContext* ctx) { delete ctx; }
// Generates text for `prompt` into `output` (up to max_length). Returns the
// result from GemmaContext::Generate, or -1 if ctx is null.
GEMMA_API int GemmaGenerate(GemmaContext* ctx, const char* prompt, char* output,
int max_length, GemmaTokenCallback callback,
void* user_data) {
if (!ctx) return -1;
return ctx->Generate(prompt, output, max_length, callback, user_data);
}
// Multimodal variant: also takes raw image data plus its dimensions.
GEMMA_API int GemmaGenerateMultimodal(GemmaContext* ctx, const char* prompt,
const void* image_data, int image_width,
int image_height, char* output,
int max_length,
GemmaTokenCallback callback,
void* user_data) {
if (!ctx) return -1;
return ctx->GenerateMultimodal(prompt, image_data, image_width, image_height,
output, max_length, callback, user_data);
}
// Returns the token count of `text`, or -1 on null arguments.
GEMMA_API int GemmaCountTokens(GemmaContext* ctx, const char* text) {
if (!ctx || !text) return -1;
return ctx->CountTokens(text);
}
// Registers a callback to receive library debug log messages.
GEMMA_API void GemmaSetLogCallback(GemmaContext* ctx, GemmaLogCallback callback,
void* user_data) {
if (!ctx) return;
ctx->SetLogCallback(callback, user_data);
}
// Configuration functions implementation: thin null-checked forwarders to
// the corresponding GemmaContext setters.
GEMMA_API void GemmaSetMaxGeneratedTokens(GemmaContext* ctx, int value) {
if (!ctx) return;
ctx->SetMaxGeneratedTokens(value);
}
GEMMA_API void GemmaSetMultiturn(GemmaContext* ctx, int value) {
if (!ctx) return;
ctx->SetMultiturn(value);
}
GEMMA_API void GemmaSetTemperature(GemmaContext* ctx, float value) {
if (!ctx) return;
ctx->SetTemperature(value);
}
GEMMA_API void GemmaSetTopK(GemmaContext* ctx, int value) {
if (!ctx) return;
ctx->SetTopK(value);
}
// Nonzero value enables deterministic sampling.
GEMMA_API void GemmaSetDeterministic(GemmaContext* ctx, int value) {
if (!ctx) return;
ctx->SetDeterministic(value != 0);
}
GEMMA_API void GemmaSetPrefillTbatchSize(GemmaContext* ctx, int value) {
if (!ctx) return;
ctx->SetPrefillTbatchSize(value);
}
// Resets the active conversation's state.
GEMMA_API void GemmaResetConversation(GemmaContext* ctx) { // Renamed function
if (!ctx) return;
ctx->ResetConversation();
}
// Conversation management: each returns 1 on success, 0 on failure or on
// null arguments.
GEMMA_API int GemmaCreateConversation(GemmaContext* ctx,
const char* conversation_name) {
if (!ctx || !conversation_name) return 0;
return ctx->CreateConversation(conversation_name) ? 1 : 0;
}
GEMMA_API int GemmaSwitchConversation(GemmaContext* ctx,
const char* conversation_name) {
if (!ctx || !conversation_name) return 0;
return ctx->SwitchConversation(conversation_name) ? 1 : 0;
}
GEMMA_API int GemmaDeleteConversation(GemmaContext* ctx,
const char* conversation_name) {
if (!ctx || !conversation_name) return 0;
return ctx->DeleteConversation(conversation_name) ? 1 : 0;
}
GEMMA_API int GemmaHasConversation(GemmaContext* ctx,
const char* conversation_name) {
if (!ctx || !conversation_name) return 0;
return ctx->HasConversation(conversation_name) ? 1 : 0;
}
}

86
gemma/bindings/c_api.h Normal file
View File

@ -0,0 +1,86 @@
// Copyright 2025 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef THIRD_PARTY_GEMMA_C_API_H_
#define THIRD_PARTY_GEMMA_C_API_H_

// C API for gemma.cpp, exported from the shared library built with
// BUILD_GEMMA_DLL and consumed from other languages (see GemmaInterop.cs).

#include "gemma/bindings/context.h"

// Fix: `bool` in the C branch of the callback typedefs below requires
// <stdbool.h> for C consumers prior to C23.
#ifndef __cplusplus
#include <stdbool.h>
#endif

// GEMMA_API is dllexport while building the DLL (GEMMA_EXPORTS defined),
// dllimport for Windows consumers, and default ELF visibility elsewhere.
#ifdef _WIN32
#ifdef GEMMA_EXPORTS
#define GEMMA_API __declspec(dllexport)
#else
#define GEMMA_API __declspec(dllimport)
#endif
#else
#define GEMMA_API __attribute__((visibility("default")))
#endif

#ifdef __cplusplus
extern "C" {
#endif

// Opaque context handle (gcpp::GemmaContext under the hood).
#ifdef __cplusplus
typedef gcpp::GemmaContext GemmaContext;
#else
typedef struct GemmaContext GemmaContext;
#endif

// Invoked once per generated token; return false to stop generation.
typedef bool (*GemmaTokenCallback)(const char* text, void* user_data);
// Receives debug log messages from the library.
typedef void (*GemmaLogCallback)(const char* message, void* user_data);

// Creates a context from model files. Returns NULL on failure.
GEMMA_API GemmaContext* GemmaCreate(const char* tokenizer_path,
                                    const char* model_type,
                                    const char* weights_path,
                                    const char* weight_type, int max_length);
// Destroys a context created by GemmaCreate. Safe to call with NULL.
GEMMA_API void GemmaDestroy(GemmaContext* ctx);

// Generates text for `prompt` into `output` (up to max_length bytes).
// Returns the generated length, or a negative value on error.
GEMMA_API int GemmaGenerate(GemmaContext* ctx, const char* prompt, char* output,
                            int max_length, GemmaTokenCallback callback,
                            void* user_data);
// Multimodal variant: also takes raw image data plus its dimensions.
GEMMA_API int GemmaGenerateMultimodal(GemmaContext* ctx, const char* prompt,
                                      const void* image_data, // Renamed param
                                      int image_width,        // Added dimension
                                      int image_height,       // Added dimension
                                      char* output, int max_length,
                                      GemmaTokenCallback callback,
                                      void* user_data);
// Returns the token count of `text`, or a negative value on error.
GEMMA_API int GemmaCountTokens(GemmaContext* ctx, const char* text);
GEMMA_API void GemmaSetLogCallback(GemmaContext* ctx, GemmaLogCallback callback,
                                   void* user_data);

// Configuration functions
// Fix: GemmaSetMaxGeneratedTokens and GemmaSetPrefillTbatchSize are
// implemented in c_api.cc and called via P/Invoke, but were missing from
// this header.
GEMMA_API void GemmaSetMaxGeneratedTokens(GemmaContext* ctx, int value);
GEMMA_API void GemmaSetMultiturn(GemmaContext* ctx, int value);
GEMMA_API void GemmaSetTemperature(GemmaContext* ctx, float value);
GEMMA_API void GemmaSetTopK(GemmaContext* ctx, int value);
GEMMA_API void GemmaSetDeterministic(GemmaContext* ctx, int value);
GEMMA_API void GemmaSetPrefillTbatchSize(GemmaContext* ctx, int value);
GEMMA_API void GemmaResetConversation(GemmaContext* ctx); // Renamed

// Conversation management functions (renamed). Each returns 1 on success,
// 0 on failure.
GEMMA_API int GemmaCreateConversation(
    GemmaContext* ctx, const char* conversation_name);  // Renamed
GEMMA_API int GemmaSwitchConversation(
    GemmaContext* ctx, const char* conversation_name);  // Renamed
GEMMA_API int GemmaDeleteConversation(
    GemmaContext* ctx, const char* conversation_name);  // Renamed
GEMMA_API int GemmaHasConversation(GemmaContext* ctx,
                                   const char* conversation_name);  // Renamed

#ifdef __cplusplus
}
#endif
#endif  // THIRD_PARTY_GEMMA_C_API_H_

331
gemma/bindings/context.cc Normal file
View File

@ -0,0 +1,331 @@
// Copyright 2025 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gemma/bindings/context.h"
#include <cstddef>
#include <cstring>
#include <memory>
#include <sstream>
#include <vector>
#include "evals/benchmark_helper.h" // InitGenerator
#include "gemma/gemma.h"
#include "gemma/gemma_args.h"
#include "gemma/tokenizer.h" // WrapAndTokenize
#include "util/threading.h"
#include "util/threading_context.h"
#include "hwy/profiler.h"
#include "hwy/timer.h"
#ifdef _WIN32
#include <Windows.h>
#endif
#include "gemma/kv_cache.h"
#include "paligemma/image.h"
namespace gcpp {
// ConversationData constructor: allocates a fresh KVCache sized for the model
// config and starts the conversation at absolute position 0.
ConversationData::ConversationData(const ModelConfig& model_config,
size_t prefill_tbatch_size)
: kv_cache(std::make_unique<KVCache>(
KVCache::Create(model_config, prefill_tbatch_size))),
abs_pos(0) {}
// Initialize static members used by the process-wide log callback
// (shared across all GemmaContext instances).
GemmaLogCallback GemmaContext::s_log_callback = nullptr;
void* GemmaContext::s_log_user_data = nullptr;
// Factory for GemmaContext. Validates the loader configuration and seeds
// default inference settings before constructing the context.
// NOTE(review): HWY_ABORT on invalid loader config terminates the process;
// callers expecting GemmaCreate() to return NULL only get that behavior for
// exceptions, not for this abort -- confirm this is intended.
GemmaContext* GemmaContext::Create(const char* tokenizer_path,
const char* model_type,
const char* weights_path,
const char* weight_type, int max_length) {
std::stringstream ss;
ss << "Creating GemmaContext with tokenizer_path: "
<< (tokenizer_path ? tokenizer_path : "null")
<< ", model_type: " << (model_type ? model_type : "null")
<< ", weights_path: " << (weights_path ? weights_path : "null")
<< ", weight_type: " << (weight_type ? weight_type : "null")
<< ", max_length: " << max_length;
LogDebug(ss.str().c_str());
ThreadingArgs threading_args;
// Disable spin-waiting for worker threads.
threading_args.spin = gcpp::Tristate::kFalse;
LoaderArgs loader(tokenizer_path, weights_path, model_type);
loader.weight_type_str = weight_type;
LogDebug("LoaderArgs created");
if (const char* error = loader.Validate()) {
ss.str("");
ss << "Invalid loader configuration: " << error;
LogDebug(ss.str().c_str());
HWY_ABORT("Invalid loader configuration: %s", error);
}
LogDebug("Loader validated successfully");
// Initialize cached args with defaults; these can be changed later via the
// GemmaSet* C API functions.
LogDebug("Initializing inference args");
InferenceArgs inference_args;
inference_args.Init();
inference_args.max_generated_tokens = max_length;
inference_args.temperature = 0.7f;
inference_args.top_k = 1;
inference_args.deterministic = false;
ss.str("");
ss << "Inference args initialized with max_tokens: " << max_length
<< ", temperature: " << inference_args.temperature
<< ", top_k: " << inference_args.top_k << ", deterministic: "
<< (inference_args.deterministic ? "true" : "false");
LogDebug(ss.str().c_str());
// Ownership of the returned pointer passes to the caller (GemmaDestroy).
return new GemmaContext(loader, inference_args, threading_args, max_length);
}
// Constructs the context: builds the MatMul environment, loads the model,
// and seeds the conversation cache with a "default" conversation that is
// also made active.
GemmaContext::GemmaContext(const LoaderArgs& loader,
const InferenceArgs& inference_args,
const ThreadingArgs& threading_args, int max_length)
: inference_args(inference_args),
threading_args(threading_args),
matmul_env(MakeMatMulEnv(threading_args)),
model(CreateGemma(loader, matmul_env)) {
// NOTE(review): `ss` is declared but never used in this constructor body.
std::stringstream ss;
LogDebug("Creating initial ConversationData");
// Create the initial ConversationData object using make_shared
active_conversation = std::make_shared<ConversationData>(
model.GetModelConfig(), inference_args.prefill_tbatch_size);
LogDebug(
"Storing initial ConversationData in conversation_cache[\"default\"]");
// Store the shared_ptr in the map under the "default" key
conversation_cache["default"] = active_conversation;
LogDebug("GemmaContext constructor completed");
}
// Internal implementation shared by Generate and GenerateMultimodal
int GemmaContext::GenerateInternal(const char* prompt_string,
const void* image_data, int image_width,
int image_height, char* output,
int max_length, GemmaTokenCallback callback,
void* user_data) {
PROFILER_ZONE("Gen.Internal");
size_t tokens_generated_this_turn = 0; // differentiates prefill from reply
size_t prompt_size = 0;
std::stringstream ss;
result_buffer.clear();
InitGenerator(inference_args, gen);
// Ensure we have an active conversation
if (!active_conversation || !active_conversation->kv_cache) {
LogDebug("Generate called with null active_conversation or kv_cache");
return -1;
}
// callback function invoked for each generated token.
auto stream_token = [&, callback, user_data](int token, float) {
// Use abs_pos from the active conversation
++(active_conversation->abs_pos);
const bool in_prompt = tokens_generated_this_turn < prompt_size;
const bool first_response_token = tokens_generated_this_turn == prompt_size;
++tokens_generated_this_turn;
if (in_prompt || model.GetModelConfig().IsEOS(token)) {
return true;
}
std::string token_text;
HWY_ASSERT(model.Tokenizer().Decode(std::vector<int>{token}, &token_text));
if (first_response_token) {
token_text.erase(0, token_text.find_first_not_of(" \t\n"));
}
// if we have a managed callback, pass it the token text
if (callback) {
if (!callback(token_text.c_str(), user_data)) {
LogDebug("Callback returned false, stopping generation");
return false;
}
}
result_buffer.append(token_text);
return true;
};
// set up runtime config
TimingInfo timing_info = {};
RuntimeConfig runtime_config = {.gen = &gen,
.stream_token = stream_token,
.use_spinning = threading_args.spin};
inference_args.CopyTo(runtime_config);
size_t prefix_end = 0;
// generate
std::vector<int> prompt;
ImageTokens image_tokens;
if (image_data != nullptr) {
size_t pool_dim = model.GetModelConfig().vit_config.pool_dim;
image_tokens =
ImageTokens(model.Env().ctx.allocator,
Extents2D(model.GetModelConfig().vit_config.seq_len /
(pool_dim * pool_dim),
model.GetModelConfig().model_dim));
HWY_ASSERT(model.Info().wrapping == PromptWrapping::PALIGEMMA ||
model.Info().wrapping == PromptWrapping::GEMMA_VLM);
Image image;
image.Set(image_width, image_height, static_cast<const float*>(image_data));
// We may need to resize the supplied image depending on whether we're using
// PaliGemma or Gemma 3.
const size_t image_size = model.GetModelConfig().vit_config.image_size;
image.Resize(image_size, image_size);
// Use the existing runtime_config defined earlier in the function.
// RuntimeConfig runtime_config = { ... }; // This was already defined
double image_tokens_start = hwy::platform::Now();
// Pass the populated image object to GenerateImageTokens
model.GenerateImageTokens(runtime_config, image, image_tokens);
double image_tokens_duration = hwy::platform::Now() - image_tokens_start;
ss.str("");
ss << "\n\n[ Timing info ] Image token generation took: ";
ss << static_cast<int>(image_tokens_duration * 1000) << " ms\n",
LogDebug(ss.str().c_str());
prompt = WrapAndTokenize(model.Tokenizer(), model.ChatTemplate(),
model.Info(), active_conversation->abs_pos,
prompt_string, image_tokens.BatchSize());
runtime_config.image_tokens = &image_tokens;
prompt_size = prompt.size();
// The end of the prefix for prefix-LM style attention in Paligemma.
// See Figure 2 of https://arxiv.org/abs/2407.07726.
prefix_end = prompt_size;
} else {
// Text-only case (original logic)
// Use abs_pos from the active conversation
prompt =
WrapAndTokenize(model.Tokenizer(), model.ChatTemplate(), model.Info(),
active_conversation->abs_pos, prompt_string);
prompt_size = prompt.size();
}
// Check if prompt generation failed (e.g., multimodal not implemented yet)
if (prompt.empty() && image_data != nullptr) {
// Already logged the error, just ensure we don't proceed.
return -1;
}
// Pass the KVCache object by reference from the active conversation
model.Generate(runtime_config, prompt, active_conversation->abs_pos,
prefix_end, *(active_conversation->kv_cache), timing_info);
// prepare for next turn
if (!inference_args.multiturn ||
model.Info().wrapping == PromptWrapping::PALIGEMMA) {
// If not multiturn, or Paligemma (which handles turns differently),
// reset the *active* conversation's position.
active_conversation->abs_pos = 0;
InitGenerator(inference_args, gen);
} else {
// Multi-turn Gemma: Rewind position in the active conversation
// The last token was either EOS, then it should be ignored because it is
// never part of the dialog, see Table 5 in the Gemma-2 paper:
// https://arxiv.org/pdf/2408.00118
// Or we have hit max_generated_tokens, then the last token will be lost.
// (We could store it in stream_token, and then prepend to the next turn,
// but it's not worth the complexity, as multi-turn with max_generated is
// not a common use case.)
// In either case, we need to rewind the active conversation's abs_pos by
// one.
HWY_ASSERT(active_conversation->abs_pos > 0);
active_conversation->abs_pos--;
}
// Copy result buffer to output C-string (ensure null termination)
strncpy(output, result_buffer.c_str(), max_length - 1);
output[max_length - 1] = '\0'; // Explicit null termination
return static_cast<int>(strlen(output)); // Return length of the C-string
}
// Public text-only generation entry point.
//
// Forwards to GenerateInternal with no image payload. Returns the length of
// the generated text written to `output`, or -1 on error.
int GemmaContext::Generate(const char* prompt_string, char* output,
                           int max_length, GemmaTokenCallback callback,
                           void* user_data) {
  // Text-only: no image pointer and zero-sized dimensions.
  return GenerateInternal(prompt_string, /*image_data=*/nullptr,
                          /*image_width=*/0, /*image_height=*/0, output,
                          max_length, callback, user_data);
}
// Public multimodal (text + image) generation entry point.
//
// Requires a non-null image payload; callers wanting text-only generation
// must use Generate() instead. Returns the length of the generated text
// written to `output`, or -1 on error.
int GemmaContext::GenerateMultimodal(const char* prompt_string,
                                     const void* image_data, int image_width,
                                     int image_height, char* output,
                                     int max_length,
                                     GemmaTokenCallback callback,
                                     void* user_data) {
  // Reject a missing image here rather than forwarding: this entry point's
  // contract is "multimodal", so a null image is a caller error.
  if (image_data == nullptr) {
    LogDebug(
        "GenerateMultimodal called with null image_data. Use Generate for "
        "text-only.");
    return -1;
  }
  return GenerateInternal(prompt_string, image_data, image_width, image_height,
                          output, max_length, callback, user_data);
}
// Tokenizes `text` with the model's tokenizer and returns the token count,
// or -1 if `text` is null or an exception escapes tokenization.
//
// Exceptions are swallowed (with a log message) so that no C++ exception
// crosses the C API boundary.
int GemmaContext::CountTokens(const char* text) {
  LogDebug("CountTokens method started");
  std::stringstream ss;
  ss << "CountTokens called with text: '" << (text ? text : "null") << "'";
  LogDebug(ss.str().c_str());
  if (!text) {
    // Null input: nothing to tokenize. (The previous nested re-check of
    // `!text` inside this branch was always true and has been removed.)
    LogDebug("CountTokens failed: Invalid parameters");
    return -1;
  }
  try {
    std::vector<int> tokens;
    // Encode returns false on tokenizer failure; treat that as fatal.
    HWY_ASSERT(model.Tokenizer().Encode(std::string(text), &tokens));
    ss.str("");
    ss << "Text tokenized into " << tokens.size() << " tokens";
    LogDebug(ss.str().c_str());
    LogDebug("CountTokens completed successfully");
    return static_cast<int>(tokens.size());
  } catch (...) {
    LogDebug("Unknown exception in CountTokens");
    return -1;
  }
}
} // namespace gcpp

249
gemma/bindings/context.h Normal file
View File

@ -0,0 +1,249 @@
// Copyright 2025 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef THIRD_PARTY_GEMMA_CPP_GEMMA_CONTEXT_H_
#define THIRD_PARTY_GEMMA_CPP_GEMMA_CONTEXT_H_
#include <memory> // For std::shared_ptr, std::make_shared
#include <random>
#include <string>
#include <unordered_map>
#include <vector>
// Logging
#ifdef _WIN32
#include <windows.h>
#else
#include <stdio.h>
#endif
#include "gemma/common.h"
#include "gemma/gemma.h"
#include "gemma/gemma_args.h"
#include "ops/matmul.h" // MatMulEnv
#include "hwy/base.h"
#include "hwy/highway.h"
namespace gcpp {
// Forward declaration - use 'struct' to match definition tag
struct KVCache;
// Struct to hold data for a single conversation thread
struct ConversationData {
  // Per-conversation KV cache; replaced wholesale when the conversation is
  // reset (see GemmaContext::ResetConversation).
  std::unique_ptr<KVCache> kv_cache;
  // Absolute token position reached so far in this conversation.
  size_t abs_pos = 0;
  // Constructor to initialize kv_cache (requires KVCache definition or forward
  // declaration)
  ConversationData(const ModelConfig& model_config, size_t prefill_tbatch_size);
};
// Streaming callback invoked once per generated token with its decoded text;
// returning false stops generation early.
typedef bool (*GemmaTokenCallback)(const char* text, void* user_data);
// Optional log sink for diagnostic messages (see GemmaContext::SetLogCallback).
typedef void (*GemmaLogCallback)(const char* message, void* user_data);
// C++ facade behind the C API: owns the model, the per-conversation KV
// caches, and the generation settings exposed through the C bindings.
class GemmaContext {
 private:
  // Construction goes through Create(); the constructor is defined in
  // context.cc.
  GemmaContext(const LoaderArgs& loader, const InferenceArgs& inference_args,
               const ThreadingArgs& threading_args, int max_length);

 public:
  // Factory: loads tokenizer and weights and returns a ready context.
  static GemmaContext* Create(const char* tokenizer_path,
                              const char* model_type, const char* weights_path,
                              const char* weight_type, int max_length);

  // Returns length of generated text, or -1 on error
  int Generate(const char* prompt_string, char* output, int max_length,
               GemmaTokenCallback callback, void* user_data);

  // Returns length of generated text, or -1 on error
  int GenerateMultimodal(const char* prompt_string, const void* image_data,
                         int image_width, int image_height, char* output,
                         int max_length, GemmaTokenCallback callback,
                         void* user_data);

  // Returns number of tokens in text, or -1 on error
  int CountTokens(const char* text);

  // Installs a process-wide log sink used by LogDebug; pass nullptr to
  // restore the default (debugger/stdout) output.
  static void SetLogCallback(GemmaLogCallback callback, void* user_data) {
    s_log_callback = callback;
    s_log_user_data = user_data;
  }

  // Set max generated tokens
  void SetMaxGeneratedTokens(size_t value) {
    inference_args.max_generated_tokens = value;
    LogDebug("Setting max_generated_tokens to configured value");
  }

  // Set multiturn flag (0 = disabled, 1 = enabled)
  void SetMultiturn(int value) {
    inference_args.multiturn = value;
    LogDebug("Setting multiturn to configured value");
  }

  // Set temperature for token generation
  void SetTemperature(float value) {
    inference_args.temperature = value;
    LogDebug("Setting temperature to configured value");
  }

  // Set top_k parameter for sampling
  void SetTopK(int value) {
    inference_args.top_k = value;
    LogDebug("Setting top_k to configured value");
  }

  // Set deterministic flag. When enabled, reseeds the RNG with a fixed
  // value so repeated runs sample identically.
  void SetDeterministic(bool value) {
    inference_args.deterministic = value;
    if (value) {
      gen.seed(0x87654321);
    }
    LogDebug("Setting deterministic flag to configured value");
  }

  // Set prefill_tbatch_size
  void SetPrefillTbatchSize(size_t value) {
    inference_args.prefill_tbatch_size = value;
    LogDebug("Setting prefill_tbatch_size to configured value");
  }

  // Reset the currently active conversation: rewind its position and
  // replace its KV cache so the next Generate starts a fresh dialog.
  void ResetConversation() {
    if (active_conversation) {
      LogDebug("Resetting active conversation");
      active_conversation->abs_pos = 0;
      // Replace the cache within the current ConversationData object.
      active_conversation->kv_cache = std::make_unique<KVCache>(KVCache::Create(
          model.GetModelConfig(), inference_args.prefill_tbatch_size));
      LogDebug("Active conversation reset");
    } else {
      LogDebug("Cannot reset conversation: active_conversation is null");
    }
  }

  // Create a new named conversation; fails if the name already exists.
  // Does not switch to it (use SwitchConversation).
  bool CreateConversation(const char* conversation_name) {
    std::string name(conversation_name);
    if (conversation_cache.count(name)) {
      LogDebug("Conversation already exists");
      return false;
    }
    LogDebug("Creating new conversation");
    conversation_cache[name] = std::make_shared<ConversationData>(
        model.GetModelConfig(), inference_args.prefill_tbatch_size);
    return true;
  }

  // Switch the active conversation to a named one; fails if not found.
  bool SwitchConversation(const char* conversation_name) {
    std::string name(conversation_name);
    auto it = conversation_cache.find(name);
    if (it == conversation_cache.end()) {
      LogDebug("Conversation not found");
      return false;
    }
    LogDebug("Switching active conversation");
    active_conversation = it->second;
    return true;
  }

  // Delete a named conversation. The "default" conversation and the
  // currently active one are protected and cannot be deleted.
  bool DeleteConversation(const char* conversation_name) {
    std::string name(conversation_name);
    auto it = conversation_cache.find(name);
    if (it == conversation_cache.end()) {
      LogDebug("Conversation not found for deletion");
      return false;
    }
    if (name == "default") {
      LogDebug("Cannot delete the default conversation");
      return false;
    }
    if (it->second == active_conversation) {
      LogDebug("Cannot delete the currently active conversation");
      return false;
    }
    LogDebug("Deleting conversation");
    conversation_cache.erase(it);
    return true;
  }

  // Check if a named conversation exists.
  bool HasConversation(const char* conversation_name) {
    std::string name(conversation_name);
    // Explicit comparison instead of an implicit size_t -> bool narrowing.
    return conversation_cache.count(name) != 0;
  }

 private:
  // Internal implementation shared by Generate and GenerateMultimodal.
  // image_data is null (with zero dimensions) for text-only generation.
  int GenerateInternal(const char* prompt_string, const void* image_data,
                       int image_width, int image_height, char* output,
                       int max_length, GemmaTokenCallback callback,
                       void* user_data);

  // Pointer to the currently active conversation's data.
  std::shared_ptr<ConversationData> active_conversation;
  // Cache of all named conversations.
  std::unordered_map<std::string, std::shared_ptr<ConversationData>>
      conversation_cache;

  // Scratch buffers reused across calls (potentially could be moved into
  // ConversationData if needed per-conversation).
  std::string prompt_buffer;
  std::string result_buffer;
  std::vector<int> token_buffer;

  // Cached args (remain global for the context).
  InferenceArgs inference_args;
  ThreadingArgs threading_args;
  MatMulEnv matmul_env;
  // Model itself (don't move this, needs to be below the args above).
  Gemma model;
  // Random generator (remains global for the context).
  std::mt19937 gen;

  // Static members for logging.
  static GemmaLogCallback s_log_callback;
  static void* s_log_user_data;

  // Routes a message to the registered callback if any; otherwise falls
  // back to the platform's debug output.
  static void LogDebug(const char* message) {
    if (s_log_callback) {
      s_log_callback(message, s_log_user_data);
    } else {
#ifdef _WIN32
      OutputDebugStringA(message);
#else
      printf("%s", message);
#endif
    }
  }
};
} // namespace gcpp
#endif // THIRD_PARTY_GEMMA_CPP_GEMMA_CONTEXT_H_

View File

@ -271,12 +271,6 @@ class Gemma {
ModelWeightsStorage model_; ModelWeightsStorage model_;
}; };
// Adds BOS token and possibly 'turn' annotations, which depend on `info`
// and `pos`, the number of tokens decoded so far; returns the corresponding
// tokens. Asserts that tokenization is successful.
std::vector<int> WrapAndTokenize(const GemmaTokenizer& tokenizer,
const ModelInfo& info, size_t pos,
std::string& prompt);
void RangeChecks(const ModelConfig& weights_config, void RangeChecks(const ModelConfig& weights_config,
size_t& max_generated_tokens, size_t prompt_size); size_t& max_generated_tokens, size_t prompt_size);

View File

@ -28,6 +28,7 @@
#include "gemma/common.h" #include "gemma/common.h"
#include "gemma/gemma.h" // Gemma #include "gemma/gemma.h" // Gemma
#include "gemma/gemma_args.h" #include "gemma/gemma_args.h"
#include "gemma/tokenizer.h" // WrapAndTokenize
#include "hwy/base.h" #include "hwy/base.h"
#include "hwy/highway.h" #include "hwy/highway.h"
#include "hwy/profiler.h" #include "hwy/profiler.h"