mirror of https://github.com/google/gemma.cpp.git
142 lines
4.9 KiB
C++
142 lines
4.9 KiB
C++
// Copyright 2024 Google LLC
|
|
// SPDX-License-Identifier: Apache-2.0
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// https://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
// Defines Gemma member functions which dynamic-dispatch into the SIMD
|
|
// implementations in gemma-inl.h.
|
|
|
|
#include "gemma/gemma.h"
|
|
|
|
// Compiles this file for multiple architectures via "foreach_target.h", to
|
|
// which we pass the filename via macro 'argument'.
|
|
// clang-format off
|
|
#undef HWY_TARGET_INCLUDE
|
|
#define HWY_TARGET_INCLUDE "gemma/gemma.cc" // NOLINT
|
|
// clang-format on
|
|
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
|
#include "hwy/highway.h"
|
|
// After highway.h
|
|
#include "gemma/gemma-inl.h"
|
|
|
|
#ifndef GEMMA_CC_ONCE
|
|
#define GEMMA_CC_ONCE
|
|
|
|
#include <stddef.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
#include <vector>
|
|
|
|
// Placeholder for internal header, do not modify.
|
|
#include "gemma/configs.h"
|
|
#include "gemma/model_store.h"
|
|
#include "gemma/tokenizer.h"
|
|
#include "gemma/weights.h"
|
|
#include "io/blob_store.h"
|
|
#include "io/io.h" // Path
|
|
#include "ops/matmul.h"
|
|
#include "paligemma/image.h"
|
|
#include "util/threading_context.h"
|
|
#include "hwy/base.h"
|
|
|
|
#endif // GEMMA_CC_ONCE
|
|
|
|
#if HWY_ONCE
|
|
namespace gcpp {
|
|
// Export the per-target functions so that the `HWY_DYNAMIC_DISPATCH` calls in
// the `Gemma` member functions below can refer to them by name.
HWY_EXPORT(GenerateSingleT);
|
|
HWY_EXPORT(GenerateBatchT);
|
|
HWY_EXPORT(GenerateImageTokensT);
|
|
|
|
// Internal init must run before I/O. This helper function takes care of that,
|
|
// plus calling `SetArgs`.
|
|
MatMulEnv MakeMatMulEnv(const ThreadingArgs& threading_args) {
|
|
// Placeholder for internal init, do not modify.
|
|
|
|
ThreadingContext::SetArgs(threading_args);
|
|
return MatMulEnv(ThreadingContext::Get());
|
|
}
|
|
|
|
// Constructs a `Gemma` from loader/inference arguments and a caller-provided
// `MatMulEnv`.
//
// The member initializers run in this order: store the `env` reference, build
// the blob reader from `loader.weights`, build the model store from the reader
// plus `loader.tokenizer` / `loader.wrapping`, then create the weights and
// chat template from the resulting config and tokenizer. The body then reads
// the tensors via `ReadFromBlobs` and closes the reader's file.
Gemma::Gemma(const LoaderArgs& loader, const InferenceArgs& inference,
             MatMulEnv& env)
    : env_(env),
      reader_(loader.weights),
      model_(reader_, loader.tokenizer, loader.wrapping),
      weights_(model_.Config()),
      chat_template_(model_.Tokenizer(), model_.Config().model) {
  auto& pool = env.ctx.pools.Pool();
  weights_.ReadFromBlobs(model_, reader_, loader, inference, mat_owners_,
                         pool);

  // The reader is not used again by this constructor, so close its file now.
  reader_.CloseFile();
}
|
|
|
|
// Defaulted destructor, defined out of line in this translation unit.
Gemma::~Gemma() = default;
|
|
|
|
void Gemma::Save(const Path& weights_path, hwy::ThreadPool& pool) const {
|
|
BlobWriter writer;
|
|
const std::vector<uint32_t> serialized_mat_ptrs =
|
|
weights_.AddTensorDataToWriter(writer);
|
|
WriteSingleFile(model_.Config(), model_.Tokenizer(), serialized_mat_ptrs,
|
|
writer, env_.ctx.pools.Pool(), weights_path);
|
|
}
|
|
|
|
void Gemma::Generate(const RuntimeConfig& runtime_config,
|
|
const PromptTokens& prompt, size_t pos, size_t prefix_end,
|
|
KVCache& kv_cache, TimingInfo& timing_info) const {
|
|
env_.ctx.pools.MaybeStartSpinning(runtime_config.use_spinning);
|
|
|
|
HWY_DYNAMIC_DISPATCH(GenerateSingleT)(model_.Config(), weights_,
|
|
runtime_config, prompt, pos, prefix_end,
|
|
kv_cache, &env_, timing_info);
|
|
|
|
env_.ctx.pools.MaybeStopSpinning(runtime_config.use_spinning);
|
|
}
|
|
|
|
void Gemma::GenerateBatch(const RuntimeConfig& runtime_config,
|
|
const QueriesPromptTokens& queries_prompt,
|
|
const QueriesPos& queries_pos,
|
|
const QueriesPos& queries_prefix_end,
|
|
const KVCaches& kv_caches,
|
|
TimingInfo& timing_info) const {
|
|
// If we did not get passed prefix ends (size 0), assume 0 and pass that on.
|
|
QueriesPos mutable_queries_prefix_end = queries_prefix_end;
|
|
std::vector<size_t> prefix_end_vec;
|
|
if (queries_prefix_end.size() == 0) { // hwy::Span lacks empty()
|
|
prefix_end_vec.resize(queries_prompt.size(), 0);
|
|
mutable_queries_prefix_end =
|
|
QueriesPos(prefix_end_vec.data(), prefix_end_vec.size());
|
|
}
|
|
|
|
env_.ctx.pools.MaybeStartSpinning(runtime_config.use_spinning);
|
|
|
|
HWY_DYNAMIC_DISPATCH(GenerateBatchT)(
|
|
model_.Config(), weights_, runtime_config, queries_prompt, queries_pos,
|
|
mutable_queries_prefix_end, kv_caches, &env_, timing_info);
|
|
|
|
env_.ctx.pools.MaybeStopSpinning(runtime_config.use_spinning);
|
|
}
|
|
|
|
void Gemma::GenerateImageTokens(const RuntimeConfig& runtime_config,
|
|
const Image& image,
|
|
ImageTokens& image_tokens) const {
|
|
env_.ctx.pools.MaybeStartSpinning(runtime_config.use_spinning);
|
|
|
|
HWY_DYNAMIC_DISPATCH(GenerateImageTokensT)(
|
|
model_.Config(), weights_, runtime_config, image, image_tokens, &env_);
|
|
|
|
env_.ctx.pools.MaybeStopSpinning(runtime_config.use_spinning);
|
|
}
|
|
|
|
} // namespace gcpp
|
|
#endif // HWY_ONCE
|