mtmd: add more sanity checks (#21047)
This commit is contained in:
parent
20197b6fe3
commit
871f1a2d2f
|
|
@ -1377,6 +1377,16 @@ struct clip_model_loader {
|
|||
|
||||
// sanity check
|
||||
{
|
||||
if (hparams.image_size < 0) {
|
||||
// note: some models having hparams.image_size == 0, which means the image size is dynamic
|
||||
throw std::runtime_error(string_format("%s: image_size (%d) cannot be negative\n", __func__, hparams.image_size));
|
||||
}
|
||||
if (hparams.patch_size <= 0) {
|
||||
throw std::runtime_error(string_format("%s: patch_size (%d) must be greater than 0\n", __func__, hparams.patch_size));
|
||||
}
|
||||
if (hparams.n_embd <= 0) {
|
||||
throw std::runtime_error(string_format("%s: n_embd (%d) must be greater than 0\n", __func__, hparams.n_embd));
|
||||
}
|
||||
if (hparams.image_max_pixels < hparams.image_min_pixels) {
|
||||
throw std::runtime_error(string_format("%s: image_max_pixels (%d) is less than image_min_pixels (%d)\n", __func__, hparams.image_max_pixels, hparams.image_min_pixels));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -13,23 +13,20 @@
|
|||
|
||||
constexpr bool DEBUG = false;
|
||||
|
||||
void mtmd_audio_cache::fill_sin_cos_table(int n) {
|
||||
void mtmd_audio_cache::fill_sin_cos_table(uint32_t n) {
|
||||
sin_vals.resize(n);
|
||||
cos_vals.resize(n);
|
||||
for (int i = 0; i < n; i++) {
|
||||
for (uint32_t i = 0; i < n; i++) {
|
||||
double theta = (2 * M_PI * i) / n;
|
||||
sin_vals[i] = sinf(theta);
|
||||
cos_vals[i] = cosf(theta);
|
||||
}
|
||||
}
|
||||
|
||||
void mtmd_audio_cache::fill_hann_window(int length, bool periodic) {
|
||||
void mtmd_audio_cache::fill_hann_window(uint32_t length, bool periodic) {
|
||||
hann_window.resize(length);
|
||||
int offset = -1;
|
||||
if (periodic) {
|
||||
offset = 0;
|
||||
}
|
||||
for (int i = 0; i < length; i++) {
|
||||
int offset = periodic ? 0 : -1;
|
||||
for (uint32_t i = 0; i < length; i++) {
|
||||
hann_window[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
|
||||
}
|
||||
}
|
||||
|
|
@ -165,6 +162,7 @@ static void dft_impl(const mtmd_audio_cache & cache, const float * in, int N, fl
|
|||
// false = input is complex-valued (interleaved real/imag, stride 2)
|
||||
template <bool Inverse, bool RealInput>
|
||||
static void fft_impl(const mtmd_audio_cache & cache, float * in, int N, float * out) {
|
||||
GGML_ASSERT(N > 0);
|
||||
const int n_sin_cos_vals = cache.sin_vals.size();
|
||||
|
||||
if (N == 1) {
|
||||
|
|
@ -407,6 +405,8 @@ static bool log_mel_spectrogram(
|
|||
}
|
||||
|
||||
|
||||
GGML_ASSERT(params.n_fft_bins > 0);
|
||||
GGML_ASSERT(params.hop_length > 0);
|
||||
out.n_mel = params.n_mel;
|
||||
out.n_len = (n_samples - frame_size) / frame_step + 1;
|
||||
// TODO: handle these checks better
|
||||
|
|
@ -438,6 +438,7 @@ static bool log_mel_spectrogram(
|
|||
|
||||
const int effective_n_len = n_samples_in / frame_step;
|
||||
if (params.norm_per_feature) {
|
||||
GGML_ASSERT(effective_n_len > 1);
|
||||
for (int i = 0; i < out.n_mel; i++) {
|
||||
double mean = 0;
|
||||
for (int j = 0; j < effective_n_len; ++j) {
|
||||
|
|
@ -639,6 +640,7 @@ mtmd_audio_streaming_istft::mtmd_audio_streaming_istft(int n_fft, int hop_length
|
|||
padding_to_remove((n_fft - hop_length) / 2),
|
||||
ifft_in(n_fft * 2 * 4, 0.0f), // extra space for recursive IFFT
|
||||
ifft_out(n_fft * 2 * 4, 0.0f) {
|
||||
GGML_ASSERT(n_fft > 0 && hop_length > 0 && hop_length <= n_fft);
|
||||
cache.fill_sin_cos_table(n_fft);
|
||||
cache.fill_hann_window(n_fft, true);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -33,9 +33,9 @@ struct mtmd_audio_cache {
|
|||
|
||||
mtmd_audio_mel_filters filters;
|
||||
|
||||
void fill_sin_cos_table(int n);
|
||||
void fill_sin_cos_table(uint32_t n);
|
||||
|
||||
void fill_hann_window(int length, bool periodic);
|
||||
void fill_hann_window(uint32_t length, bool periodic);
|
||||
|
||||
// Build mel filterbank matrix [n_mel × n_fft_bins] at runtime.
|
||||
// n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257.
|
||||
|
|
|
|||
|
|
@ -127,6 +127,7 @@ struct decode_embd_batch {
|
|||
std::vector<int8_t> logits;
|
||||
llama_batch batch;
|
||||
decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
|
||||
GGML_ASSERT(n_tokens > 0 && n_pos_per_embd > 0 && n_mmproj_embd > 0);
|
||||
pos .resize(n_tokens * n_pos_per_embd);
|
||||
n_seq_id.resize(n_tokens);
|
||||
seq_ids .resize(n_tokens + 1);
|
||||
|
|
@ -157,6 +158,7 @@ struct decode_embd_batch {
|
|||
// M-RoPE for image
|
||||
void set_position_mrope_2d(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
|
||||
GGML_ASSERT(n_pos_per_embd == 4);
|
||||
GGML_ASSERT(nx > 0 && ny > 0 && nx * ny == batch.n_tokens);
|
||||
seq_id_0[0] = seq_id;
|
||||
for (int y = 0; y < ny; y++) {
|
||||
for (int x = 0; x < nx; x++) {
|
||||
|
|
@ -192,6 +194,7 @@ struct decode_embd_batch {
|
|||
}
|
||||
|
||||
llama_batch get_view(int offset, int n_tokens) {
|
||||
GGML_ASSERT(offset >= 0 && n_tokens > 0 && offset + n_tokens <= batch.n_tokens);
|
||||
llama_pos * pos_ptr;
|
||||
pos_view.clear();
|
||||
pos_view.reserve(n_tokens * n_pos_per_embd);
|
||||
|
|
@ -235,6 +238,7 @@ int32_t mtmd_helper_decode_image_chunk(
|
|||
llama_seq_id seq_id,
|
||||
int32_t n_batch,
|
||||
llama_pos * new_n_past) {
|
||||
GGML_ASSERT(n_batch > 0);
|
||||
auto chunk_type = mtmd_input_chunk_get_type(chunk);
|
||||
const char * name = chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE ? "image" : "audio";
|
||||
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
|
|
@ -312,6 +316,7 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
|
|||
int32_t n_batch,
|
||||
bool logits_last,
|
||||
llama_pos * new_n_past) {
|
||||
GGML_ASSERT(n_batch > 0);
|
||||
int32_t ret;
|
||||
llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
|
||||
auto chunk_type = mtmd_input_chunk_get_type(chunk);
|
||||
|
|
@ -508,6 +513,11 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char *
|
|||
fseek(f, 0, SEEK_END);
|
||||
long file_size = ftell(f);
|
||||
fseek(f, 0, SEEK_SET);
|
||||
if (file_size < 0) {
|
||||
LOG_ERR("Failed to get file size of %s\n", fname);
|
||||
fclose(f);
|
||||
return nullptr;
|
||||
}
|
||||
buf.resize(file_size);
|
||||
|
||||
size_t n_read = fread(buf.data(), 1, file_size, f);
|
||||
|
|
|
|||
|
|
@ -99,6 +99,8 @@ struct img_tool {
|
|||
}
|
||||
|
||||
static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) {
|
||||
GGML_ASSERT(x >= 0 && y >= 0 && w > 0 && h > 0);
|
||||
GGML_ASSERT(x + w <= image.nx && y + h <= image.ny);
|
||||
dst.nx = w;
|
||||
dst.ny = h;
|
||||
dst.buf.resize(3 * w * h);
|
||||
|
|
@ -196,6 +198,7 @@ struct img_tool {
|
|||
private:
|
||||
// Bilinear resize function
|
||||
static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) {
|
||||
GGML_ASSERT(src.nx >= 2 && src.ny >= 2);
|
||||
dst.nx = target_width;
|
||||
dst.ny = target_height;
|
||||
dst.buf.resize(3 * target_width * target_height);
|
||||
|
|
@ -207,8 +210,8 @@ private:
|
|||
for (int x = 0; x < target_width; x++) {
|
||||
float px = x_ratio * x;
|
||||
float py = y_ratio * y;
|
||||
int x_floor = static_cast<int>(px);
|
||||
int y_floor = static_cast<int>(py);
|
||||
int x_floor = std::min(static_cast<int>(px), src.nx - 2);
|
||||
int y_floor = std::min(static_cast<int>(py), src.ny - 2);
|
||||
float x_lerp = px - x_floor;
|
||||
float y_lerp = py - y_floor;
|
||||
|
||||
|
|
@ -347,6 +350,7 @@ private:
|
|||
// Returns: kernel size (ksize) - number of input pixels that contribute to each output pixel
|
||||
auto precompute_weights = [&](int inSize, int outSize,
|
||||
std::vector<int> & bounds, std::vector<int32_t> & weights) -> int {
|
||||
GGML_ASSERT(inSize > 0 && outSize > 0);
|
||||
double support, scale, filterscale;
|
||||
double center, ww, ss;
|
||||
int xx, x, ksize, xmin, xmax, xcnt;
|
||||
|
|
|
|||
|
|
@ -641,6 +641,11 @@ struct mtmd_tokenizer {
|
|||
add_text(ctx->img_beg, true); // add image begin token
|
||||
}
|
||||
|
||||
// sanity check
|
||||
GGML_ASSERT(bitmap->nx > 0 && bitmap->ny > 0);
|
||||
GGML_ASSERT(bitmap->data.size() == (size_t)bitmap->nx * bitmap->ny * 3);
|
||||
GGML_ASSERT(ctx->image_preproc != nullptr);
|
||||
|
||||
// convert mtmd_bitmap to clip_image_u8
|
||||
clip_image_u8_ptr img_u8(clip_image_u8_init());
|
||||
img_u8->nx = bitmap->nx;
|
||||
|
|
@ -649,7 +654,6 @@ struct mtmd_tokenizer {
|
|||
std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3);
|
||||
|
||||
// preprocess image
|
||||
GGML_ASSERT(ctx->image_preproc != nullptr);
|
||||
clip_image_f32_batch batch_f32;
|
||||
bool ok = ctx->image_preproc->preprocess(*img_u8, batch_f32);
|
||||
if (!ok) {
|
||||
|
|
@ -773,6 +777,11 @@ struct mtmd_tokenizer {
|
|||
add_text(ctx->aud_beg, true); // add audio begin token
|
||||
}
|
||||
|
||||
// sanity check
|
||||
GGML_ASSERT(ctx->audio_preproc != nullptr);
|
||||
GGML_ASSERT(bitmap->data.size() > sizeof(float));
|
||||
GGML_ASSERT(bitmap->data.size() % sizeof(float) == 0);
|
||||
|
||||
// preprocess audio
|
||||
std::vector<mtmd_audio_mel> mel_spec_chunks;
|
||||
const float * samples = (const float *)bitmap->data.data();
|
||||
|
|
|
|||
Loading…
Reference in New Issue