mtmd: format code
This commit is contained in:
parent
f629d02ee1
commit
5a741fda55
|
|
@ -3156,90 +3156,89 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
case PROJECTOR_TYPE_DEEPSEEKOCR:
|
case PROJECTOR_TYPE_DEEPSEEKOCR:
|
||||||
{
|
|
||||||
const std::vector native_resolutions = {
|
|
||||||
/*512 tiny , 640 small, */ 1024 /* base */, 1280 /* large */
|
|
||||||
};
|
|
||||||
// original image size
|
|
||||||
const int orig_w = original_size.width;
|
|
||||||
const int orig_h = original_size.height;
|
|
||||||
const int orig_area = orig_h * orig_w;
|
|
||||||
std::array<uint8_t, 3u> color;
|
|
||||||
|
|
||||||
for (int i = 0; i < 3; i++) {
|
|
||||||
color[i] = (int)(255 * params.image_mean[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t mode_i = 0;
|
|
||||||
int min_diff = orig_area;
|
|
||||||
|
|
||||||
for (size_t i = 0; i < native_resolutions.size(); i++) {
|
|
||||||
int r = native_resolutions[i];
|
|
||||||
if (std::abs(orig_area - r * r) < min_diff) {
|
|
||||||
mode_i = i;
|
|
||||||
min_diff = std::abs(orig_area - r * r);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Native Resolution (Base/Large) */
|
|
||||||
const int image_size = native_resolutions[mode_i];
|
|
||||||
|
|
||||||
// Resize maintaining aspect ratio, then pad to square
|
|
||||||
float scale = std::min(
|
|
||||||
static_cast<float>(image_size) / orig_w,
|
|
||||||
static_cast<float>(image_size) / orig_h
|
|
||||||
);
|
|
||||||
int new_w = static_cast<int>(orig_w * scale);
|
|
||||||
int new_h = static_cast<int>(orig_h * scale);
|
|
||||||
|
|
||||||
clip_image_u8_ptr scaled_img(clip_image_u8_init());
|
|
||||||
img_tool::resize(*img, *scaled_img, clip_image_size{new_w, new_h},
|
|
||||||
img_tool::RESIZE_ALGO_BICUBIC_PILLOW, true, color);
|
|
||||||
|
|
||||||
// Use mean color for padding
|
|
||||||
unsigned char pad_r = static_cast<unsigned char>(params.image_mean[0] * 255.0f);
|
|
||||||
unsigned char pad_g = static_cast<unsigned char>(params.image_mean[1] * 255.0f);
|
|
||||||
unsigned char pad_b = static_cast<unsigned char>(params.image_mean[2] * 255.0f);
|
|
||||||
|
|
||||||
// Pad to image_size × image_size (center padding)
|
|
||||||
clip_image_u8_ptr padded_img(clip_image_u8_init());
|
|
||||||
padded_img->nx = image_size;
|
|
||||||
padded_img->ny = image_size;
|
|
||||||
padded_img->buf.resize(image_size * image_size * 3); // black padding
|
|
||||||
|
|
||||||
// Fill with mean color
|
|
||||||
for (int i = 0; i < image_size * image_size; ++i)
|
|
||||||
{
|
{
|
||||||
padded_img->buf[i * 3 + 0] = pad_r;
|
const std::vector native_resolutions = {
|
||||||
padded_img->buf[i * 3 + 1] = pad_g;
|
/*512 tiny , 640 small, */ 1024 /* base */, 1280 /* large */
|
||||||
padded_img->buf[i * 3 + 2] = pad_b;
|
};
|
||||||
}
|
// original image size
|
||||||
|
const int orig_w = original_size.width;
|
||||||
|
const int orig_h = original_size.height;
|
||||||
|
const int orig_area = orig_h * orig_w;
|
||||||
|
std::array<uint8_t, 3u> color;
|
||||||
|
|
||||||
// Calculate padding offsets (center the image)
|
for (int i = 0; i < 3; i++) {
|
||||||
int pad_x = (image_size - new_w) / 2;
|
color[i] = (int)(255 * params.image_mean[i]);
|
||||||
int pad_y = (image_size - new_h) / 2;
|
|
||||||
|
|
||||||
// Copy scaled image into padded canvas
|
|
||||||
for (int y = 0; y < new_h; ++y){
|
|
||||||
for (int x = 0; x < new_w; ++x){
|
|
||||||
int src_idx = (y * new_w + x) * 3;
|
|
||||||
int dst_idx = ((y + pad_y) * image_size + (x + pad_x)) * 3;
|
|
||||||
padded_img->buf[dst_idx + 0] = scaled_img->buf[src_idx + 0];
|
|
||||||
padded_img->buf[dst_idx + 1] = scaled_img->buf[src_idx + 1];
|
|
||||||
padded_img->buf[dst_idx + 2] = scaled_img->buf[src_idx + 2];
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
// Normalize and output
|
size_t mode_i = 0;
|
||||||
clip_image_f32_ptr res(clip_image_f32_init());
|
int min_diff = orig_area;
|
||||||
normalize_image_u8_to_f32(*padded_img, *res, params.image_mean, params.image_std);
|
|
||||||
res_imgs->entries.push_back(std::move(res));
|
|
||||||
|
|
||||||
res_imgs->grid_x = 1;
|
for (size_t i = 0; i < native_resolutions.size(); i++) {
|
||||||
res_imgs->grid_y = 1;
|
int r = native_resolutions[i];
|
||||||
}
|
if (std::abs(orig_area - r * r) < min_diff) {
|
||||||
break;
|
mode_i = i;
|
||||||
|
min_diff = std::abs(orig_area - r * r);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Native Resolution (Base/Large) */
|
||||||
|
const int image_size = native_resolutions[mode_i];
|
||||||
|
|
||||||
|
// Resize maintaining aspect ratio, then pad to square
|
||||||
|
float scale = std::min(
|
||||||
|
static_cast<float>(image_size) / orig_w,
|
||||||
|
static_cast<float>(image_size) / orig_h
|
||||||
|
);
|
||||||
|
int new_w = static_cast<int>(orig_w * scale);
|
||||||
|
int new_h = static_cast<int>(orig_h * scale);
|
||||||
|
|
||||||
|
clip_image_u8_ptr scaled_img(clip_image_u8_init());
|
||||||
|
img_tool::resize(*img, *scaled_img, clip_image_size{new_w, new_h},
|
||||||
|
img_tool::RESIZE_ALGO_BICUBIC_PILLOW, true, color);
|
||||||
|
|
||||||
|
// Use mean color for padding
|
||||||
|
unsigned char pad_r = static_cast<unsigned char>(params.image_mean[0] * 255.0f);
|
||||||
|
unsigned char pad_g = static_cast<unsigned char>(params.image_mean[1] * 255.0f);
|
||||||
|
unsigned char pad_b = static_cast<unsigned char>(params.image_mean[2] * 255.0f);
|
||||||
|
|
||||||
|
// Pad to image_size × image_size (center padding)
|
||||||
|
clip_image_u8_ptr padded_img(clip_image_u8_init());
|
||||||
|
padded_img->nx = image_size;
|
||||||
|
padded_img->ny = image_size;
|
||||||
|
padded_img->buf.resize(image_size * image_size * 3); // black padding
|
||||||
|
|
||||||
|
// Fill with mean color
|
||||||
|
for (int i = 0; i < image_size * image_size; ++i)
|
||||||
|
{
|
||||||
|
padded_img->buf[i * 3 + 0] = pad_r;
|
||||||
|
padded_img->buf[i * 3 + 1] = pad_g;
|
||||||
|
padded_img->buf[i * 3 + 2] = pad_b;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate padding offsets (center the image)
|
||||||
|
int pad_x = (image_size - new_w) / 2;
|
||||||
|
int pad_y = (image_size - new_h) / 2;
|
||||||
|
|
||||||
|
// Copy scaled image into padded canvas
|
||||||
|
for (int y = 0; y < new_h; ++y){
|
||||||
|
for (int x = 0; x < new_w; ++x){
|
||||||
|
int src_idx = (y * new_w + x) * 3;
|
||||||
|
int dst_idx = ((y + pad_y) * image_size + (x + pad_x)) * 3;
|
||||||
|
padded_img->buf[dst_idx + 0] = scaled_img->buf[src_idx + 0];
|
||||||
|
padded_img->buf[dst_idx + 1] = scaled_img->buf[src_idx + 1];
|
||||||
|
padded_img->buf[dst_idx + 2] = scaled_img->buf[src_idx + 2];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Normalize and output
|
||||||
|
clip_image_f32_ptr res(clip_image_f32_init());
|
||||||
|
normalize_image_u8_to_f32(*padded_img, *res, params.image_mean, params.image_std);
|
||||||
|
res_imgs->entries.push_back(std::move(res));
|
||||||
|
|
||||||
|
res_imgs->grid_x = 1;
|
||||||
|
res_imgs->grid_y = 1;
|
||||||
|
} break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
LOG_ERR("%s: unsupported projector type %d\n", __func__, ctx->proj_type());
|
LOG_ERR("%s: unsupported projector type %d\n", __func__, ctx->proj_type());
|
||||||
|
|
|
||||||
|
|
@ -89,9 +89,8 @@ static ggml_tensor * get_rel_pos(ggml_context * ctx0,
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_cgraph * clip_graph_deepseekocr::build() {
|
ggml_cgraph * clip_graph_deepseekocr::build() {
|
||||||
//patch embedding
|
// patch embedding
|
||||||
ggml_tensor * inp_raw = build_inp_raw();
|
ggml_tensor * inp_raw = build_inp_raw();
|
||||||
//ggml_tensor * sam_out = build_sam(inp_raw);
|
|
||||||
|
|
||||||
ggml_tensor * sam_out;
|
ggml_tensor * sam_out;
|
||||||
// Building SAM
|
// Building SAM
|
||||||
|
|
@ -247,7 +246,7 @@ ggml_cgraph * clip_graph_deepseekocr::build() {
|
||||||
ggml_build_forward_expand(gf, cur);
|
ggml_build_forward_expand(gf, cur);
|
||||||
sam_out = cur;
|
sam_out = cur;
|
||||||
}
|
}
|
||||||
//ggml_tensor * clip_out = build_dsocr_clip(sam_out);
|
|
||||||
ggml_tensor * clip_out;
|
ggml_tensor * clip_out;
|
||||||
// Building DS-OCR CLIP
|
// Building DS-OCR CLIP
|
||||||
{
|
{
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue