From 5174d7206fc743efa4da0d02e8668903a3eb8429 Mon Sep 17 00:00:00 2001 From: Aleksander Grygier Date: Fri, 13 Feb 2026 12:31:00 +0100 Subject: [PATCH 1/7] webui: UI and routing fixes (#19586) * chore: update webui build output * chore: update webui build output * fix: Scroll issues in DropdownMenuSearchable * webui: fix redirect to root ignoring base path * fix: Word wrapping * fix: remove obsolete modality UI tests causing CI failures - Remove VisionModality/AudioModality test stories - Remove mockServerProps usage and imports - Simplify Default test (remove dropdown interaction checks) - Simplify FileAttachments test (remove mocks) * feat: Improve formatting performance time --------- Co-authored-by: Pascal --- .../app/chat/ChatMessages/ChatMessage.svelte | 5 +- .../ChatMessages/ChatMessageStatistics.svelte | 15 ++-- .../app/misc/DropdownMenuSearchable.svelte | 88 +++++++++++++++++++ .../app/misc/MarkdownContent.svelte | 2 + .../server/webui/src/lib/utils/formatters.ts | 72 +++++++++++++++ .../tests/stories/ChatForm.stories.svelte | 67 -------------- 6 files changed, 173 insertions(+), 76 deletions(-) create mode 100644 tools/server/webui/src/lib/components/app/misc/DropdownMenuSearchable.svelte diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte index 82ef7de7c7..3470e2f711 100644 --- a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte +++ b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte @@ -1,5 +1,6 @@ + + + { + e.preventDefault(); + e.stopPropagation(); + }} + > + {@render trigger()} + + + +
+ +
+ +
+ {@render children()} + + {#if isEmpty} +
{emptyMessage}
+ {/if} +
+ + {#if footer} + + + {@render footer()} + {/if} +
+
diff --git a/tools/server/webui/src/lib/components/app/misc/MarkdownContent.svelte b/tools/server/webui/src/lib/components/app/misc/MarkdownContent.svelte index cb3ae17a63..0084499f85 100644 --- a/tools/server/webui/src/lib/components/app/misc/MarkdownContent.svelte +++ b/tools/server/webui/src/lib/components/app/misc/MarkdownContent.svelte @@ -486,6 +486,8 @@ text-decoration: underline; text-underline-offset: 2px; transition: color 0.2s ease; + overflow-wrap: anywhere; + word-break: break-all; } div :global(a:hover) { diff --git a/tools/server/webui/src/lib/utils/formatters.ts b/tools/server/webui/src/lib/utils/formatters.ts index ae9f59a39c..bdf2ca26fd 100644 --- a/tools/server/webui/src/lib/utils/formatters.ts +++ b/tools/server/webui/src/lib/utils/formatters.ts @@ -51,3 +51,75 @@ export function formatNumber(num: number | unknown): string { return num.toLocaleString(); } + +/** + * Format JSON string with pretty printing (2-space indentation) + * Returns original string if parsing fails + * + * @param jsonString - JSON string to format + * @returns Pretty-printed JSON string or original if invalid + */ +export function formatJsonPretty(jsonString: string): string { + try { + const parsed = JSON.parse(jsonString); + return JSON.stringify(parsed, null, 2); + } catch { + return jsonString; + } +} + +/** + * Format time as HH:MM:SS in 24-hour format + * + * @param date - Date object to format + * @returns Formatted time string (HH:MM:SS) + */ +export function formatTime(date: Date): string { + return date.toLocaleTimeString('en-US', { + hour12: false, + hour: '2-digit', + minute: '2-digit', + second: '2-digit' + }); +} + +/** + * Formats milliseconds to a human-readable time string for performance metrics. + * Examples: "4h 12min 54s", "12min 34s", "45s", "0.5s" + * + * @param ms - Time in milliseconds + * @returns Formatted time string + */ +export function formatPerformanceTime(ms: number): string { + if (ms < 0) return '0s'; + + const totalSeconds = ms / 1000; + + if (totalSeconds < 1) { + return `${totalSeconds.toFixed(1)}s`; + } + + if (totalSeconds < 10) { + return `${totalSeconds.toFixed(1)}s`; + } + + const hours = Math.floor(totalSeconds / 3600); + const minutes = Math.floor((totalSeconds % 3600) / 60); + const seconds = Math.floor(totalSeconds % 60); + + const parts: string[] = []; + + if (hours > 0) { + parts.push(`${hours}h`); + } + + if (minutes > 0) { + parts.push(`${minutes}min`); + } + + if (seconds > 0 || parts.length === 0) { + parts.push(`${seconds}s`); + } + + return parts.join(' '); +} diff --git a/tools/server/webui/tests/stories/ChatForm.stories.svelte b/tools/server/webui/tests/stories/ChatForm.stories.svelte index 18319e8e61..a8a4c21b44 100644 --- a/tools/server/webui/tests/stories/ChatForm.stories.svelte +++ b/tools/server/webui/tests/stories/ChatForm.stories.svelte @@ -2,7 +2,6 @@ import { defineMeta } from '@storybook/addon-svelte-csf'; import ChatForm from '$lib/components/app/chat/ChatForm/ChatForm.svelte'; import { expect } from 'storybook/test'; - import { mockServerProps, mockConfigs } from './fixtures/storybook-mocks'; import jpgAsset from './fixtures/assets/1.jpg?url'; import svgAsset from './fixtures/assets/hf-logo.svg?url'; import pdfAsset from './fixtures/assets/example.pdf?raw'; @@ -46,8 +45,6 @@ name="Default" args={{ class: 'max-w-[56rem] w-[calc(100vw-2rem)]' }} play={async ({ canvas, userEvent }) => { - mockServerProps(mockConfigs.noModalities); - const textarea = await canvas.findByRole('textbox'); const submitButton = await 
canvas.findByRole('button', { name: 'Send' }); @@ -66,73 +63,11 @@ const fileInput = document.querySelector('input[type="file"]'); await expect(fileInput).not.toHaveAttribute('accept'); - - // Open file attachments dropdown - const fileUploadButton = canvas.getByText('Attach files'); - await userEvent.click(fileUploadButton); - - // Check dropdown menu items are disabled (no modalities) - const imagesButton = document.querySelector('.images-button'); - const audioButton = document.querySelector('.audio-button'); - - await expect(imagesButton).toHaveAttribute('data-disabled'); - await expect(audioButton).toHaveAttribute('data-disabled'); - - // Close dropdown by pressing Escape - await userEvent.keyboard('{Escape}'); }} /> - { - mockServerProps(mockConfigs.visionOnly); - - // Open file attachments dropdown and verify it works - const fileUploadButton = canvas.getByText('Attach files'); - await userEvent.click(fileUploadButton); - - // Verify dropdown menu items exist - const imagesButton = document.querySelector('.images-button'); - const audioButton = document.querySelector('.audio-button'); - - await expect(imagesButton).toBeInTheDocument(); - await expect(audioButton).toBeInTheDocument(); - - // Close dropdown by pressing Escape - await userEvent.keyboard('{Escape}'); - - console.log('✅ Vision modality: Dropdown menu verified'); - }} -/> - - { - mockServerProps(mockConfigs.audioOnly); - - // Open file attachments dropdown and verify it works - const fileUploadButton = canvas.getByText('Attach files'); - await userEvent.click(fileUploadButton); - - // Verify dropdown menu items exist - const imagesButton = document.querySelector('.images-button'); - const audioButton = document.querySelector('.audio-button'); - - await expect(imagesButton).toBeInTheDocument(); - await expect(audioButton).toBeInTheDocument(); - - // Close dropdown by pressing Escape - await userEvent.keyboard('{Escape}'); - - console.log('✅ Audio modality: Dropdown menu verified'); - }} -/> - { - mockServerProps(mockConfigs.bothModalities); - const jpgAttachment = canvas.getByAltText('1.jpg'); const svgAttachment = canvas.getByAltText('hf-logo.svg'); const pdfFileExtension = canvas.getByText('PDF'); From 5065da554e8e78e7f2dff35dede259c8dd56c9f3 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Fri, 13 Feb 2026 17:01:40 +0530 Subject: [PATCH 2/7] CUDA: loop over ne2*ne3 in case it overflows (#19538) * CUDA: loop over ne2*ne3 in case it overflows * use fastdiv --- ggml/src/ggml-cuda/convert.cu | 62 +++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 24 deletions(-) diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu index ba3d4eeb88..09b6d5db6a 100644 --- a/ggml/src/ggml-cuda/convert.cu +++ b/ggml/src/ggml-cuda/convert.cu @@ -7,7 +7,8 @@ template static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, - const int64_t ne00, const int64_t ne01, const int64_t ne02, + const int64_t ne00, const int64_t ne01, + const int64_t ne0203, const uint3 ne02, const int64_t s01, const int64_t s02, const int64_t s03) { const int64_t i00 = 2 * (int64_t(blockDim.x)*blockIdx.x + threadIdx.x); @@ -16,23 +17,27 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __ } const int64_t i01 = blockIdx.y; - const int64_t i02 = blockIdx.z % ne02; - const int64_t i03 = blockIdx.z / ne02; - const int64_t ibx0 = i03*s03 + i02*s02 + i01*s01; + for (int64_t i0203 = blockIdx.z; i0203 < ne0203; i0203 += gridDim.z) { + const uint2 dm = 
fast_div_modulo((uint32_t)i0203, ne02); + const int64_t i02 = dm.y; + const int64_t i03 = dm.x; - const int64_t ib = ibx0 + i00/qk; // block index - const int64_t iqs = (i00%qk)/qr; // quant index - const int64_t iybs = i00 - i00%qk; // y block start index - const int64_t y_offset = qr == 1 ? 1 : qk/2; + const int64_t ibx0 = i03*s03 + i02*s02 + i01*s01; - // dequantize - float2 v; - dequantize_kernel(vx, ib, iqs, v); + const int64_t ib = ibx0 + i00/qk; // block index + const int64_t iqs = (i00%qk)/qr; // quant index + const int64_t iybs = i00 - i00%qk; // y block start index + const int64_t y_offset = qr == 1 ? 1 : qk/2; - const int64_t iy0 = ((i03*ne02 + i02)*ne01 + i01)*ne00 + iybs + iqs; - y[iy0 + 0] = ggml_cuda_cast(v.x); - y[iy0 + y_offset] = ggml_cuda_cast(v.y); + // dequantize + float2 v; + dequantize_kernel(vx, ib, iqs, v); + + const int64_t iy0 = (i0203*ne01 + i01)*ne00 + iybs + iqs; + y[iy0 + 0] = ggml_cuda_cast(v.x); + y[iy0 + y_offset] = ggml_cuda_cast(v.y); + } } template @@ -485,9 +490,11 @@ template static void dequantize_block_cuda(const void * vx, dst_t * y, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03, const int64_t s01, const int64_t s02, const int64_t s03, cudaStream_t stream) { - const dim3 num_blocks((ne00 + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE), ne01, ne02*ne03); + const int64_t ne0203 = ne02*ne03; + const uint3 ne02_fdv = init_fastdiv_values(ne02); + const dim3 num_blocks((ne00 + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE), ne01, (int)std::min(ne0203, (int64_t)65535)); dequantize_block<<>> - (vx, y, ne00, ne01, ne02, s01, s02, s03); + (vx, y, ne00, ne01, ne0203, ne02_fdv, s01, s02, s03); } template @@ -612,7 +619,8 @@ static void dequantize_row_mxfp4_cuda(const void * vx, dst_t * y, const int64_t template static __global__ void convert_unary( - const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01, const int64_t ne02, + const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01, + const int64_t ne0203, const uint3 ne02, const int64_t s01, const int64_t s02, const int64_t s03) { const int64_t i00 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; @@ -621,23 +629,29 @@ static __global__ void convert_unary( } const int64_t i01 = blockIdx.y; - const int64_t i02 = blockIdx.z % ne02; - const int64_t i03 = blockIdx.z / ne02; const src_t * x = (const src_t *) vx; - const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00; - const int64_t iy = ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00; - y[iy] = ggml_cuda_cast(x[ix]); + for (int64_t i0203 = blockIdx.z; i0203 < ne0203; i0203 += gridDim.z) { + const uint2 dm = fast_div_modulo((uint32_t)i0203, ne02); + const int64_t i02 = dm.y; + const int64_t i03 = dm.x; + + const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00; + const int64_t iy = (i0203*ne01 + i01)*ne00 + i00; + y[iy] = ggml_cuda_cast(x[ix]); + } } template static void convert_unary_cuda(const void * vx, dst_t * y, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03, const int64_t s01, const int64_t s02, const int64_t s03, cudaStream_t stream) { - const dim3 num_blocks((ne00 + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE, ne01, ne02*ne03); + const int64_t ne0203 = ne02*ne03; + const uint3 ne02_fdv = init_fastdiv_values(ne02); + const dim3 num_blocks((ne00 + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE, ne01, (int)std::min(ne0203, (int64_t)65535)); convert_unary<<>> - 
(vx, y, ne00, ne01, ne02, s01, s02, s03); + (vx, y, ne00, ne01, ne0203, ne02_fdv, s01, s02, s03); } template From b2ecc0cdb4cf724b78c770cbbacb8029168d26fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Fri, 13 Feb 2026 12:49:10 +0100 Subject: [PATCH 3/7] support --verbose-prompt (#19576) --- tools/cli/cli.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 02ccb72598..ad421e6326 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -52,6 +52,7 @@ struct cli_context { json messages = json::array(); std::vector input_files; task_params defaults; + bool verbose_prompt; // thread for showing "loading" animation std::atomic loading_show; @@ -66,6 +67,8 @@ struct cli_context { defaults.stream = true; // make sure we always use streaming mode defaults.timings_per_token = true; // in order to get timings even when we cancel mid-way // defaults.return_progress = true; // TODO: show progress + + verbose_prompt = params.verbose_prompt; } std::string generate_completion(result_timings & out_timings) { @@ -91,6 +94,12 @@ struct cli_context { rd.post_task({std::move(task)}); } + if (verbose_prompt) { + console::set_display(DISPLAY_TYPE_PROMPT); + console::log("%s\n\n", chat_params.prompt.c_str()); + console::set_display(DISPLAY_TYPE_RESET); + } + // wait for first result console::spinner::start(); server_task_result_ptr result = rd.next(should_stop); From 0e219914723695e5e0a1d470c6dbe44e304a8270 Mon Sep 17 00:00:00 2001 From: ymcki <84055651+ymcki@users.noreply.github.com> Date: Fri, 13 Feb 2026 20:31:37 +0800 Subject: [PATCH 4/7] fix vulkan ggml_acc only works in 3d but not 4d (#19426) * fix vulkan ggml_acc only works in 3d but not 4d * removed clamp in test_acc_block * use the correct stride and its test case * cuda : fix "supports op" condition * change src0 to src1 in ggml_vk_acc. 
Update acc.comp with jeffbolznv\'s suggestion except to keep the boundary check * version without boundary check * revert back to boundary check version --------- Co-authored-by: Georgi Gerganov --- ggml/src/ggml-cuda/ggml-cuda.cu | 5 +- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 12 ++--- ggml/src/ggml-vulkan/vulkan-shaders/acc.comp | 17 +++---- tests/test-backend-ops.cpp | 49 ++++++++++++++++---- 4 files changed, 60 insertions(+), 23 deletions(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 7dc688483a..85ce96958f 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -4822,8 +4822,11 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_CONV_2D_DW: case GGML_OP_CONV_TRANSPOSE_2D: case GGML_OP_POOL_2D: - case GGML_OP_ACC: return true; + case GGML_OP_ACC: + // TODO: extend support like so: + //return ggml_is_contiguous_rows(op->src[0]) && ggml_is_contiguous_rows(op->src[1]); + return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]); case GGML_OP_SUM: return ggml_is_contiguous_rows(op->src[0]); case GGML_OP_TOP_K: diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 72097ffd0f..e5dcd3cbda 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -9801,16 +9801,16 @@ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); - int nb1 = dst->op_params[0] / 4; // 4 bytes of float32 - int nb2 = dst->op_params[1] / 4; // 4 bytes of float32 - // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused - int offset = dst->op_params[3] / 4; // offset in bytes + int nb1 = dst->op_params[0] / src0_type_size; // 4 bytes of float32 + int nb2 = dst->op_params[1] / src0_type_size; // 4 bytes of float32 + int nb3 = dst->op_params[2] / src0_type_size; // 4 bytes of float32 + int offset = dst->op_params[3] / src0_type_size; // offset in bytes ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_ACC, { (uint32_t)ggml_nelements(src0), - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)nb3, (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t) dst->nb[3] / dst_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)nb3, 0, 0.0f, 0.0f, offset, }); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp b/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp index 5084a70ed4..3d61168b56 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp @@ -13,17 +13,18 @@ void main() { const uint offset = 
p.param3; const uint src1_i = idx - offset; - const uint oz = src1_i / p.nb02; - const uint oy = (src1_i - (oz * p.nb02)) / p.nb01; - const uint ox = src1_i % p.nb01; + const uint i3 = src1_i / p.nb03; + const uint rem2 = src1_i - i3 * p.nb03; + const uint i2 = rem2 / p.nb02; + const uint rem1 = rem2 - i2 * p.nb02; + const uint i1 = rem1 / p.nb01; + const uint i0 = rem1 % p.nb01; uint i00, i01, i02, i03; - get_indices(idx, i00, i01, i02, i03); - if (ox < p.ne10 && oy < p.ne11 && oz < p.ne12) { - data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[get_boffset() + ox + oy * p.ne10 + oz * p.ne10 * p.ne11])); + if (i0 < p.ne10 && i1 < p.ne11 && i2 < p.ne12 && i3 < p.ne13) { + data_d[get_doffset() + idx] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + idx]) + FLOAT_TYPE(data_b[get_boffset() + src1_idx(i0, i1, i2, i3)])); } else { - data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)])); + data_d[get_doffset() + idx] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + idx])); } } - diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 8816f6963f..a50c569b82 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -5847,26 +5847,46 @@ struct test_acc : public test_case { const ggml_type type; const std::array ne_a; const std::array ne_b; + const int64_t stride_dim; std::string vars() override { - return VARS_TO_STR3(type, ne_a, ne_b); + return VARS_TO_STR4(type, ne_a, ne_b, stride_dim); } test_acc(ggml_type type = GGML_TYPE_F32, - std::array ne_a = {256, 17, 1, 1}, - std::array ne_b = {256, 16, 1, 1}) - : type(type), ne_a(ne_a), ne_b(ne_b) {} + std::array ne_a = {256, 17, 2, 3}, + std::array ne_b = {256, 16, 2, 3}, + uint64_t stride_dim = -1) + : type(type), ne_a(ne_a), ne_b(ne_b), stride_dim(stride_dim) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data()); ggml_set_param(a); ggml_set_name(a, "a"); - ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data()); - ggml_set_param(b); + ggml_tensor * b; + if (stride_dim == 1 || stride_dim == 2 || stride_dim == 3) { + // Create a larger tensor and take a view at a non-zero offset. + // This tests that the backend correctly handles b's data offset + std::array ne_b_pad = {ne_b[0], ne_b[1], ne_b[2], ne_b[3]}; + ne_b_pad[stride_dim] += 1; + ggml_tensor * b_pad = ggml_new_tensor(ctx, type, 4, ne_b_pad.data()); + ggml_set_param(b_pad); + ggml_set_name(b_pad, "b_pad"); + // View that skips the first row, so b has a non-zero byte offset + b = ggml_view_4d(ctx, b_pad, + ne_b[0], ne_b[1], ne_b[2], ne_b[3], + b_pad->nb[1], b_pad->nb[2], b_pad->nb[3], + b_pad->nb[1]); + } else { + b = ggml_new_tensor(ctx, type, 4, ne_b.data()); + ggml_set_param(b); + } ggml_set_name(b, "b"); - ggml_tensor * out = ggml_acc(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], b->nb[1]); + // When ne_b[0] < ne_a[0], a->nb[1] != b->nb[1], so the stride + // parameters to ggml_acc don't match b's natural stride. 
+ ggml_tensor * out = ggml_acc(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], 0); ggml_set_name(out, "out"); return out; @@ -8170,7 +8190,12 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {9, 9, 1280, 1})); test_cases.emplace_back(new test_group_norm_mul_add(GGML_TYPE_F32, {64, 64, 320, 1})); test_cases.emplace_back(new test_group_norm_mul_add(GGML_TYPE_F32, {9, 9, 1280, 1})); - test_cases.emplace_back(new test_acc()); + test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 1, 1}, {256, 16, 1, 1}, -1)); + test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 2, 3}, {256, 16, 2, 3}, -1)); + test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 2, 3}, {128, 16, 2, 3}, -1)); + test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 2, 3}, {256, 16, 2, 3}, 1)); + test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 2, 3}, {128, 16, 2, 3}, 2)); + test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 2, 3}, {64, 16, 2, 3}, 3)); test_cases.emplace_back(new test_pad()); test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {33, 17, 2, 1}, 4, 3, true)); // circular test_cases.emplace_back(new test_pad_ext()); @@ -8605,6 +8630,14 @@ static std::vector> make_test_cases_perf() { test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 128, 64, 48, 1, 512, 1)); // prefill test_cases.emplace_back(new test_ssm_scan(GGML_TYPE_F32, 128, 64, 48, 1, 1, 1)); // generate + // acc + test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 1, 1}, {256, 16, 1, 1}, -1)); + test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 2, 3}, {256, 16, 2, 3}, -1)); + test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 2, 3}, {128, 16, 2, 3}, -1)); + test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 2, 3}, {256, 16, 2, 3}, 1)); + test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 2, 3}, {128, 16, 2, 3}, 2)); + test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 2, 3}, {64, 16, 2, 3}, 3)); + return test_cases; } From cc2aa81513ff8bbf8f0527ce36b9366971d5fd35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= <1478977+Alcpz@users.noreply.github.com> Date: Fri, 13 Feb 2026 12:32:14 +0000 Subject: [PATCH 5/7] Fix wrong memcpy length for block_interleave == 4 (#19575) --- ggml/src/ggml-cpu/repack.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index 4cb7cdeb07..f94426ddd7 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -1916,9 +1916,10 @@ static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_in int src_offset = (i / 8) * blck_size_interleave; int dst_offset = i * blck_size_interleave; + // buffer large enough for the max interleave block size (8 bytes) uint64_t elems; - memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t)); - memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t)); + memcpy(&elems, &in[src_id].qs[src_offset], blck_size_interleave); + memcpy(&out.qs[dst_offset], &elems, blck_size_interleave); } // The below logic is designed so as to unpack and rearrange scales and mins values in Q4_K From 752584d5f5f30bff8f24a765f858645a9ef7d5d9 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Fri, 13 Feb 2026 14:56:53 +0100 Subject: [PATCH 6/7] model: support GLM MoE DSA arch (NOTE: indexer is not yet supported) (#19460) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 
8bit * model: support GLM MoE DSA arch * working version * pyright * keep indexer tensors * add indexer gguf params * loaded now * Apply suggestions from code review Co-authored-by: Sigbjørn Skjæret * update * Update src/llama-model.cpp Co-authored-by: Sigbjørn Skjæret * minor fix and cleanup --------- Co-authored-by: Sigbjørn Skjæret --- convert_hf_to_gguf.py | 92 +++++++++++--------- gguf-py/gguf/constants.py | 56 ++++++++++++ gguf-py/gguf/gguf_writer.py | 9 ++ gguf-py/gguf/tensor_mapping.py | 16 ++++ src/llama-arch.cpp | 52 ++++++++++++ src/llama-arch.h | 8 ++ src/llama-hparams.h | 5 ++ src/llama-model.cpp | 151 ++++++++++++++++++++++++++++++++- src/llama-model.h | 8 ++ src/models/deepseek2.cpp | 5 +- 10 files changed, 361 insertions(+), 41 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 2afaf85fb8..825080b588 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1608,6 +1608,23 @@ class TextModel(ModelBase): special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"]) special_vocab.add_to_gguf(self.gguf_writer) + def _set_vocab_glm(self): + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + # Special tokens + # Note: Using <|endoftext|> (151329) for eot causes endless generation + special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"]) # 151331 + special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # 151336 + special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # 151329 + special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"]) # 151338 + special_vocab.add_to_gguf(self.gguf_writer) + def _set_vocab_interns1(self): tokens: list[str] = [] toktypes: list[int] = [] @@ -7710,6 +7727,9 @@ class DeepseekModel(TextModel): class DeepseekV2Model(TextModel): model_arch = gguf.MODEL_ARCH.DEEPSEEK2 + # TODO @ngxson : remove this when we support MTP for deepseek models + skip_mtp = True + def set_vocab(self): try: self._set_vocab_gpt2() @@ -7841,10 +7861,11 @@ class DeepseekV2Model(TextModel): name = name.replace("e_score_correction_bias", "e_score_correction.bias") # skip Multi-Token Prediction (MTP) layers - block_count = self.hparams["num_hidden_layers"] - match = re.match(r"model.layers.(\d+)", name) - if match and int(match.group(1)) >= block_count: - return + if self.skip_mtp: + block_count = self.hparams["num_hidden_layers"] + match = re.match(r"model.layers.(\d+)", name) + if match and int(match.group(1)) >= block_count: + return # process the experts separately if name.find("mlp.experts") != -1: @@ -8684,24 +8705,7 @@ class Glm4MoeModel(TextModel): self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) def set_vocab(self): - from transformers import AutoTokenizer - - tokenizer = AutoTokenizer.from_pretrained(self.dir_model) - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - tokens, toktypes, tokpre = self.get_vocab_base() - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - # Special tokens - # 
Note: Using <|endoftext|> (151329) for eot causes endless generation - special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"]) # 151331 - special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # 151336 - special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # 151329 - special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"]) # 151338 - - special_vocab.add_to_gguf(self.gguf_writer) + return self._set_vocab_glm() def set_gguf_parameters(self): super().set_gguf_parameters() @@ -8801,26 +8805,38 @@ class Glm4MoeModel(TextModel): class Glm4MoeLiteModel(DeepseekV2Model): model_arch = gguf.MODEL_ARCH.DEEPSEEK2 - # copied from Glm4MoeModel def set_vocab(self): - from transformers import AutoTokenizer + return self._set_vocab_glm() - tokenizer = AutoTokenizer.from_pretrained(self.dir_model) - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - tokens, toktypes, tokpre = self.get_vocab_base() - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - # Special tokens - # Note: Using <|endoftext|> (151329) for eot causes endless generation - special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"]) # 151331 - special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # 151336 - special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # 151329 - special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"]) # 151338 +@ModelBase.register("GlmMoeDsaForCausalLM") +class GlmMoeDsaModel(DeepseekV2Model): + model_arch = gguf.MODEL_ARCH.GLM_DSA + skip_mtp = False - special_vocab.add_to_gguf(self.gguf_writer) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0) + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + + def set_vocab(self): + return self._set_vocab_glm() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + rope_dim = self.hparams["qk_rope_head_dim"] + partial_rotary_factor = self.hparams.get("partial_rotary_factor", 1.0) + self.gguf_writer.add_rope_dimension_count(int(rope_dim * partial_rotary_factor)) + + # NextN/MTP prediction layers + if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None: + self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers) + + # DSA indexer parameters + self.gguf_writer.add_indexer_head_count(self.hparams["index_n_heads"]) + self.gguf_writer.add_indexer_key_length(self.hparams["index_head_dim"]) + self.gguf_writer.add_indexer_top_k(self.hparams["index_topk"]) @ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration") diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 9dab0df08a..eb8770af06 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -181,6 +181,11 @@ class Keys: SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern" TEMPERATURE_SCALE = "{arch}.attention.temperature_scale" + class Indexer: + HEAD_COUNT = "{arch}.attention.indexer.head_count" + KEY_LENGTH = "{arch}.attention.indexer.key_length" + TOP_K = "{arch}.attention.indexer.top_k" + class Rope: DIMENSION_COUNT = "{arch}.rope.dimension_count" DIMENSION_SECTIONS = 
"{arch}.rope.dimension_sections" @@ -425,6 +430,7 @@ class MODEL_ARCH(IntEnum): CHATGLM = auto() GLM4 = auto() GLM4_MOE = auto() + GLM_DSA = auto() BITNET = auto() T5 = auto() T5ENCODER = auto() @@ -670,6 +676,10 @@ class MODEL_TENSOR(IntEnum): VISEXP_GATE = auto() VISEXP_DOWN = auto() VISEXP_UP = auto() + INDEXER_K_NORM = auto() + INDEXER_PROJ = auto() + INDEXER_ATTN_K = auto() + INDEXER_ATTN_Q_B = auto() # vision V_MMPROJ = auto() V_MMPROJ_FC = auto() @@ -858,6 +868,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.CHATGLM: "chatglm", MODEL_ARCH.GLM4: "glm4", MODEL_ARCH.GLM4_MOE: "glm4moe", + MODEL_ARCH.GLM_DSA: "glm-dsa", MODEL_ARCH.BITNET: "bitnet", MODEL_ARCH.T5: "t5", MODEL_ARCH.T5ENCODER: "t5encoder", @@ -1101,6 +1112,10 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.VISEXP_GATE: "blk.{bid}.vis_gate", MODEL_TENSOR.VISEXP_DOWN: "blk.{bid}.vis_down", MODEL_TENSOR.VISEXP_UP: "blk.{bid}.vis_up", + MODEL_TENSOR.INDEXER_K_NORM: "blk.{bid}.indexer.k_norm", + MODEL_TENSOR.INDEXER_PROJ: "blk.{bid}.indexer.proj", + MODEL_TENSOR.INDEXER_ATTN_K: "blk.{bid}.indexer.attn_k", + MODEL_TENSOR.INDEXER_ATTN_Q_B: "blk.{bid}.indexer.attn_q_b", # vision MODEL_TENSOR.V_MMPROJ: "mm.{bid}", MODEL_TENSOR.V_MMPROJ_FC: "mm.model.fc", @@ -2677,6 +2692,47 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD, MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM, ], + MODEL_ARCH.GLM_DSA: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_A, + MODEL_TENSOR.ATTN_Q_B, + MODEL_TENSOR.ATTN_KV_A_MQA, + MODEL_TENSOR.ATTN_KV_B, + MODEL_TENSOR.ATTN_K_B, + MODEL_TENSOR.ATTN_V_B, + MODEL_TENSOR.ATTN_Q_A_NORM, + MODEL_TENSOR.ATTN_KV_A_NORM, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_GATE_SHEXP, + MODEL_TENSOR.FFN_DOWN_SHEXP, + MODEL_TENSOR.FFN_UP_SHEXP, + MODEL_TENSOR.FFN_EXP_PROBS_B, + MODEL_TENSOR.INDEXER_K_NORM, + MODEL_TENSOR.INDEXER_PROJ, + MODEL_TENSOR.INDEXER_ATTN_K, + MODEL_TENSOR.INDEXER_ATTN_Q_B, + # NextN/MTP tensors - preserved but unused + MODEL_TENSOR.NEXTN_EH_PROJ, + MODEL_TENSOR.NEXTN_EMBED_TOKENS, + MODEL_TENSOR.NEXTN_ENORM, + MODEL_TENSOR.NEXTN_HNORM, + MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD, + MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM, + ], MODEL_ARCH.BITNET: [ MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_K, diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index a237537c8d..4245d18bc4 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -771,6 +771,15 @@ class GGUFWriter: def add_value_length_mla(self, length: int) -> None: self.add_uint32(Keys.Attention.VALUE_LENGTH_MLA.format(arch=self.arch), length) + def add_indexer_head_count(self, count: int) -> None: + self.add_uint32(Keys.Attention.Indexer.HEAD_COUNT.format(arch=self.arch), count) + + def add_indexer_key_length(self, length: int) -> None: + self.add_uint32(Keys.Attention.Indexer.KEY_LENGTH.format(arch=self.arch), length) + + def add_indexer_top_k(self, top_k: int) -> None: + self.add_uint32(Keys.Attention.Indexer.TOP_K.format(arch=self.arch), top_k) + def add_max_alibi_bias(self, bias: float) -> None: self.add_float32(Keys.Attention.MAX_ALIBI_BIAS.format(arch=self.arch), bias) diff --git 
a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 43647904b4..c1538b3ff3 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1206,6 +1206,22 @@ class TensorNameMap: "model.layers.{bid}.self_attn.vision_expert_query_key_value", # cogvlm ), + MODEL_TENSOR.INDEXER_K_NORM: ( + "model.layers.{bid}.self_attn.indexer.k_norm", # DSA + ), + + MODEL_TENSOR.INDEXER_PROJ: ( + "model.layers.{bid}.self_attn.indexer.weights_proj", # DSA + ), + + MODEL_TENSOR.INDEXER_ATTN_K: ( + "model.layers.{bid}.self_attn.indexer.wk", # DSA + ), + + MODEL_TENSOR.INDEXER_ATTN_Q_B: ( + "model.layers.{bid}.self_attn.indexer.wq_b", # DSA + ), + ############################################################################ # TODO: these do not belong to block_mappings_cfg - move them to mappings_cfg MODEL_TENSOR.ENC_OUTPUT_NORM: ( diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index a943d40dc4..416c17463e 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -74,6 +74,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_CHATGLM, "chatglm" }, { LLM_ARCH_GLM4, "glm4" }, { LLM_ARCH_GLM4_MOE, "glm4moe" }, + { LLM_ARCH_GLM_DSA, "glm-dsa" }, { LLM_ARCH_BITNET, "bitnet" }, { LLM_ARCH_T5, "t5" }, { LLM_ARCH_T5ENCODER, "t5encoder" }, @@ -225,6 +226,9 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_TEMPERATURE_SCALE, "%s.attention.temperature_scale" }, { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" }, { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" }, + { LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, "%s.attention.indexer.head_count" }, + { LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, "%s.attention.indexer.key_length" }, + { LLM_KV_ATTENTION_INDEXER_TOP_K, "%s.attention.indexer.top_k" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" }, @@ -516,6 +520,10 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_VISEXP_FFN_GATE, "blk.%d.vis_gate" }, { LLM_TENSOR_VISEXP_FFN_DOWN, "blk.%d.vis_down" }, { LLM_TENSOR_VISEXP_FFN_UP, "blk.%d.vis_up" }, + { LLM_TENSOR_INDEXER_K_NORM, "blk.%d.indexer.k_norm" }, + { LLM_TENSOR_INDEXER_PROJ, "blk.%d.indexer.proj" }, + { LLM_TENSOR_INDEXER_ATTN_K, "blk.%d.indexer.attn_k" }, + { LLM_TENSOR_INDEXER_ATTN_Q_B, "blk.%d.indexer.attn_q_b" }, }; static std::set llm_get_tensor_names(llm_arch arch) { @@ -1657,6 +1665,46 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, }; + case LLM_ARCH_GLM_DSA: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q_A_NORM, + LLM_TENSOR_ATTN_KV_A_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_Q_A, + LLM_TENSOR_ATTN_Q_B, + LLM_TENSOR_ATTN_KV_A_MQA, + LLM_TENSOR_ATTN_KV_B, + LLM_TENSOR_ATTN_K_B, + LLM_TENSOR_ATTN_V_B, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_GATE_INP_SHEXP, + LLM_TENSOR_FFN_GATE_SHEXP, + LLM_TENSOR_FFN_DOWN_SHEXP, + LLM_TENSOR_FFN_UP_SHEXP, + LLM_TENSOR_FFN_EXP_PROBS_B, + LLM_TENSOR_INDEXER_K_NORM, + LLM_TENSOR_INDEXER_PROJ, + LLM_TENSOR_INDEXER_ATTN_K, + LLM_TENSOR_INDEXER_ATTN_Q_B, + LLM_TENSOR_NEXTN_EH_PROJ, + LLM_TENSOR_NEXTN_EMBED_TOKENS, + LLM_TENSOR_NEXTN_ENORM, + LLM_TENSOR_NEXTN_HNORM, + LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, + 
LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, + }; case LLM_ARCH_BITNET: return { LLM_TENSOR_TOKEN_EMBD, @@ -2643,6 +2691,10 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_VISEXP_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_VISEXP_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_VISEXP_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_INDEXER_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_INDEXER_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_INDEXER_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_INDEXER_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, // NextN/MTP tensors are currently ignored (reserved for future MTP support) // These tensors only exist in the last layer(s) and are treated as output tensors {LLM_TENSOR_NEXTN_EH_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index 4f7b51e70d..521944370b 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -78,6 +78,7 @@ enum llm_arch { LLM_ARCH_CHATGLM, LLM_ARCH_GLM4, LLM_ARCH_GLM4_MOE, + LLM_ARCH_GLM_DSA, LLM_ARCH_BITNET, LLM_ARCH_T5, LLM_ARCH_T5ENCODER, @@ -229,6 +230,9 @@ enum llm_kv { LLM_KV_ATTENTION_TEMPERATURE_SCALE, LLM_KV_ATTENTION_KEY_LENGTH_MLA, LLM_KV_ATTENTION_VALUE_LENGTH_MLA, + LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, + LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, + LLM_KV_ATTENTION_INDEXER_TOP_K, LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_DIMENSION_SECTIONS, @@ -517,6 +521,10 @@ enum llm_tensor { LLM_TENSOR_VISEXP_FFN_GATE, LLM_TENSOR_VISEXP_FFN_DOWN, LLM_TENSOR_VISEXP_FFN_UP, + LLM_TENSOR_INDEXER_K_NORM, + LLM_TENSOR_INDEXER_PROJ, + LLM_TENSOR_INDEXER_ATTN_K, + LLM_TENSOR_INDEXER_ATTN_Q_B, LLM_TENSOR_NEXTN_EH_PROJ, LLM_TENSOR_NEXTN_EMBED_TOKENS, LLM_TENSOR_NEXTN_ENORM, diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 706eda8441..c4b2a99da5 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -193,6 +193,11 @@ struct llama_hparams { std::array xielu_beta; std::array xielu_eps; + // DSA (deepseek sparse attention) + uint32_t indexer_n_head = 0; + uint32_t indexer_head_size = 0; + uint32_t indexer_top_k = 0; + // qwen3vl deepstack uint32_t n_deepstack_layers = 0; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 6b7da69e9d..c26584aa67 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -137,6 +137,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_300B_A47B: return "300B.A47B"; case LLM_TYPE_310B_A15B: return "310B.A15B"; case LLM_TYPE_355B_A32B: return "355B.A32B"; + case LLM_TYPE_744B_A40B: return "744B.A40B"; case LLM_TYPE_E2B: return "E2B"; case LLM_TYPE_E4B: return "E4B"; default: return "?B"; @@ -1822,6 +1823,50 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_GLM_DSA: + { + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false); + + // MoE parameters + ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert); + ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used); + ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared); + ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false); + ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale); + ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, 
false); + + // deepseek MLA parameters + ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q); + ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv); + ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl, false); + ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl, false); + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); + ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared); + + // DSA parameters + ml.get_key(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, hparams.indexer_n_head); + ml.get_key(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, hparams.indexer_head_size); + ml.get_key(LLM_KV_ATTENTION_INDEXER_TOP_K, hparams.indexer_top_k); + + // Expert gating function (GLM-4.5 uses sigmoid) + ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false); + if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) { + hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID; + } + + // NextN/MTP parameters + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); + + // TODO: when MTP is implemented, this should probably be updated if needed + hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; + + switch (hparams.n_layer) { + case 79: type = LLM_TYPE_744B_A40B; break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; case LLM_ARCH_BITNET: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -5492,6 +5537,108 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } break; + case LLM_ARCH_GLM_DSA: + { + const bool is_mla = hparams.is_mla(); + if (!is_mla) { + throw std::runtime_error("GLM_DSA architecture requires MLA"); + } + + // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA + const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla(); + const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla(); + + const int64_t n_embd_head_qk_rope = hparams.n_rot; + const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope; + + const int64_t q_lora_rank = hparams.n_lora_q; + const int64_t kv_lora_rank = hparams.n_lora_kv; + + const int64_t n_ff_exp = hparams.n_ff_exp; + const int64_t n_expert_shared = hparams.n_expert_shared; + + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + // try to load output.weight, if not found, use token_embd (tied embeddings) + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + if (!output) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } + + for (int i = 0; i < n_layer; ++i) { + int flags = 0; + if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + // skip all tensors in the NextN layers + // TODO @ngxson : TENSOR_NOT_REQUIRED was a hack, need to remove it later + flags |= TENSOR_SKIP | TENSOR_NOT_REQUIRED; + } + + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags); + layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, flags); + layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, flags); + + layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, 
q_lora_rank}, flags); + layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, flags); + + layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, flags); + + // note: only old legacy GGUF files will have the unsplit wkv_b tensor in + layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, flags); + layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, flags); + + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, flags); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags); + + // DSA indexer + layer.indexer_k_norm = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "weight", i), {hparams.indexer_head_size}, flags); + layer.indexer_k_norm_b = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "bias", i), {hparams.indexer_head_size}, flags); + layer.indexer_proj = create_tensor(tn(LLM_TENSOR_INDEXER_PROJ, "weight", i), {n_embd, hparams.indexer_n_head}, flags); + layer.indexer_attn_k = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_K, "weight", i), {n_embd, hparams.indexer_head_size}, flags); + layer.indexer_attn_q_b = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, flags); + if (i < (int) hparams.n_layer_dense_lead) { + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags); + } else { + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags); + layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED); + + if (n_expert == 0) { + throw std::runtime_error("n_expert must be > 0"); + } + if (n_expert_used == 0) { + throw std::runtime_error("n_expert_used must be > 0"); + } + + // MoE branch + layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, flags); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags); + + // Shared expert branch + layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags); + layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, flags); + layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags); + } + + // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers + if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags); + layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags); + layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags); + + // Optional tensors + layer.nextn.embed_tokens = 
create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED); + layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED); + layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED); + } + } + } break; case LLM_ARCH_NEMOTRON: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -7765,7 +7912,7 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); } - if (arch == LLM_ARCH_DEEPSEEK2) { + if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_GLM_DSA) { LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead); LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q); LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv); @@ -8337,6 +8484,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { llm = std::make_unique(*this, params); } break; case LLM_ARCH_DEEPSEEK2: + case LLM_ARCH_GLM_DSA: { llm = std::make_unique(*this, params); } break; @@ -8738,6 +8886,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_MISTRAL3: case LLM_ARCH_LLAMA_EMBED: case LLM_ARCH_MAINCODER: + case LLM_ARCH_GLM_DSA: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 diff --git a/src/llama-model.h b/src/llama-model.h index adc8ff6479..b350591429 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -130,6 +130,7 @@ enum llm_type { LLM_TYPE_300B_A47B, // Ernie MoE big LLM_TYPE_310B_A15B, // /MiMo-V2-Flash LLM_TYPE_355B_A32B, // GLM-4.5 + LLM_TYPE_744B_A40B, // GLM-5 LLM_TYPE_E2B, LLM_TYPE_E4B, }; @@ -429,6 +430,13 @@ struct llama_layer { struct ggml_tensor * ssm_g_b = nullptr; struct ggml_tensor * ssm_o_norm = nullptr; + // DSA (deepseek sparse attention) + struct ggml_tensor * indexer_k_norm = nullptr; + struct ggml_tensor * indexer_k_norm_b = nullptr; + struct ggml_tensor * indexer_proj = nullptr; + struct ggml_tensor * indexer_attn_k = nullptr; + struct ggml_tensor * indexer_attn_q_b = nullptr; // note: for lora a/b, not bias + struct llama_layer_posnet posnet; struct llama_layer_convnext convnext; diff --git a/src/models/deepseek2.cpp b/src/models/deepseek2.cpp index 987f449934..b2c1f16060 100644 --- a/src/models/deepseek2.cpp +++ b/src/models/deepseek2.cpp @@ -45,7 +45,8 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr ggml_tensor * inp_out_ids = build_inp_out_ids(); - for (int il = 0; il < n_layer; ++il) { + int effective_n_layers = hparams.n_layer - hparams.nextn_predict_layers; + for (int il = 0; il < effective_n_layers; ++il) { ggml_tensor * inpSA = inpL; // norm @@ -188,7 +189,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); } } - if (il == n_layer - 1 && inp_out_ids) { + if (il == effective_n_layers - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } From b48e80f677ce3518d0ac5859cc10ceeefd033898 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Fri, 13 Feb 2026 15:10:46 +0100 Subject: [PATCH 7/7] common : update download code (#19573) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 
8bit * common : remove legacy .json to .etag migration code Signed-off-by: Adrien Gallouët * common : simplify common_download_file_single_online This commit also force a redownload if the file exists but has no .etag file. Signed-off-by: Adrien Gallouët --------- Signed-off-by: Adrien Gallouët --- common/download.cpp | 181 ++++++++++++++++++-------------------------- 1 file changed, 74 insertions(+), 107 deletions(-) diff --git a/common/download.cpp b/common/download.cpp index 8710438aa4..17f930f5ac 100644 --- a/common/download.cpp +++ b/common/download.cpp @@ -114,44 +114,18 @@ static void write_etag(const std::string & path, const std::string & etag) { } static std::string read_etag(const std::string & path) { - std::string none; const std::string etag_path = path + ".etag"; - - if (std::filesystem::exists(etag_path)) { - std::ifstream etag_in(etag_path); - if (!etag_in) { - LOG_ERR("%s: could not open .etag file for reading: %s\n", __func__, etag_path.c_str()); - return none; - } - std::string etag; - std::getline(etag_in, etag); - return etag; + if (!std::filesystem::exists(etag_path)) { + return {}; } - - // no etag file, but maybe there is an old .json - // remove this code later - const std::string metadata_path = path + ".json"; - - if (std::filesystem::exists(metadata_path)) { - std::ifstream metadata_in(metadata_path); - try { - nlohmann::json metadata_json; - metadata_in >> metadata_json; - LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), - metadata_json.dump().c_str()); - if (metadata_json.contains("etag") && metadata_json.at("etag").is_string()) { - std::string etag = metadata_json.at("etag"); - write_etag(path, etag); - if (!std::filesystem::remove(metadata_path)) { - LOG_WRN("%s: failed to delete old .json metadata file: %s\n", __func__, metadata_path.c_str()); - } - return etag; - } - } catch (const nlohmann::json::exception & e) { - LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what()); - } + std::ifstream etag_in(etag_path); + if (!etag_in) { + LOG_ERR("%s: could not open .etag file for reading: %s\n", __func__, etag_path.c_str()); + return {}; } - return none; + std::string etag; + std::getline(etag_in, etag); + return etag; } static bool is_http_status_ok(int status) { @@ -347,62 +321,64 @@ static int common_download_file_single_online(const std::string & url, LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str()); } - for (int i = 0; i < max_attempts; ++i) { - auto head = cli.Head(parts.path); - bool head_ok = head && head->status >= 200 && head->status < 300; - if (!head_ok) { - LOG_WRN("%s: HEAD invalid http status code received: %d\n", __func__, head ? 
head->status : -1); - if (file_exists) { - LOG_INF("%s: Using cached file (HEAD failed): %s\n", __func__, path.c_str()); - return 304; // 304 Not Modified - fake cached response - } - return head->status; // cannot use cached file, return raw status code - // TODO: maybe retry only on certain codes - } - - std::string etag; - if (head_ok && head->has_header("ETag")) { - etag = head->get_header_value("ETag"); - } - - size_t total_size = 0; - if (head_ok && head->has_header("Content-Length")) { - try { - total_size = std::stoull(head->get_header_value("Content-Length")); - } catch (const std::exception& e) { - LOG_WRN("%s: Invalid Content-Length in HEAD response: %s\n", __func__, e.what()); - } - } - - bool supports_ranges = false; - if (head_ok && head->has_header("Accept-Ranges")) { - supports_ranges = head->get_header_value("Accept-Ranges") != "none"; - } - - bool should_download_from_scratch = false; - if (!last_etag.empty() && !etag.empty() && last_etag != etag) { - LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, - last_etag.c_str(), etag.c_str()); - should_download_from_scratch = true; - } - + auto head = cli.Head(parts.path); + if (!head || head->status < 200 || head->status >= 300) { + LOG_WRN("%s: HEAD failed, status: %d\n", __func__, head ? head->status : -1); if (file_exists) { - if (!should_download_from_scratch) { - LOG_INF("%s: using cached file: %s\n", __func__, path.c_str()); - return 304; // 304 Not Modified - fake cached response - } - LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str()); - if (remove(path.c_str()) != 0) { - LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str()); - return -1; - } + LOG_INF("%s: using cached file (HEAD failed): %s\n", __func__, path.c_str()); + return 304; // 304 Not Modified - fake cached response + } + return head ? 
head->status : -1; + } + + std::string etag; + if (head->has_header("ETag")) { + etag = head->get_header_value("ETag"); + } + + size_t total_size = 0; + if (head->has_header("Content-Length")) { + try { + total_size = std::stoull(head->get_header_value("Content-Length")); + } catch (const std::exception& e) { + LOG_WRN("%s: invalid Content-Length in HEAD response: %s\n", __func__, e.what()); + } + } + + bool supports_ranges = false; + if (head->has_header("Accept-Ranges")) { + supports_ranges = head->get_header_value("Accept-Ranges") != "none"; + } + + if (file_exists) { + if (etag.empty()) { + LOG_INF("%s: using cached file (no server etag): %s\n", __func__, path.c_str()); + return 304; // 304 Not Modified - fake cached response + } + if (!last_etag.empty() && last_etag == etag) { + LOG_INF("%s: using cached file (same etag): %s\n", __func__, path.c_str()); + return 304; // 304 Not Modified - fake cached response + } + if (remove(path.c_str()) != 0) { + LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str()); + return -1; + } + } + + const std::string path_temporary = path + ".downloadInProgress"; + int delay = retry_delay_seconds; + + for (int i = 0; i < max_attempts; ++i) { + if (i) { + LOG_WRN("%s: retrying after %d seconds...\n", __func__, delay); + std::this_thread::sleep_for(std::chrono::seconds(delay)); + delay *= retry_delay_seconds; } - const std::string path_temporary = path + ".downloadInProgress"; size_t existing_size = 0; if (std::filesystem::exists(path_temporary)) { - if (supports_ranges && !should_download_from_scratch) { + if (supports_ranges) { existing_size = std::filesystem::file_size(path_temporary); } else if (remove(path_temporary.c_str()) != 0) { LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str()); @@ -410,32 +386,23 @@ static int common_download_file_single_online(const std::string & url, } } - // start the download - LOG_INF("%s: trying to download model from %s to %s (etag:%s)...\n", - __func__, common_http_show_masked_url(parts).c_str(), path_temporary.c_str(), etag.c_str()); - const bool was_pull_successful = common_pull_file(cli, parts.path, path_temporary, supports_ranges, existing_size, total_size); - if (!was_pull_successful) { - if (i + 1 < max_attempts) { - const int exponential_backoff_delay = std::pow(retry_delay_seconds, i) * 1000; - LOG_WRN("%s: retrying after %d milliseconds...\n", __func__, exponential_backoff_delay); - std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay)); - } else { - LOG_ERR("%s: download failed after %d attempts\n", __func__, max_attempts); + LOG_INF("%s: downloading from %s to %s (etag:%s)...\n", + __func__, common_http_show_masked_url(parts).c_str(), + path_temporary.c_str(), etag.c_str()); + + if (common_pull_file(cli, parts.path, path_temporary, supports_ranges, existing_size, total_size)) { + if (std::rename(path_temporary.c_str(), path.c_str()) != 0) { + LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str()); + return -1; } - continue; + if (!etag.empty()) { + write_etag(path, etag); + } + return head->status; } - - if (std::rename(path_temporary.c_str(), path.c_str()) != 0) { - LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str()); - return -1; - } - if (!etag.empty()) { - write_etag(path, etag); - } - - return head->status; // TODO: use actual GET status? } + LOG_ERR("%s: download failed after %d attempts\n", __func__, max_attempts); return -1; // max attempts reached }
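
For reference, the cache decision implemented in common_download_file_single_online above reduces to the following standalone sketch. This is illustrative only: the names (cache_action, decide, cached_etag, server_etag) do not appear in the patch, and the real function also handles HEAD failures, ranged resumes, and retries.

    // Minimal standalone sketch of the ETag cache check, not the actual llama.cpp code.
    #include <string>

    enum class cache_action { use_cached, redownload };

    static cache_action decide(bool file_exists,
                               const std::string & cached_etag,   // contents of <path>.etag, empty if the file is missing
                               const std::string & server_etag) { // ETag header from the HEAD request, empty if absent
        if (!file_exists) {
            return cache_action::redownload;                      // nothing on disk yet
        }
        if (server_etag.empty()) {
            return cache_action::use_cached;                      // server offers no ETag -> trust the cached file
        }
        if (!cached_etag.empty() && cached_etag == server_etag) {
            return cache_action::use_cached;                      // ETag unchanged -> 304-style cache hit
        }
        return cache_action::redownload;                          // missing or stale .etag -> delete and re-fetch
    }

In particular, a cached file without a matching .etag is treated as stale, which corresponds to the forced re-download described in the commit message.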