From e18d20d6c68464c749c6fed059dd3a5449810577 Mon Sep 17 00:00:00 2001
From: Salvatore Rossitto
Date: Thu, 12 Mar 2026 13:28:07 +0100
Subject: [PATCH] fixed missing changes from dev version

---
 ggml/src/ggml-cuda/ggml-cuda.cu |  4 +++-
 ggml/src/ggml-cuda/out-prod.cu  | 37 +++++++++++++++++++++++++--------
 ggml/src/ggml.c                 | 22 +++++++++++++-------
 src/llama-adapter.cpp           | 24 ++++++++++++++-------
 4 files changed, 63 insertions(+), 24 deletions(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 3e1cca6c98..9e5492f5fc 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -4774,7 +4774,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
            }
        } break;
        case GGML_OP_OUT_PROD:
-            return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
+            return op->type == GGML_TYPE_F32
+                && (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type))
+                && op->src[1]->type == GGML_TYPE_F32;
        case GGML_OP_OUT_PROD_ID:
            return op->src[0] != nullptr && op->src[1] != nullptr && op->src[2] != nullptr
                && op->type == GGML_TYPE_F32
diff --git a/ggml/src/ggml-cuda/out-prod.cu b/ggml/src/ggml-cuda/out-prod.cu
index 9afc323bd9..392de34d1c 100644
--- a/ggml/src/ggml-cuda/out-prod.cu
+++ b/ggml/src/ggml-cuda/out-prod.cu
@@ -1,4 +1,5 @@
 #include "out-prod.cuh"
+#include "convert.cuh"

 #include 
 #include 
@@ -10,7 +11,7 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

     GGML_TENSOR_BINARY_OP_LOCALS

-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type));
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type == GGML_TYPE_F32);

@@ -24,19 +25,37 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     GGML_ASSERT(ne2 == src1->ne[2]);
     GGML_ASSERT(ne3 == src1->ne[3]);

-    const float * src0_d = (const float *) src0->data;
-    const float * src1_d = (const float *) src1->data;
-    float * dst_d = (float *) dst->data;
-
     cudaStream_t stream = ctx.stream();
     cublasHandle_t handle = ctx.cublas_handle();

+    // If src0 is quantized, dequantize to a temp F32 buffer on GPU
+    ggml_cuda_pool_alloc<float> src0_f32_alloc;
+    const float * src0_d;
+    int64_t lda;
+
+    if (src0->type != GGML_TYPE_F32) {
+        const int64_t n_elements = ggml_nelements(src0);
+        src0_f32_alloc.alloc(ctx.pool(), n_elements);
+
+        to_fp32_cuda_t to_fp32 = ggml_get_to_fp32_cuda(src0->type);
+        GGML_ASSERT(to_fp32 != nullptr);
+        to_fp32(src0->data, src0_f32_alloc.ptr, n_elements, stream);
+
+        src0_d = src0_f32_alloc.ptr;
+        lda = ne00; // dequantized data is contiguous: stride = ne00
+    } else {
+        src0_d = (const float *) src0->data;
+        lda = nb01 / sizeof(float);
+    }
+
+    const float * src1_d = (const float *) src1->data;
+    float * dst_d = (float *) dst->data;
+
     const float alpha = 1.0f;
     const float beta = 0.0f;

     CUBLAS_CHECK(cublasSetStream(handle, stream));

-    const int64_t lda = nb01 / sizeof(float);
     const int64_t ldc = nb1 / sizeof(float);

     const bool src1_T = ggml_is_transposed(src1);
@@ -44,9 +63,9 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float);
     GGML_ASSERT((src1_T ? nb11 : nb10) == sizeof(float));

-    // data strides in dimensions 2/3
-    const size_t s02 = nb02 / sizeof(float);
-    const size_t s03 = nb03 / sizeof(float);
+    // data strides in dimensions 2/3 (for dequantized src0, use element-based strides)
+    const size_t s02 = (src0->type != GGML_TYPE_F32) ? (ne00 * ne01) : (nb02 / sizeof(float));
+    const size_t s03 = (src0->type != GGML_TYPE_F32) ? (ne00 * ne01 * ne02) : (nb03 / sizeof(float));
     const size_t s12 = nb12 / sizeof(float);
     const size_t s13 = nb13 / sizeof(float);
     const size_t s2 = nb2 / sizeof(float);
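
Note on the out-prod.cu change above: the quantized path simply materializes src0 as contiguous F32 and then reuses the existing cuBLAS GEMM. A minimal standalone sketch of that pattern follows; the dequantize_stub kernel and the out_prod_like helper are illustrative stand-ins (ggml's real to_fp32_cuda converters decode block-quantized data, and the patch allocates from ctx.pool() rather than cudaMallocAsync), while the cuBLAS calls are the real API:

    #include <cublas_v2.h>
    #include <cuda_runtime.h>
    #include <cstdint>

    // stand-in for a to_fp32_cuda converter: real ones decode quantized blocks,
    // this one just copies floats so the sketch is self-contained
    __global__ void dequantize_stub(const void * src, float * dst, int64_t n) {
        const int64_t i = blockIdx.x * (int64_t) blockDim.x + threadIdx.x;
        if (i < n) {
            dst[i] = ((const float *) src)[i];
        }
    }

    // C = A * B^T with A (m x k) possibly quantized, B (n x k) and C (m x n) in F32,
    // all column-major, mirroring the GEMM shape used by ggml_cuda_out_prod
    static void out_prod_like(cublasHandle_t handle, cudaStream_t stream,
                              const void * A, bool A_quantized,
                              const float * B, float * C,
                              int m, int k, int n) {
        const float * A_f32 = (const float *) A;
        float * tmp = nullptr;
        if (A_quantized) {
            const int64_t n_elems = (int64_t) m * k;
            cudaMallocAsync((void **) &tmp, n_elems * sizeof(float), stream);
            dequantize_stub<<<(unsigned) ((n_elems + 255) / 256), 256, 0, stream>>>(A, tmp, n_elems);
            A_f32 = tmp; // dequantized buffer is contiguous, so lda == m
        }
        const float alpha = 1.0f;
        const float beta  = 0.0f;
        cublasSetStream(handle, stream);
        cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, m, n, k,
                    &alpha, A_f32, m, B, n, &beta, C, m);
        if (tmp) {
            cudaFreeAsync(tmp, stream);
        }
    }

The element-based s02/s03 strides in the hunk above exist for the same reason: once src0 has been rewritten into the temporary buffer, the original byte strides nb02/nb03 no longer describe its layout.
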
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index f146f13b5f..255e7d5a88 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -3871,12 +3871,17 @@ struct ggml_tensor * ggml_get_rows_back(
         struct ggml_tensor * a,
         struct ggml_tensor * b,
         struct ggml_tensor * c) {
-    GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_is_matrix(c) && (a->ne[0] == c->ne[0]));
+    GGML_ASSERT(b->type == GGML_TYPE_I32);
+    GGML_ASSERT(a->ne[0] == c->ne[0]);
+
+    // Support both 2D and 3D: result shape matches c (the source tensor shape)

     // TODO: implement non F32 return
-    //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
-    struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]);
+    struct ggml_tensor * result;
+    if (c->ne[2] > 1) {
+        result = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1], c->ne[2]);
+    } else {
+        result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]);
+    }

     result->op     = GGML_OP_GET_ROWS_BACK;
     result->src[0] = a;
@@ -7077,9 +7082,12 @@ void ggml_build_backward_expand(
             continue;
         }

-        // inplace operations are currently not supported
-        GGML_ASSERT(!node->view_src || node->op == GGML_OP_CPY || node->op == GGML_OP_VIEW ||
-            node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE);
+        // inplace operations are currently not supported; warn and skip instead of crashing
+        if (node->view_src && node->op != GGML_OP_CPY && node->op != GGML_OP_VIEW &&
+            node->op != GGML_OP_RESHAPE && node->op != GGML_OP_PERMUTE && node->op != GGML_OP_TRANSPOSE) {
+            GGML_LOG_WARN("%s: skipping unsupported inplace op '%s' in backward graph\n", __func__, ggml_op_name(node->op));
+            continue;
+        }

         const size_t ihash = ggml_hash_find(&cgraph->visited_hash_set, node);
         GGML_ASSERT(ihash != GGML_HASHSET_FULL);
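
Note on the ggml_get_rows_back change above: the result now follows the shape of c, so a batched (3D) ggml_get_rows gather gets a gradient of matching rank instead of tripping the old ggml_is_matrix/ggml_is_vector asserts. A shape-contract sketch against the public ggml API, with made-up sizes (row width 8, 16 source rows, 4 gathered rows, batch of 2):

    #include "ggml.h"

    // grad: gradient w.r.t. the gathered rows; ids: the gathered row indices;
    // src: the tensor that was gathered from (only its shape matters here)
    static void get_rows_back_3d_demo(struct ggml_context * ctx) {
        struct ggml_tensor * grad = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 8,  4, 2);
        struct ggml_tensor * ids  = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4 * 2);
        struct ggml_tensor * src  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 8, 16, 2);

        // with the patch, the result is 3D and matches src: [8, 16, 2]
        struct ggml_tensor * src_grad = ggml_get_rows_back(ctx, grad, ids, src);
        GGML_ASSERT(src_grad->ne[0] == 8 && src_grad->ne[1] == 16 && src_grad->ne[2] == 2);
    }

The ggml_build_backward_expand change in the same file is a softer policy for the same training flows: view nodes that previously hit a hard assert are now logged via GGML_LOG_WARN and skipped, so a graph containing an unsupported inplace op degrades to a missing gradient for that node rather than aborting the process.
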
diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp
index d6a5800e63..1fee93ba40 100644
--- a/src/llama-adapter.cpp
+++ b/src/llama-adapter.cpp
@@ -334,16 +334,26 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_

         auto * buft = ggml_backend_buffer_get_type(model_tensor->buffer);

-        // do not load loras to extra buffer types (i.e. bufts for repacking) -> use the CPU in that case
+        // do not load loras to extra buffer types (i.e. bufts for repacking)
+        // try device-native buft first (keeps LoRA on GPU), fall back to CPU only as last resort
         for (auto & ex : buft_extra) {
             if (ex == buft) {
-                LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));
-
-                auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-                if (!cpu_dev) {
-                    throw std::runtime_error(format("%s: no CPU backend found", __func__));
+                // try to get the device's native (non-repack) buffer type
+                auto * dev = ggml_backend_buft_get_device(buft);
+                auto * native_buft = dev ? ggml_backend_dev_buffer_type(dev) : nullptr;
+                if (native_buft && native_buft != buft) {
+                    LLAMA_LOG_WARN("%s: lora for '%s' cannot use repack buft '%s', using device-native '%s'\n",
+                        __func__, model_tensor->name, ggml_backend_buft_name(buft), ggml_backend_buft_name(native_buft));
+                    buft = native_buft;
+                } else {
+                    LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n",
+                        __func__, model_tensor->name, ggml_backend_buft_name(buft));
+                    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+                    if (!cpu_dev) {
+                        throw std::runtime_error(format("%s: no CPU backend found", __func__));
+                    }
+                    buft = ggml_backend_dev_buffer_type(cpu_dev);
                 }
-                buft = ggml_backend_dev_buffer_type(cpu_dev);

                 break;
             }
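
Note on the llama-adapter.cpp change above: buffer-type selection becomes a two-step ladder instead of an unconditional CPU fallback. Distilled into a hypothetical helper (lora_pick_buft is not part of the patch; the ggml-backend calls are the real API):

    #include "ggml-backend.h"

    // pick the buffer type a LoRA tensor should be allocated with, given the
    // model tensor's buft and whether that buft is an extra/repacking type
    static ggml_backend_buffer_type_t lora_pick_buft(ggml_backend_buffer_type_t buft, bool is_repack) {
        if (!is_repack) {
            return buft; // the model tensor's buft works as-is
        }
        // step 1: prefer the owning device's native buft, keeping the adapter on-device
        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
        ggml_backend_buffer_type_t native = dev ? ggml_backend_dev_buffer_type(dev) : nullptr;
        if (native && native != buft) {
            return native;
        }
        // step 2: last resort, host memory via the CPU backend
        ggml_backend_dev_t cpu = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        return cpu ? ggml_backend_dev_buffer_type(cpu) : nullptr;
    }

Repacked buffer layouts cannot host LoRA tensors, but the device's plain buffer type typically can, so this keeps adapters on the GPU whenever the device exposes one instead of always punting to host memory.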