Fixed missing changes from the dev version

This commit is contained in:
Salvatore Rossitto 2026-03-12 13:28:07 +01:00
parent 3e4166d3fd
commit e18d20d6c6
4 changed files with 63 additions and 24 deletions

View File

@@ -4774,7 +4774,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
}
} break;
case GGML_OP_OUT_PROD:
return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
return op->type == GGML_TYPE_F32
&& (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type))
&& op->src[1]->type == GGML_TYPE_F32;
case GGML_OP_OUT_PROD_ID:
return op->src[0] != nullptr && op->src[1] != nullptr && op->src[2] != nullptr
&& op->type == GGML_TYPE_F32

View File

@@ -1,4 +1,5 @@
#include "out-prod.cuh"
#include "convert.cuh"
#include <cstdint>
#include <cstring>
@@ -10,7 +11,7 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type));
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
@@ -24,19 +25,37 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
GGML_ASSERT(ne2 == src1->ne[2]);
GGML_ASSERT(ne3 == src1->ne[3]);
const float * src0_d = (const float *) src0->data;
const float * src1_d = (const float *) src1->data;
float * dst_d = (float *) dst->data;
cudaStream_t stream = ctx.stream();
cublasHandle_t handle = ctx.cublas_handle();
// If src0 is quantized, dequantize to a temp F32 buffer on GPU
ggml_cuda_pool_alloc<float> src0_f32_alloc;
const float * src0_d;
int64_t lda;
if (src0->type != GGML_TYPE_F32) {
const int64_t n_elements = ggml_nelements(src0);
src0_f32_alloc.alloc(ctx.pool(), n_elements);
to_fp32_cuda_t to_fp32 = ggml_get_to_fp32_cuda(src0->type);
GGML_ASSERT(to_fp32 != nullptr);
to_fp32(src0->data, src0_f32_alloc.ptr, n_elements, stream);
src0_d = src0_f32_alloc.ptr;
lda = ne00; // dequantized data is contiguous: stride = ne00
} else {
src0_d = (const float *) src0->data;
lda = nb01 / sizeof(float);
}
const float * src1_d = (const float *) src1->data;
float * dst_d = (float *) dst->data;
const float alpha = 1.0f;
const float beta = 0.0f;
CUBLAS_CHECK(cublasSetStream(handle, stream));
const int64_t lda = nb01 / sizeof(float);
const int64_t ldc = nb1 / sizeof(float);
const bool src1_T = ggml_is_transposed(src1);
@@ -44,9 +63,9 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float);
GGML_ASSERT( (src1_T ? nb11 : nb10) == sizeof(float));
// data strides in dimensions 2/3
const size_t s02 = nb02 / sizeof(float);
const size_t s03 = nb03 / sizeof(float);
// data strides in dimensions 2/3 (for dequantized src0, use element-based strides)
const size_t s02 = (src0->type != GGML_TYPE_F32) ? (ne00 * ne01) : (nb02 / sizeof(float));
const size_t s03 = (src0->type != GGML_TYPE_F32) ? (ne00 * ne01 * ne02) : (nb03 / sizeof(float));
const size_t s12 = nb12 / sizeof(float);
const size_t s13 = nb13 / sizeof(float);
const size_t s2 = nb2 / sizeof(float);

View File

@@ -3871,12 +3871,17 @@ struct ggml_tensor * ggml_get_rows_back(
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c) {
GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_is_matrix(c) && (a->ne[0] == c->ne[0]));
GGML_ASSERT(b->type == GGML_TYPE_I32);
GGML_ASSERT(a->ne[0] == c->ne[0]);
// Support both 2D and 3D: result shape matches c (the source tensor shape)
// TODO: implement non F32 return
//struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]);
struct ggml_tensor * result;
if (c->ne[2] > 1) {
result = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1], c->ne[2]);
} else {
result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]);
}
result->op = GGML_OP_GET_ROWS_BACK;
result->src[0] = a;
@@ -7077,9 +7082,12 @@ void ggml_build_backward_expand(
continue;
}
// inplace operations are currently not supported
GGML_ASSERT(!node->view_src || node->op == GGML_OP_CPY || node->op == GGML_OP_VIEW ||
node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE);
// inplace operations are currently not supported — warn and skip instead of crashing
if (node->view_src && node->op != GGML_OP_CPY && node->op != GGML_OP_VIEW &&
node->op != GGML_OP_RESHAPE && node->op != GGML_OP_PERMUTE && node->op != GGML_OP_TRANSPOSE) {
GGML_LOG_WARN("%s: skipping unsupported inplace op '%s' in backward graph\n", __func__, ggml_op_name(node->op));
continue;
}
const size_t ihash = ggml_hash_find(&cgraph->visited_hash_set, node);
GGML_ASSERT(ihash != GGML_HASHSET_FULL);

View File

@@ -334,16 +334,26 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
auto * buft = ggml_backend_buffer_get_type(model_tensor->buffer);
// do not load loras to extra buffer types (i.e. bufts for repacking) -> use the CPU in that case
// do not load loras to extra buffer types (i.e. bufts for repacking)
// try device-native buft first (keeps LoRA on GPU), fall back to CPU only as last resort
for (auto & ex : buft_extra) {
if (ex == buft) {
LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (!cpu_dev) {
throw std::runtime_error(format("%s: no CPU backend found", __func__));
// try to get the device's native (non-repack) buffer type
auto * dev = ggml_backend_buft_get_device(buft);
auto * native_buft = dev ? ggml_backend_dev_buffer_type(dev) : nullptr;
if (native_buft && native_buft != buft) {
LLAMA_LOG_WARN("%s: lora for '%s' cannot use repack buft '%s', using device-native '%s'\n",
__func__, model_tensor->name, ggml_backend_buft_name(buft), ggml_backend_buft_name(native_buft));
buft = native_buft;
} else {
LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n",
__func__, model_tensor->name, ggml_backend_buft_name(buft));
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (!cpu_dev) {
throw std::runtime_error(format("%s: no CPU backend found", __func__));
}
buft = ggml_backend_dev_buffer_type(cpu_dev);
}
buft = ggml_backend_dev_buffer_type(cpu_dev);
break;
}