Fixed missing changes from the dev version

This commit is contained in:
Salvatore Rossitto 2026-03-12 13:28:07 +01:00
parent 3e4166d3fd
commit e18d20d6c6
4 changed files with 63 additions and 24 deletions

View File

@@ -4774,7 +4774,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
}
} break;
case GGML_OP_OUT_PROD:
return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
return op->type == GGML_TYPE_F32
&& (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type))
&& op->src[1]->type == GGML_TYPE_F32;
case GGML_OP_OUT_PROD_ID:
return op->src[0] != nullptr && op->src[1] != nullptr && op->src[2] != nullptr
&& op->type == GGML_TYPE_F32

View File

@@ -1,4 +1,5 @@
#include "out-prod.cuh"
#include "convert.cuh"
#include <cstdint>
#include <cstring>
@@ -10,7 +11,7 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type));
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
@@ -24,19 +25,37 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
GGML_ASSERT(ne2 == src1->ne[2]);
GGML_ASSERT(ne3 == src1->ne[3]);
const float * src0_d = (const float *) src0->data;
const float * src1_d = (const float *) src1->data;
float * dst_d = (float *) dst->data;
cudaStream_t stream = ctx.stream();
cublasHandle_t handle = ctx.cublas_handle();
// If src0 is quantized, dequantize to a temp F32 buffer on GPU
ggml_cuda_pool_alloc<float> src0_f32_alloc;
const float * src0_d;
int64_t lda;
if (src0->type != GGML_TYPE_F32) {
const int64_t n_elements = ggml_nelements(src0);
src0_f32_alloc.alloc(ctx.pool(), n_elements);
to_fp32_cuda_t to_fp32 = ggml_get_to_fp32_cuda(src0->type);
GGML_ASSERT(to_fp32 != nullptr);
to_fp32(src0->data, src0_f32_alloc.ptr, n_elements, stream);
src0_d = src0_f32_alloc.ptr;
lda = ne00; // dequantized data is contiguous: stride = ne00
} else {
src0_d = (const float *) src0->data;
lda = nb01 / sizeof(float);
}
const float * src1_d = (const float *) src1->data;
float * dst_d = (float *) dst->data;
const float alpha = 1.0f;
const float beta = 0.0f;
CUBLAS_CHECK(cublasSetStream(handle, stream));
const int64_t lda = nb01 / sizeof(float);
const int64_t ldc = nb1 / sizeof(float);
const bool src1_T = ggml_is_transposed(src1);
@@ -44,9 +63,9 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float);
GGML_ASSERT( (src1_T ? nb11 : nb10) == sizeof(float));
// data strides in dimensions 2/3
const size_t s02 = nb02 / sizeof(float);
const size_t s03 = nb03 / sizeof(float);
// data strides in dimensions 2/3 (for dequantized src0, use element-based strides)
const size_t s02 = (src0->type != GGML_TYPE_F32) ? (ne00 * ne01) : (nb02 / sizeof(float));
const size_t s03 = (src0->type != GGML_TYPE_F32) ? (ne00 * ne01 * ne02) : (nb03 / sizeof(float));
const size_t s12 = nb12 / sizeof(float);
const size_t s13 = nb13 / sizeof(float);
const size_t s2 = nb2 / sizeof(float);

View File

@@ -3871,12 +3871,17 @@ struct ggml_tensor * ggml_get_rows_back(
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c) {
GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_is_matrix(c) && (a->ne[0] == c->ne[0]));
GGML_ASSERT(b->type == GGML_TYPE_I32);
GGML_ASSERT(a->ne[0] == c->ne[0]);
// Support both 2D and 3D: result shape matches c (the source tensor shape)
// TODO: implement non F32 return
//struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]);
struct ggml_tensor * result;
if (c->ne[2] > 1) {
result = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1], c->ne[2]);
} else {
result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]);
}
result->op = GGML_OP_GET_ROWS_BACK;
result->src[0] = a;
@@ -7077,9 +7082,12 @@ void ggml_build_backward_expand(
continue;
}
// inplace operations are currently not supported
GGML_ASSERT(!node->view_src || node->op == GGML_OP_CPY || node->op == GGML_OP_VIEW ||
node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE);
// inplace operations are currently not supported — warn and skip instead of crashing
if (node->view_src && node->op != GGML_OP_CPY && node->op != GGML_OP_VIEW &&
node->op != GGML_OP_RESHAPE && node->op != GGML_OP_PERMUTE && node->op != GGML_OP_TRANSPOSE) {
GGML_LOG_WARN("%s: skipping unsupported inplace op '%s' in backward graph\n", __func__, ggml_op_name(node->op));
continue;
}
const size_t ihash = ggml_hash_find(&cgraph->visited_hash_set, node);
GGML_ASSERT(ihash != GGML_HASHSET_FULL);

View File

@@ -334,16 +334,26 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
auto * buft = ggml_backend_buffer_get_type(model_tensor->buffer);
// do not load loras to extra buffer types (i.e. bufts for repacking) -> use the CPU in that case
// do not load loras to extra buffer types (i.e. bufts for repacking)
// try device-native buft first (keeps LoRA on GPU), fall back to CPU only as last resort
for (auto & ex : buft_extra) {
if (ex == buft) {
LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (!cpu_dev) {
throw std::runtime_error(format("%s: no CPU backend found", __func__));
// try to get the device's native (non-repack) buffer type
auto * dev = ggml_backend_buft_get_device(buft);
auto * native_buft = dev ? ggml_backend_dev_buffer_type(dev) : nullptr;
if (native_buft && native_buft != buft) {
LLAMA_LOG_WARN("%s: lora for '%s' cannot use repack buft '%s', using device-native '%s'\n",
__func__, model_tensor->name, ggml_backend_buft_name(buft), ggml_backend_buft_name(native_buft));
buft = native_buft;
} else {
LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n",
__func__, model_tensor->name, ggml_backend_buft_name(buft));
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (!cpu_dev) {
throw std::runtime_error(format("%s: no CPU backend found", __func__));
}
buft = ggml_backend_dev_buffer_type(cpu_dev);
}
buft = ggml_backend_dev_buffer_type(cpu_dev);
break;
}