fixed missing changes from dev version
This commit is contained in:
parent
3e4166d3fd
commit
e18d20d6c6
|
|
@ -4774,7 +4774,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
|||
}
|
||||
} break;
|
||||
case GGML_OP_OUT_PROD:
|
||||
return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
|
||||
return op->type == GGML_TYPE_F32
|
||||
&& (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type))
|
||||
&& op->src[1]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_OUT_PROD_ID:
|
||||
return op->src[0] != nullptr && op->src[1] != nullptr && op->src[2] != nullptr
|
||||
&& op->type == GGML_TYPE_F32
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
#include "out-prod.cuh"
|
||||
#include "convert.cuh"
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
|
|
@ -10,7 +11,7 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type));
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
|
|
@ -24,19 +25,37 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|||
GGML_ASSERT(ne2 == src1->ne[2]);
|
||||
GGML_ASSERT(ne3 == src1->ne[3]);
|
||||
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
const float * src1_d = (const float *) src1->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
cublasHandle_t handle = ctx.cublas_handle();
|
||||
|
||||
// If src0 is quantized, dequantize to a temp F32 buffer on GPU
|
||||
ggml_cuda_pool_alloc<float> src0_f32_alloc;
|
||||
const float * src0_d;
|
||||
int64_t lda;
|
||||
|
||||
if (src0->type != GGML_TYPE_F32) {
|
||||
const int64_t n_elements = ggml_nelements(src0);
|
||||
src0_f32_alloc.alloc(ctx.pool(), n_elements);
|
||||
|
||||
to_fp32_cuda_t to_fp32 = ggml_get_to_fp32_cuda(src0->type);
|
||||
GGML_ASSERT(to_fp32 != nullptr);
|
||||
to_fp32(src0->data, src0_f32_alloc.ptr, n_elements, stream);
|
||||
|
||||
src0_d = src0_f32_alloc.ptr;
|
||||
lda = ne00; // dequantized data is contiguous: stride = ne00
|
||||
} else {
|
||||
src0_d = (const float *) src0->data;
|
||||
lda = nb01 / sizeof(float);
|
||||
}
|
||||
|
||||
const float * src1_d = (const float *) src1->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
const float alpha = 1.0f;
|
||||
const float beta = 0.0f;
|
||||
|
||||
CUBLAS_CHECK(cublasSetStream(handle, stream));
|
||||
|
||||
const int64_t lda = nb01 / sizeof(float);
|
||||
const int64_t ldc = nb1 / sizeof(float);
|
||||
|
||||
const bool src1_T = ggml_is_transposed(src1);
|
||||
|
|
@ -44,9 +63,9 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|||
const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float);
|
||||
GGML_ASSERT( (src1_T ? nb11 : nb10) == sizeof(float));
|
||||
|
||||
// data strides in dimensions 2/3
|
||||
const size_t s02 = nb02 / sizeof(float);
|
||||
const size_t s03 = nb03 / sizeof(float);
|
||||
// data strides in dimensions 2/3 (for dequantized src0, use element-based strides)
|
||||
const size_t s02 = (src0->type != GGML_TYPE_F32) ? (ne00 * ne01) : (nb02 / sizeof(float));
|
||||
const size_t s03 = (src0->type != GGML_TYPE_F32) ? (ne00 * ne01 * ne02) : (nb03 / sizeof(float));
|
||||
const size_t s12 = nb12 / sizeof(float);
|
||||
const size_t s13 = nb13 / sizeof(float);
|
||||
const size_t s2 = nb2 / sizeof(float);
|
||||
|
|
|
|||
|
|
@ -3871,12 +3871,17 @@ struct ggml_tensor * ggml_get_rows_back(
|
|||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
struct ggml_tensor * c) {
|
||||
GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT(ggml_is_matrix(c) && (a->ne[0] == c->ne[0]));
|
||||
GGML_ASSERT(b->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT(a->ne[0] == c->ne[0]);
|
||||
|
||||
// Support both 2D and 3D: result shape matches c (the source tensor shape)
|
||||
// TODO: implement non F32 return
|
||||
//struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
|
||||
struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]);
|
||||
struct ggml_tensor * result;
|
||||
if (c->ne[2] > 1) {
|
||||
result = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1], c->ne[2]);
|
||||
} else {
|
||||
result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]);
|
||||
}
|
||||
|
||||
result->op = GGML_OP_GET_ROWS_BACK;
|
||||
result->src[0] = a;
|
||||
|
|
@ -7077,9 +7082,12 @@ void ggml_build_backward_expand(
|
|||
continue;
|
||||
}
|
||||
|
||||
// inplace operations are currently not supported
|
||||
GGML_ASSERT(!node->view_src || node->op == GGML_OP_CPY || node->op == GGML_OP_VIEW ||
|
||||
node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE);
|
||||
// inplace operations are currently not supported — warn and skip instead of crashing
|
||||
if (node->view_src && node->op != GGML_OP_CPY && node->op != GGML_OP_VIEW &&
|
||||
node->op != GGML_OP_RESHAPE && node->op != GGML_OP_PERMUTE && node->op != GGML_OP_TRANSPOSE) {
|
||||
GGML_LOG_WARN("%s: skipping unsupported inplace op '%s' in backward graph\n", __func__, ggml_op_name(node->op));
|
||||
continue;
|
||||
}
|
||||
|
||||
const size_t ihash = ggml_hash_find(&cgraph->visited_hash_set, node);
|
||||
GGML_ASSERT(ihash != GGML_HASHSET_FULL);
|
||||
|
|
|
|||
|
|
@ -334,16 +334,26 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
|
|||
|
||||
auto * buft = ggml_backend_buffer_get_type(model_tensor->buffer);
|
||||
|
||||
// do not load loras to extra buffer types (i.e. bufts for repacking) -> use the CPU in that case
|
||||
// do not load loras to extra buffer types (i.e. bufts for repacking)
|
||||
// try device-native buft first (keeps LoRA on GPU), fall back to CPU only as last resort
|
||||
for (auto & ex : buft_extra) {
|
||||
if (ex == buft) {
|
||||
LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));
|
||||
|
||||
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
if (!cpu_dev) {
|
||||
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
||||
// try to get the device's native (non-repack) buffer type
|
||||
auto * dev = ggml_backend_buft_get_device(buft);
|
||||
auto * native_buft = dev ? ggml_backend_dev_buffer_type(dev) : nullptr;
|
||||
if (native_buft && native_buft != buft) {
|
||||
LLAMA_LOG_WARN("%s: lora for '%s' cannot use repack buft '%s', using device-native '%s'\n",
|
||||
__func__, model_tensor->name, ggml_backend_buft_name(buft), ggml_backend_buft_name(native_buft));
|
||||
buft = native_buft;
|
||||
} else {
|
||||
LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n",
|
||||
__func__, model_tensor->name, ggml_backend_buft_name(buft));
|
||||
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
if (!cpu_dev) {
|
||||
throw std::runtime_error(format("%s: no CPU backend found", __func__));
|
||||
}
|
||||
buft = ggml_backend_dev_buffer_type(cpu_dev);
|
||||
}
|
||||
buft = ggml_backend_dev_buffer_type(cpu_dev);
|
||||
|
||||
break;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue