diff --git a/common/sampling.cpp b/common/sampling.cpp index c33d58ae5e..27b2a082b0 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -691,4 +691,4 @@ llama_token common_sampler_sample_speculative(struct common_sampler * gsmpl, str } return best_id; -} \ No newline at end of file +} diff --git a/common/sampling.h b/common/sampling.h index c7101032f2..90c2401c2f 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -115,3 +115,5 @@ struct common_sampler_deleter { }; typedef std::unique_ptr common_sampler_ptr; + +llama_token common_sampler_sample_speculative(struct common_sampler * gsmpl, struct llama_context * ctx, int idx); diff --git a/common/speculative.cpp b/common/speculative.cpp index 136f2c1b1a..548394bbe8 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -361,8 +361,8 @@ llama_tokens common_speculative_gen_draft( } llama_tokens mtp_speculative_gen_draft( - struct common_sampler* smpl, - struct llama_context* ctx, + struct common_sampler * smpl, + struct llama_context * ctx, struct common_speculative_params params, llama_token id_last, int32_t n_past, diff --git a/common/speculative.h b/common/speculative.h index a33c5a8b02..d22a752d3f 100644 --- a/common/speculative.h +++ b/common/speculative.h @@ -57,8 +57,8 @@ llama_tokens common_speculative_gen_draft( * @return std::vector The generated draft tokens. */ llama_tokens mtp_speculative_gen_draft( - struct common_sampler* smpl, - struct llama_context* ctx, + struct common_sampler * smpl, + struct llama_context * ctx, struct common_speculative_params params, llama_token id_last, int32_t n_past, diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 925b9b805d..dca005da35 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -2001,7 +2001,7 @@ struct server_context_impl { llama_set_draft_input_hidden_state(ctx, llama_get_embeddings_ith(ctx, -1)); draft = mtp_speculative_gen_draft( - slot.smpl, + slot.smpl.get(), ctx, params_spec, slot.sampled,