cont : initial deslop guidelines

This commit is contained in:
Georgi Gerganov 2026-03-15 10:27:24 +02:00
parent 64d6c8817b
commit d576ae3290
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
4 changed files with 56 additions and 35 deletions

View File

@ -1,8 +1,14 @@
#pragma once
#include "llama-context.h"
#include "ggml.h"
#include "stdint.h"
#include "llama.h"
// TODO: try to remove these headers
#include "llama-arch.h"
#include "llama-model.h"
#include "llama-quant.h"
#include <cstdint>
#include <vector>
// Reserve a new compute graph. It is valid until the next call to llama_graph_reserve.
LLAMA_API struct ggml_cgraph * llama_graph_reserve(
@ -10,3 +16,29 @@ LLAMA_API struct ggml_cgraph * llama_graph_reserve(
uint32_t n_tokens,
uint32_t n_seqs,
uint32_t n_outputs);
LLAMA_API ggml_type llama_ftype_get_default_type(llama_ftype ftype);
// TODO: use llama_quant_ prefix to name these consistently:
// Returns true if this tensor should be quantized (based on name, dims, params).
LLAMA_API bool tensor_allows_quantization(const llama_model_quantize_params * params, llm_arch arch, const ggml_tensor * tensor);
// TODO: add:
// LLAMA_API llama_quant * llama_quant_init(...);
// LLAMA_API void llama_quant_free(llama_quant * qnt);
// TODO: become member function of llama_quant
LLAMA_API ggml_type llama_tensor_get_type(
llama_quant & qs,
const llama_model_quantize_params * params,
const ggml_tensor * tensor,
ggml_type default_type,
const tensor_metadata & tm);
// Initialize llama_quant counters and populate tensor_metadata categories.
// metadata: vector with name fields already set, will have category field populated.
// TODO: become member function of llama_quant
LLAMA_API void init_quantize_state_counters(
llama_quant & qs,
std::vector<tensor_metadata> & metadata);

View File

@ -2,6 +2,7 @@
#include "llama-impl.h"
#include "llama-model.h"
#include "llama-model-loader.h"
#include "llama-ext.h"
#include <algorithm>
#include <cmath>
@ -138,7 +139,7 @@ struct compiled_tensor_type_patterns {
std::vector<std::pair<std::regex, ggml_type>> patterns;
};
quantize_state_impl::quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
llama_quant::llama_quant(const llama_model & model, const llama_model_quantize_params * params)
: model(model), params(params)
{
if (params->tensor_types) {
@ -152,7 +153,7 @@ quantize_state_impl::quantize_state_impl(const llama_model & model, const llama_
}
}
quantize_state_impl::~quantize_state_impl() = default;
llama_quant::~llama_quant() = default;
//
// dequantization
@ -302,7 +303,7 @@ bool tensor_allows_quantization(const llama_model_quantize_params * params, llm_
//
// incompatible tensor shapes are handled here - fallback to a compatible type
static ggml_type tensor_type_fallback(quantize_state_impl & qs, const ggml_tensor * t, const ggml_type target_type) {
static ggml_type tensor_type_fallback(llama_quant & qs, const ggml_tensor * t, const ggml_type target_type) {
ggml_type return_type = target_type;
const int64_t ncols = t->ne[0];
@ -351,7 +352,7 @@ static ggml_type tensor_type_fallback(quantize_state_impl & qs, const ggml_tenso
}
// internal standard logic for selecting the target tensor type based on tensor category, ftype, and model arch
static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype, tensor_category category) {
static ggml_type llama_tensor_get_type_impl(llama_quant & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype, tensor_category category) {
const std::string name = ggml_get_name(tensor);
// TODO: avoid hardcoded tensor names - use the TN_* constants
@ -601,7 +602,7 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type
}
// outer wrapper: determine the ggml_type that this tensor should be quantized to
ggml_type llama_tensor_get_type(quantize_state_impl & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, ggml_type default_type, const tensor_metadata & tm) {
ggml_type llama_tensor_get_type(llama_quant & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, ggml_type default_type, const tensor_metadata & tm) {
if (!tensor_allows_quantization(params, qs.model.arch, tensor)) {
return tensor->type;
}
@ -776,7 +777,7 @@ ggml_type llama_ftype_get_default_type(llama_ftype ftype) {
}
void init_quantize_state_counters(quantize_state_impl & qs, std::vector<tensor_metadata> & metadata) {
void init_quantize_state_counters(llama_quant & qs, std::vector<tensor_metadata> & metadata) {
for (auto & tm : metadata) {
tensor_category cat = tensor_get_category(tm.name);
tm.category = cat;
@ -835,7 +836,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
model.load_hparams(ml);
model.load_stats (ml);
quantize_state_impl qs(model, params);
llama_quant qs(model, params);
if (params->only_copy) {
ftype = ml.ftype;

View File

@ -2,16 +2,13 @@
#include "llama.h"
#include "ggml.h"
#include "llama-arch.h"
#include <memory>
#include <string>
#include <vector>
struct llama_model;
// TODO: use llama_quant_ prefix to name these consistently:
// tensor categorization - used to avoid repeated string matching in quantization logic.
// this is different from LLM_TN - we want broad categories, not specific tensor names per arch.
enum class tensor_category {
@ -30,6 +27,7 @@ enum class tensor_category {
};
// per-tensor metadata, computed in the preliminary loop and used in the main loop
// TODO: probably should belong to llama_quant
struct tensor_metadata {
std::string name;
ggml_type target_type;
@ -48,7 +46,7 @@ struct tensor_type_option {
struct compiled_tensor_type_patterns;
struct quantize_state_impl {
struct llama_quant {
const llama_model & model;
const llama_model_quantize_params * params;
@ -72,16 +70,6 @@ struct quantize_state_impl {
// tensor type override patterns (compiled once, used in llama_tensor_get_type)
std::unique_ptr<compiled_tensor_type_patterns> tensor_type_patterns;
quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params);
~quantize_state_impl();
llama_quant(const llama_model & model, const llama_model_quantize_params * params);
~llama_quant();
};
ggml_type llama_tensor_get_type(quantize_state_impl & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, ggml_type default_type, const tensor_metadata & tm);
ggml_type llama_ftype_get_default_type(llama_ftype ftype);
// Initialize quantize_state_impl counters and populate tensor_metadata categories.
// metadata: vector with name fields already set, will have category field populated.
void init_quantize_state_counters(quantize_state_impl & qs, std::vector<tensor_metadata> & metadata);
// Returns true if this tensor should be quantized (based on name, dims, params).
bool tensor_allows_quantization(const llama_model_quantize_params * params, llm_arch arch, const ggml_tensor * tensor);

View File

@ -1,11 +1,9 @@
#include "../src/llama-arch.h"
#include "../src/llama-model.h"
#include "../src/llama-quant.h"
#include "ggml-cpp.h"
#include "ggml.h"
#include "gguf-model-data.h"
#include "llama.h"
#include "../src/llama-ext.h"
#include "gguf-model-data.h"
#include <cstdio>
#include <cstring>
#include <fstream>
@ -323,13 +321,15 @@ static std::string read_file_contents(const std::string & path) {
// ---------------------------------------------------------------------------
// Returns {tensor_name, assigned_type} for each tensor, in order.
// TODO: should likely be moved as a member function of llama_quant and expose through the `llama-ext.h` interface
static std::vector<std::pair<std::string, ggml_type>> compute_quant_types(llama_model & mdl,
const std::vector<mock_tensor> & tensors,
llama_ftype ftype) {
llama_model_quantize_params qparams = llama_model_quantize_default_params();
qparams.ftype = ftype;
quantize_state_impl qs(mdl, &qparams);
// TODO: call llama_quant_init(...)
llama_quant qs(mdl, &qparams);
std::vector<tensor_metadata> metadata(tensors.size());
for (size_t i = 0; i < tensors.size(); ++i) {