llama.cpp/tests/gguf-model-data.h

43 lines
1.7 KiB
C++

#pragma once
#include "ggml.h"
#include <cstdint>
#include <optional>
#include <string>
#include <vector>
struct gguf_remote_tensor {
std::string name;
ggml_type type = GGML_TYPE_F32;
int64_t ne[4] = {1, 1, 1, 1}; // dimensions, unused dims = 1
uint32_t n_dims = 0;
};
struct gguf_remote_model {
// Selected KV metadata
std::string architecture; // general.architecture
uint32_t n_embd = 0; // <arch>.embedding_length
uint32_t n_ff = 0; // <arch>.feed_forward_length
uint32_t n_vocab = 0; // inferred from token_embd.weight ne[1]
uint32_t n_layer = 0; // <arch>.block_count
uint32_t n_head = 0; // <arch>.attention.head_count
uint32_t n_head_kv = 0; // <arch>.attention.head_count_kv
uint32_t n_expert = 0; // <arch>.expert_count (0 if absent)
uint32_t n_embd_head_k = 0; // <arch>.attention.key_length
uint32_t n_embd_head_v = 0; // <arch>.attention.value_length
uint16_t n_split = 0; // split.count (0 = not split)
uint32_t n_split_tensors = 0; // split.tensors.count (0 if not split)
std::vector<gguf_remote_tensor> tensors;
};
// Fetch model metadata from HuggingFace with local caching.
// repo: e.g., "ggml-org/Qwen3-32B-GGUF"
// quant: e.g., "Q8_0" -- auto-detects filename (including first shard of split models)
// Returns nullopt if download fails or network is unavailable.
std::optional<gguf_remote_model> gguf_fetch_model_meta(
const std::string & repo,
const std::string & quant = "Q8_0",
const std::string & cache_dir = ""); // empty = default