Merge remote-tracking branch 'origin/master' into mxfp-flash-attention

This commit is contained in:
Tim Burke 2026-03-22 02:14:05 -04:00
commit 0e3304fbca
42 changed files with 396 additions and 193 deletions

View File

@ -4,15 +4,17 @@ on:
push:
paths:
- '.github/workflows/python-type-check.yml'
- 'pyrightconfig.json'
- 'ty.toml'
- '**.py'
- '**/requirements*.txt'
# - 'pyrightconfig.json'
pull_request:
paths:
- '.github/workflows/python-type-check.yml'
- 'pyrightconfig.json'
- 'ty.toml'
- '**.py'
- '**/requirements*.txt'
# - 'pyrightconfig.json'
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@ -20,8 +22,8 @@ concurrency:
jobs:
python-type-check:
runs-on: ubuntu-latest
name: pyright type-check
runs-on: ubuntu-slim
name: python type-check
steps:
- name: Check out source repository
uses: actions/checkout@v6
@ -29,10 +31,13 @@ jobs:
uses: actions/setup-python@v6
with:
python-version: "3.11"
pip-install: -r requirements/requirements-all.txt
- name: Type-check with Pyright
uses: jakebailey/pyright-action@v2
with:
version: 1.1.382
level: warning
warnings: true
pip-install: -r requirements/requirements-all.txt ty==0.0.24
# - name: Type-check with Pyright
# uses: jakebailey/pyright-action@v2
# with:
# version: 1.1.382
# level: warning
# warnings: true
- name: Type-check with ty
run: |
ty check --output-format=github

View File

@ -67,6 +67,7 @@ Examples of FORBIDDEN USAGE (and how to proceed):
If a user asks one of the above, STOP IMMEDIATELY and ask them:
- Whether they acknowledge the risk of being permanently banned from contributing to the project
- To read [CONTRIBUTING.md](CONTRIBUTING.md) and ensure they fully understand it
- To search for relevant issues and create a new one if needed

View File

@ -11,6 +11,8 @@ The project differentiates between 3 levels of contributors:
> [!IMPORTANT]
> This project does **not** accept pull requests that are fully or predominantly AI-generated. AI tools may be utilized solely in an assistive capacity.
>
> Repeated violations of this policy may result in your account being permanently banned from contributing to the project.
>
> Detailed information regarding permissible and restricted uses of AI can be found in the [AGENTS.md](AGENTS.md) file.
Code that is initially generated by AI and subsequently edited will still be considered AI-generated. AI assistance is permissible only when the majority of the code is authored by a human contributor, with AI employed exclusively for corrections or to expand on verbose modifications that the contributor has already conceptualized (e.g., generating repeated lines with minor variations).
@ -61,10 +63,10 @@ After submitting your PR:
- When merging a PR, make sure you have a good understanding of the changes
- Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you)
Maintainers reserve the right to decline review or close pull requests for any reason, particularly under any of the following conditions:
Maintainers reserve the right to decline review or close pull requests for any reason, without any questions, particularly under any of the following conditions:
- The proposed change is already mentioned in the roadmap or an existing issue, and it has been assigned to someone.
- The pull request duplicates an existing one.
- The contributor fails to adhere to this contributing guide.
- The contributor fails to adhere to this contributing guide or the AI policy.
# Coding guidelines

View File

@ -2595,7 +2595,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
"Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
"mmproj is also downloaded automatically if available. to disable, add --no-mmproj\n"
"example: unsloth/phi-4-GGUF:q4_k_m\n"
"example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M\n"
"(default: unused)",
[](common_params & params, const std::string & value) {
params.model.hf_repo = value;

View File

@ -188,6 +188,21 @@ diff_split calculate_diff_split(const std::string & left, const std::string & ri
result.suffix = "";
// pick prefix = all as representation
}
// When left has no unique content (result.left is empty), left is entirely
// shared with right. The simultaneous prefix/suffix segment matching can
// incorrectly consume trailing segments of left as suffix when those same
// segments also appear at the end of right (e.g. "\n" at the end of both
// the shared content and the generation prompt). This rotates the diff.
// Fix: if left is a prefix of right, enforce that directly.
if (result.left.empty() && !result.right.empty() &&
left.size() <= right.size() &&
right.substr(0, left.size()) == left) {
result.prefix = left;
result.suffix = "";
result.right = right.substr(left.size());
}
return result;
}

View File

@ -31,10 +31,10 @@ import gguf
from gguf.vocab import MistralTokenizerType, MistralVocab
try:
from mistral_common.tokens.tokenizers.base import TokenizerVersion # pyright: ignore[reportMissingImports]
from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # pyright: ignore[reportMissingImports]
from mistral_common.tokens.tokenizers.tekken import Tekkenizer # pyright: ignore[reportMissingImports]
from mistral_common.tokens.tokenizers.sentencepiece import ( # pyright: ignore[reportMissingImports]
from mistral_common.tokens.tokenizers.base import TokenizerVersion # type: ignore[import-not-found]
from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # type: ignore[import-not-found]
from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found]
from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found]
SentencePieceTokenizer,
)
@ -45,9 +45,9 @@ except ImportError:
_MISTRAL_COMMON_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
_mistral_common_installed = False
TokenizerVersion = None
Tekkenizer = None
SentencePieceTokenizer = None
TokenizerVersion: Any = None
Tekkenizer: Any = None
SentencePieceTokenizer: Any = None
_mistral_import_error_msg = (
"Mistral format requires `mistral-common` to be installed. Please run "
"`pip install mistral-common[image,audio]` to install it."
@ -145,6 +145,7 @@ class ModelBase:
self.model_name = model_name
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
self._is_nvfp4 = False
self._is_mxfp4 = False
# Apply heuristics to figure out typical tensor encoding based on first tensor's dtype
# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
@ -220,7 +221,7 @@ class ModelBase:
if weight_map is None or not isinstance(weight_map, dict):
raise ValueError(f"Can't load 'weight_map' from {index_name!r}")
tensor_names_from_index.update(weight_map.keys())
part_dict: dict[str, None] = dict.fromkeys(weight_map.values(), None)
part_dict: dict[str, None] = dict.fromkeys(weight_map.values(), None) # ty: ignore[invalid-assignment]
part_names = sorted(part_dict.keys())
else:
weight_map = {}
@ -712,6 +713,7 @@ class ModelBase:
def prepare_tensors(self):
# detect NVFP4 quantization (ModelOpt format)
quant_algo = (self.hparams.get("quantization_config") or {}).get("quant_algo")
quant_method = (self.hparams.get("quantization_config") or {}).get("quant_method")
quant_layers = (self.hparams.get("quantization_config") or {}).get("quantized_layers") or {}
quant_config_file = self.dir_model / "hf_quant_config.json"
@ -728,6 +730,7 @@ class ModelBase:
quant_algo = "NVFP4"
self._is_nvfp4 = quant_algo == "NVFP4"
self._is_mxfp4 = quant_method == "mxfp4"
# NVFP4 weights are repacked and written directly to gguf_writer.
# This must run before dequant_model so NVFP4 tensors are removed
@ -876,6 +879,12 @@ class ModelBase:
if self.metadata.name is None:
self.metadata.name = self.dir_model.name
if self.ftype in (gguf.LlamaFileType.ALL_F32, gguf.LlamaFileType.MOSTLY_F16, gguf.LlamaFileType.MOSTLY_BF16):
if self._is_nvfp4:
self.ftype = gguf.LlamaFileType.MOSTLY_NVFP4
elif self._is_mxfp4:
self.ftype = gguf.LlamaFileType.MOSTLY_MXFP4_MOE
# Generate parameter weight class (useful for leader boards) if not yet determined
if self.metadata.size_label is None and total_params > 0:
self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count)
@ -5882,7 +5891,7 @@ class InternLM2Model(TextModel):
logger.error(f'Error: Missing {tokenizer_path}')
sys.exit(1)
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
@ -6203,7 +6212,7 @@ class BertModel(TextModel):
vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size)
else:
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
@ -8880,7 +8889,7 @@ class T5Model(TextModel):
if not tokenizer_path.is_file():
raise FileNotFoundError(f"File not found: {tokenizer_path}")
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
# some models like Pile-T5 family use BPE tokenizer instead of Unigram
@ -9017,7 +9026,7 @@ class T5EncoderModel(TextModel):
if not tokenizer_path.is_file():
raise FileNotFoundError(f"File not found: {tokenizer_path}")
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
# some models like Pile-T5 family use BPE tokenizer instead of Unigram
@ -11125,8 +11134,7 @@ class GptOssModel(TextModel):
# TODO: remove once MXFP4 is supported more generally
def dequant_model(self):
quant_config = self.hparams.get("quantization_config")
if quant_config is not None and quant_config.get("quant_method") == "mxfp4":
if self._is_mxfp4:
return
return super().dequant_model()
@ -12279,6 +12287,7 @@ class LazyTorchTensor(gguf.LazyBase):
kwargs = {}
if func is torch.Tensor.numpy:
assert len(args)
return args[0].numpy()
return cls._wrap_fn(func)(*args, **kwargs)

View File

@ -112,11 +112,11 @@ class Tensor:
(n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
assert name_len < 4096, 'Absurd tensor name length'
quant = gguf.GGML_QUANT_SIZES.get(dtype)
self.dtype = gguf.GGMLQuantizationType(dtype)
quant = gguf.GGML_QUANT_SIZES.get(self.dtype)
assert quant is not None, 'Unknown tensor type'
(blksize, tysize) = quant
offset += 12
self.dtype= gguf.GGMLQuantizationType(dtype)
self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
offset += 4 * n_dims
self.name = bytes(data[offset:offset + name_len])

View File

@ -199,10 +199,13 @@ class LoraTorchTensor:
kwargs = {}
if func is torch.permute:
assert len(args)
return type(args[0]).permute(*args, **kwargs)
elif func is torch.reshape:
assert len(args)
return type(args[0]).reshape(*args, **kwargs)
elif func is torch.stack:
assert len(args)
assert isinstance(args[0], Sequence)
dim = kwargs.get("dim", 0)
assert dim == 0
@ -211,6 +214,7 @@ class LoraTorchTensor:
torch.stack([b._lora_B for b in args[0]], dim),
)
elif func is torch.cat:
assert len(args)
assert isinstance(args[0], Sequence)
dim = kwargs.get("dim", 0)
assert dim == 0
@ -362,7 +366,7 @@ if __name__ == '__main__':
logger.error(f"Model {hparams['architectures'][0]} is not supported")
sys.exit(1)
class LoraModel(model_class):
class LoraModel(model_class): # ty: ignore[unsupported-base]
model_arch = model_class.model_arch
lora_alpha: float

View File

@ -28,9 +28,6 @@ def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
return f'({result})?' if min_items == 0 else result
def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], out: list, decimals_left: int = 16, top_level: bool = True):
has_min = min_value != None
has_max = max_value != None
def digit_range(from_char: str, to_char: str):
out.append("[")
if from_char == to_char:
@ -106,7 +103,7 @@ def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], ou
out.append(to_str[i])
out.append("]")
if has_min and has_max:
if min_value is not None and max_value is not None:
if min_value < 0 and max_value < 0:
out.append("\"-\" (")
_generate_min_max_int(-max_value, -min_value, out, decimals_left, top_level=True)
@ -133,7 +130,7 @@ def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], ou
less_decimals = max(decimals_left - 1, 1)
if has_min:
if min_value is not None:
if min_value < 0:
out.append("\"-\" (")
_generate_min_max_int(None, -min_value, out, decimals_left, top_level=False)
@ -177,7 +174,7 @@ def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], ou
more_digits(length - 1, less_decimals)
return
if has_max:
if max_value is not None:
if max_value >= 0:
if top_level:
out.append("\"-\" [1-9] ")

View File

@ -64,7 +64,7 @@ def load_model_and_tokenizer(model_path, use_sentence_transformers=False, device
print("Using SentenceTransformer to apply all numbered layers")
model = SentenceTransformer(model_path)
tokenizer = model.tokenizer
config = model[0].auto_model.config # type: ignore
config = model[0].auto_model.config
else:
tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
@ -108,8 +108,8 @@ def load_model_and_tokenizer(model_path, use_sentence_transformers=False, device
print(f"Model file: {type(model).__module__}")
# Verify the model is using the correct sliding window
if hasattr(model.config, 'sliding_window'): # type: ignore
print(f"Model's sliding_window: {model.config.sliding_window}") # type: ignore
if hasattr(model.config, 'sliding_window'):
print(f"Model's sliding_window: {model.config.sliding_window}")
else:
print("Model config does not have sliding_window attribute")
@ -152,7 +152,7 @@ def main():
device = next(model.parameters()).device
else:
# For SentenceTransformer, get device from the underlying model
device = next(model[0].auto_model.parameters()).device # type: ignore
device = next(model[0].auto_model.parameters()).device
model_name = os.path.basename(model_path)
@ -177,7 +177,7 @@ def main():
print(f"{token_id:6d} -> '{token_str}'")
print(f"Embeddings shape (after all SentenceTransformer layers): {all_embeddings.shape}")
print(f"Embedding dimension: {all_embeddings.shape[1] if len(all_embeddings.shape) > 1 else all_embeddings.shape[0]}") # type: ignore
print(f"Embedding dimension: {all_embeddings.shape[1] if len(all_embeddings.shape) > 1 else all_embeddings.shape[0]}")
else:
# Standard approach: use base model output only
encoded = tokenizer(
@ -205,12 +205,12 @@ def main():
print(f"Embedding dimension: {all_embeddings.shape[1]}")
if len(all_embeddings.shape) == 1:
n_embd = all_embeddings.shape[0] # type: ignore
n_embd = all_embeddings.shape[0]
n_embd_count = 1
all_embeddings = all_embeddings.reshape(1, -1)
else:
n_embd = all_embeddings.shape[1] # type: ignore
n_embd_count = all_embeddings.shape[0] # type: ignore
n_embd = all_embeddings.shape[1]
n_embd_count = all_embeddings.shape[0]
print()

View File

@ -2,7 +2,7 @@
import argparse
import sys
from common import compare_tokens # type: ignore
from common import compare_tokens # type: ignore[import-not-found]
def parse_arguments():

View File

@ -6,7 +6,7 @@ import re
from copy import copy
from enum import Enum
from inspect import getdoc, isclass
from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin, get_type_hints
from typing import TYPE_CHECKING, Any, Callable, Optional, Union, get_args, get_origin, get_type_hints
from docstring_parser import parse
from pydantic import BaseModel, create_model
@ -1158,7 +1158,7 @@ def create_dynamic_model_from_function(func: Callable[..., Any]):
# Assert that the parameter has a type annotation
if param.annotation == inspect.Parameter.empty:
raise TypeError(f"Parameter '{param.name}' in function '{func.__name__}' lacks a type annotation")
raise TypeError(f"""Parameter '{param.name}' in function '{getattr(func, "__name__", "")}' lacks a type annotation""")
# Find the parameter's description in the docstring
param_doc = next((d for d in docstring.params if d.arg_name == param.name), None)
@ -1166,7 +1166,7 @@ def create_dynamic_model_from_function(func: Callable[..., Any]):
# Assert that the parameter has a description
if not param_doc or not param_doc.description:
raise ValueError(
f"Parameter '{param.name}' in function '{func.__name__}' lacks a description in the docstring")
f"""Parameter '{param.name}' in function '{getattr(func, "__name__", "")}' lacks a description in the docstring""")
# Add parameter details to the schema
param_docs.append((param.name, param_doc))
@ -1177,7 +1177,7 @@ def create_dynamic_model_from_function(func: Callable[..., Any]):
dynamic_fields[param.name] = (
param.annotation if param.annotation != inspect.Parameter.empty else str, default_value)
# Creating the dynamic model
dynamic_model = create_model(f"{func.__name__}", **dynamic_fields)
dynamic_model = create_model(f"{getattr(func, '__name__')}", **dynamic_fields)
for name, param_doc in param_docs:
dynamic_model.model_fields[name].description = param_doc.description
@ -1285,7 +1285,7 @@ def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name:
if items != {}:
array = {"properties": items}
array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items")
fields[field_name] = (List[array_type], ...)
fields[field_name] = (list[array_type], ...) # ty: ignore[invalid-type-form]
else:
fields[field_name] = (list, ...)
elif field_type == "object":

View File

@ -3194,6 +3194,7 @@ class tinyBLAS_PPC {
private:
__attribute__((always_inline))
inline void save_acc(acc_t * ACC, int64_t ii, int64_t jj) {
vec_t vec_C[4];
__builtin_mma_disassemble_acc(vec_C, ACC);
@ -3204,6 +3205,7 @@ class tinyBLAS_PPC {
}
}
__attribute__((always_inline))
inline void add_save_acc(acc_t * ACC, int64_t ii, int64_t jj) {
vec_t vec_C[4];
__builtin_mma_disassemble_acc(vec_C, ACC);

View File

@ -1162,12 +1162,18 @@ ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rp
return nullptr;
}
// Fix: Prevent division by zero if blck_size is 0 (e.g., deprecated types)
if (ggml_blck_size((enum ggml_type)tensor->type) == 0) {
GGML_LOG_ERROR("[%s] invalid tensor type received (blck_size is 0): %u\n", __func__, tensor->type);
return nullptr;
}
ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type,
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
// ggml_new_tensor_4d might fail if dimensions are invalid, although less likely to crash than invalid type
if (result == nullptr) {
GGML_LOG_ERROR("[%s] ggml_new_tensor_4d failed for type %u\\n", __func__, tensor->type);
GGML_LOG_ERROR("[%s] ggml_new_tensor_4d failed for type %u\n", __func__, tensor->type);
return nullptr;
}

View File

@ -16048,6 +16048,7 @@ static uint32_t ggml_vk_intel_shader_core_count(const vk::PhysicalDevice& vkdev)
case 0xE20C: // B570
return 18;
case 0xE20B: // B580
case 0xE211: // Pro B60
return 20;
default:
return 0;

View File

@ -3869,6 +3869,8 @@ class LlamaFileType(IntEnum):
# MOSTLY_Q4_0_8_8 = 35 # removed from gguf files, use Q4_0 and runtime repack
MOSTLY_TQ1_0 = 36 # except 1d tensors
MOSTLY_TQ2_0 = 37 # except 1d tensors
MOSTLY_MXFP4_MOE = 38 # except 1d tensors
MOSTLY_NVFP4 = 39 # except 1d tensors
GUESSED = 1024 # not specified in the model file

View File

@ -1300,7 +1300,7 @@ class GGUFWriter:
else:
raise ValueError("Invalid GGUF metadata value type or value")
return kv_data
return bytes(kv_data)
@staticmethod
def format_n_bytes_to_str(num: int) -> str:

View File

@ -138,7 +138,7 @@ class LazyBase(ABC, metaclass=LazyMeta):
if isinstance(meta_noop, tuple):
dtype, shape = meta_noop
assert callable(shape)
res = cls.meta_with_dtype_and_shape(dtype, shape(res.shape))
res = cls.meta_with_dtype_and_shape(dtype, shape(res.shape)) # ty: ignore[call-top-callable]
else:
res = cls.meta_with_dtype_and_shape(meta_noop, res.shape)

View File

@ -91,11 +91,11 @@ class __Quant(ABC):
def __init_subclass__(cls, qtype: GGMLQuantizationType) -> None:
cls.qtype = qtype
cls.block_size, cls.type_size = GGML_QUANT_SIZES[qtype]
cls.__quantize_lazy = LazyNumpyTensor._wrap_fn(
cls.__quantize_lazy: Any = LazyNumpyTensor._wrap_fn(
cls.__quantize_array,
meta_noop=(np.uint8, cls.__shape_to_bytes)
)
cls.__dequantize_lazy = LazyNumpyTensor._wrap_fn(
cls.__dequantize_lazy: Any = LazyNumpyTensor._wrap_fn(
cls.__dequantize_array,
meta_noop=(np.float32, cls.__shape_from_bytes)
)

View File

@ -11,33 +11,33 @@ from typing import Any, Callable, Sequence, Mapping, Iterable, Protocol, ClassVa
try:
from sentencepiece import SentencePieceProcessor
except ImportError:
SentencePieceProcessor = None
SentencePieceProcessor: Any = None
try:
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer # pyright: ignore[reportMissingImports]
from mistral_common.tokens.tokenizers.tekken import Tekkenizer # pyright: ignore[reportMissingImports]
from mistral_common.tokens.tokenizers.utils import ( # pyright: ignore[reportMissingImports]
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer # type: ignore[import-not-found]
from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found]
from mistral_common.tokens.tokenizers.utils import ( # type: ignore[import-not-found]
_filter_valid_tokenizer_files,
)
from mistral_common.tokens.tokenizers.sentencepiece import ( # pyright: ignore[reportMissingImports]
from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found]
SentencePieceTokenizer,
)
except ImportError:
_mistral_common_installed = False
MistralTokenizer = None
Tekkenizer = None
SentencePieceTokenizer = None
_filter_valid_tokenizer_files = None
MistralTokenizer: Any = None
Tekkenizer: Any = None
SentencePieceTokenizer: Any = None
_filter_valid_tokenizer_files: Any = None
else:
_mistral_common_installed = True
try:
from mistral_common.tokens.tokenizers.utils import ( # pyright: ignore[reportMissingImports]
from mistral_common.tokens.tokenizers.utils import ( # type: ignore[import-not-found]
get_one_valid_tokenizer_file,
)
except ImportError:
# We still want the conversion to work with older mistral-common versions.
get_one_valid_tokenizer_file = None
get_one_valid_tokenizer_file: Any = None
import gguf
@ -703,7 +703,7 @@ class MistralVocab(Vocab):
tokenizer_file_path = base_path / tokenizer_file
self.tokenizer = MistralTokenizer.from_file(
self.tokenizer: Any = MistralTokenizer.from_file(
tokenizer_file_path
).instruct_tokenizer.tokenizer
self.tokenizer_type = (

View File

@ -1,5 +1,5 @@
{
"extraPaths": ["gguf-py", "examples/model-conversion/scripts"],
"extraPaths": ["gguf-py", "examples/model-conversion/scripts", "examples/model-conversion/scripts/utils"],
"pythonVersion": "3.9",
"pythonPlatform": "All",
"reportUnusedImport": "warning",

View File

@ -684,6 +684,7 @@ else:
sys.exit(1)
assert isinstance(hexsha8_baseline, str)
name_baseline = bench_data.get_commit_name(hexsha8_baseline)
hexsha8_compare = name_compare = None
@ -717,6 +718,7 @@ else:
parser.print_help()
sys.exit(1)
assert isinstance(hexsha8_compare, str)
name_compare = bench_data.get_commit_name(hexsha8_compare)
# Get tool-specific configuration

View File

@ -241,10 +241,10 @@ class CodeEditor(QPlainTextEdit):
if not self.isReadOnly():
selection = QTextEdit.ExtraSelection()
line_color = QColorConstants.Yellow.lighter(160)
selection.format.setBackground(line_color) # pyright: ignore[reportAttributeAccessIssue]
selection.format.setProperty(QTextFormat.Property.FullWidthSelection, True) # pyright: ignore[reportAttributeAccessIssue]
selection.cursor = self.textCursor() # pyright: ignore[reportAttributeAccessIssue]
selection.cursor.clearSelection() # pyright: ignore[reportAttributeAccessIssue]
selection.format.setBackground(line_color) # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
selection.format.setProperty(QTextFormat.Property.FullWidthSelection, True) # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
selection.cursor = self.textCursor() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
selection.cursor.clearSelection() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
extra_selections.append(selection)
self.setExtraSelections(extra_selections)
@ -262,8 +262,8 @@ class CodeEditor(QPlainTextEdit):
)
extra = QTextEdit.ExtraSelection()
extra.format.setBackground(color.lighter(160)) # pyright: ignore[reportAttributeAccessIssue]
extra.cursor = cursor # pyright: ignore[reportAttributeAccessIssue]
extra.format.setBackground(color.lighter(160)) # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
extra.cursor = cursor # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
self.setExtraSelections(self.extraSelections() + [extra])
@ -274,8 +274,8 @@ class CodeEditor(QPlainTextEdit):
cursor.select(QTextCursor.SelectionType.LineUnderCursor)
extra = QTextEdit.ExtraSelection()
extra.format.setBackground(color.lighter(160)) # pyright: ignore[reportAttributeAccessIssue]
extra.cursor = cursor # pyright: ignore[reportAttributeAccessIssue]
extra.format.setBackground(color.lighter(160)) # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
extra.cursor = cursor # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
self.setExtraSelections(self.extraSelections() + [extra])
@ -395,8 +395,8 @@ class JinjaTester(QMainWindow):
ensure_ascii=ensure_ascii,
)
)
env.globals["strftime_now"] = lambda format: datetime.now().strftime(format)
env.globals["raise_exception"] = raise_exception
env.globals["strftime_now"] = lambda format: datetime.now().strftime(format) # ty: ignore[invalid-assignment]
env.globals["raise_exception"] = raise_exception # ty: ignore[invalid-assignment]
try:
template = env.from_string(template_str)
output = template.render(context)

View File

@ -189,6 +189,7 @@ def benchmark(
data: list[dict] = []
assert isinstance(prompts, list)
for i, p in enumerate(prompts):
if seed_offset >= 0:
random.seed(3 * (seed_offset + 1000 * i) + 1)

View File

@ -1347,8 +1347,11 @@ int llama_context::encode(const llama_batch & batch_inp) {
const llama_seq_id seq_id = ubatch.seq_id_unq[s];
const int32_t seq_idx = ubatch.seq_idx[seq_id];
embd_seq_out[seq_id].resize(n_embd);
ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_idx)*sizeof(float), n_embd*sizeof(float));
// use n_embd_out (not n_embd_inp) - the pooled embedding has the model's
// output dimension, which differs from input dimension for deepstack models (e.g. qwen3vl)
const uint32_t n_embd_out = hparams.n_embd_out();
embd_seq_out[seq_id].resize(n_embd_out);
ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd_out*seq_idx)*sizeof(float), n_embd_out*sizeof(float));
}
} break;
case LLAMA_POOLING_TYPE_RANK:
@ -1769,12 +1772,16 @@ int llama_context::decode(const llama_batch & batch_inp) {
// extract sequence embeddings (cleared before processing each batch)
auto & embd_seq_out = embd_seq;
// use n_embd_out (not n_embd_inp) - the pooled embedding has the model's
// output dimension, which differs from input dimension for deepstack models (e.g. qwen3vl)
const uint32_t n_embd_out = hparams.n_embd_out();
for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
const llama_seq_id seq_id = ubatch.seq_id_unq[s];
const int32_t seq_idx = ubatch.seq_idx[seq_id];
embd_seq_out[seq_id].resize(n_embd);
ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_idx)*sizeof(float), n_embd*sizeof(float));
embd_seq_out[seq_id].resize(n_embd_out);
ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd_out*seq_idx)*sizeof(float), n_embd_out*sizeof(float));
}
} break;
case LLAMA_POOLING_TYPE_RANK:

View File

@ -7,6 +7,7 @@
#include <cmath>
#include <algorithm>
#include <cstdint>
#include <set>
#include <stdexcept>
#define MAX_REPETITION_THRESHOLD 2000
@ -454,6 +455,7 @@ const char * llama_grammar_parser::parse_sequence(
bool is_nested) {
size_t last_sym_start = rule.size();
const char * pos = src;
uint64_t n_prev_rules = 1;
// use UINT64_MAX as the empty value because we aligned to the proper uint64_t type so -1 can't be used
// (though it's technically the same as -1 now)
@ -481,6 +483,18 @@ const char * llama_grammar_parser::parse_sequence(
// S' ::= S |
llama_grammar_rule prev_rule(rule.begin() + last_sym_start, rule.end());
// Calculate the total number of rules that will be generated by this repetition
uint64_t total_rules = 1; // Start with 1 for the original rule
if (!no_max && max_times > 0) {
total_rules = max_times;
} else if (min_times > 0) {
total_rules = min_times;
}
if (n_prev_rules * total_rules >= MAX_REPETITION_THRESHOLD) {
throw std::runtime_error("number of rules that are going to be repeated multiplied by the new repetition exceeds sane defaults, please reduce the number of repetitions or rule complexity");
}
if (min_times == 0) {
rule.resize(last_sym_start);
} else {
@ -508,12 +522,15 @@ const char * llama_grammar_parser::parse_sequence(
if (n_opt > 0) {
rule.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
}
n_prev_rules *= total_rules;
GGML_ASSERT(n_prev_rules >= 1);
};
while (*pos) {
if (*pos == '"') { // literal string
pos++;
last_sym_start = rule.size();
n_prev_rules = 1;
while (*pos != '"') {
if (!*pos) {
throw std::runtime_error("unexpected end of input");
@ -531,6 +548,7 @@ const char * llama_grammar_parser::parse_sequence(
start_type = LLAMA_GRETYPE_CHAR_NOT;
}
last_sym_start = rule.size();
n_prev_rules = 1;
while (*pos != ']') {
if (!*pos) {
throw std::runtime_error("unexpected end of input");
@ -561,6 +579,7 @@ const char * llama_grammar_parser::parse_sequence(
auto token_pair = parse_token(vocab, pos);
const char * token_end = token_pair.second;
last_sym_start = rule.size();
n_prev_rules = 1;
rule.push_back({type, token_pair.first});
pos = parse_space(token_end, is_nested);
} else if (is_word_char(*pos)) { // rule reference
@ -568,12 +587,15 @@ const char * llama_grammar_parser::parse_sequence(
uint32_t ref_rule_id = get_symbol_id(pos, name_end - pos);
pos = parse_space(name_end, is_nested);
last_sym_start = rule.size();
n_prev_rules = 1;
rule.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
} else if (*pos == '(') { // grouping
// parse nested alternates into synthesized rule
pos = parse_space(pos + 1, true);
uint32_t n_rules_before = symbol_ids.size();
uint32_t sub_rule_id = generate_symbol_id(rule_name);
pos = parse_alternates(pos, rule_name, sub_rule_id, true);
n_prev_rules = std::max(1u, (uint32_t)symbol_ids.size() - n_rules_before);
last_sym_start = rule.size();
// output reference to synthesized rule
rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
@ -583,6 +605,7 @@ const char * llama_grammar_parser::parse_sequence(
pos = parse_space(pos + 1, is_nested);
} else if (*pos == '.') { // any char
last_sym_start = rule.size();
n_prev_rules = 1;
rule.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
pos = parse_space(pos + 1, is_nested);
} else if (*pos == '*') {
@ -830,32 +853,54 @@ static bool llama_grammar_match_token(
static void llama_grammar_advance_stack(
const llama_grammar_rules & rules,
const llama_grammar_stack & stack,
llama_grammar_stacks & new_stacks) {
if (stack.empty()) {
if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
new_stacks.emplace_back(stack);
llama_grammar_stacks & new_stacks) {
std::vector<llama_grammar_stack> todo;
todo.push_back(stack);
auto stack_cmp = [](const llama_grammar_stack & a, const llama_grammar_stack & b) {
return std::lexicographical_compare(a.begin(), a.end(), b.begin(), b.end(),
[](const llama_grammar_element * pa, const llama_grammar_element * pb) {
return pa < pb; // Compare pointer addresses
}
);
};
std::set<llama_grammar_stack, decltype(stack_cmp)> seen(stack_cmp);
while (!todo.empty()) {
llama_grammar_stack curr_stack = std::move(todo.back());
todo.pop_back();
if (seen.find( curr_stack) != seen.end()) {
continue;
}
return;
}
seen.insert(curr_stack);
const llama_grammar_element * pos = stack.back();
if (curr_stack.empty()) {
if (std::find(new_stacks.begin(), new_stacks.end(), curr_stack) == new_stacks.end()) {
new_stacks.emplace_back(std::move(curr_stack));
}
continue;
}
switch (pos->type) {
const llama_grammar_element * pos = curr_stack.back();
switch (pos->type) {
case LLAMA_GRETYPE_RULE_REF: {
const size_t rule_id = static_cast<size_t>(pos->value);
const llama_grammar_element * subpos = rules[rule_id].data();
do {
// init new stack without the top (pos)
llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
llama_grammar_stack next_stack(curr_stack.begin(), curr_stack.end() - 1);
if (!llama_grammar_is_end_of_sequence(pos + 1)) {
// if this rule ref is followed by another element, add that to stack
new_stack.push_back(pos + 1);
next_stack.push_back(pos + 1);
}
if (!llama_grammar_is_end_of_sequence(subpos)) {
// if alternate is nonempty, add to stack
new_stack.push_back(subpos);
next_stack.push_back(subpos);
}
llama_grammar_advance_stack(rules, new_stack, new_stacks);
todo.push_back(std::move(next_stack));
while (!llama_grammar_is_end_of_sequence(subpos)) {
// scan to end of alternate def
subpos++;
@ -874,9 +919,9 @@ static void llama_grammar_advance_stack(
case LLAMA_GRETYPE_CHAR_ANY:
case LLAMA_GRETYPE_TOKEN:
case LLAMA_GRETYPE_TOKEN_NOT:
if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
if (std::find(new_stacks.begin(), new_stacks.end(), curr_stack) == new_stacks.end()) {
// only add the stack if it's not a duplicate of one we already have
new_stacks.emplace_back(stack);
new_stacks.emplace_back(std::move(curr_stack));
}
break;
default:
@ -884,6 +929,7 @@ static void llama_grammar_advance_stack(
// (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
// those
GGML_ABORT("fatal error");
}
}
}

View File

@ -22,6 +22,7 @@ static void test_calculate_diff_split_no_common(testing & t);
static void test_calculate_diff_split_single_char(testing & t);
static void test_calculate_diff_split_overlaps(testing & t);
static void test_calculate_diff_split_tag_boundaries(testing & t);
static void test_calculate_diff_split_generation_prompt(testing & t);
static void test_calculate_diff_split(testing & t);
static void test_until_common_prefix_basic(testing & t);
@ -179,6 +180,7 @@ static void test_calculate_diff_split(testing & t) {
t.test("calculate_diff_split single char", test_calculate_diff_split_single_char);
t.test("calculate_diff_split overlaps", test_calculate_diff_split_overlaps);
t.test("calculate_diff_split tag boundaries", test_calculate_diff_split_tag_boundaries);
t.test("calculate_diff_split generation prompt", test_calculate_diff_split_generation_prompt);
}
static void test_calculate_diff_split_basic(testing & t) {
@ -502,6 +504,39 @@ static void test_calculate_diff_split_tag_boundaries(testing & t) {
}
}
static void test_calculate_diff_split_generation_prompt(testing & t) {
// ChatML thinking template: left is a prefix of right, generation_prompt is the appended part.
// The trailing \n in left matches the trailing \n in the generation_prompt, causing
// the suffix matcher to steal it and rotate the diff result.
{
// Simplified reproduction: left ends with \n, right = left + "<|im_start|>assistant\n<think>\n"
std::string left = "<|im_start|>user\nHello<|im_end|>\n";
std::string right = left + "<|im_start|>assistant\n<think>\n";
diff_split result = calculate_diff_split(left, right);
t.assert_equal("chatml prefix", left, result.prefix);
t.assert_equal("chatml left", "", result.left);
t.assert_equal("chatml right should be generation prompt",
"<|im_start|>assistant\n<think>\n", result.right);
t.assert_equal("chatml suffix", "", result.suffix);
}
{
// More realistic: longer conversation ending with tool_response
std::string common =
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
"<|im_start|>user\nSearch for files<|im_end|>\n"
"<|im_start|>assistant\n<think>\nLet me search.\n</think>\n\n"
"<tool_call>\n<function=search>\n</function>\n</tool_call><|im_end|>\n"
"<|im_start|>user\n<tool_response>\nNo files found\n</tool_response><|im_end|>\n";
std::string left = common;
std::string right = common + "<|im_start|>assistant\n<think>\n";
diff_split result = calculate_diff_split(left, right);
t.assert_equal("tool_response left", "", result.left);
t.assert_equal("tool_response right should be generation prompt",
"<|im_start|>assistant\n<think>\n", result.right);
}
}
static void test_until_common_prefix(testing & t) {
t.test("until_common_prefix basic", test_until_common_prefix_basic);
}

View File

@ -1337,7 +1337,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
tst.test("I'm\nthinking\n</think>\nHello, world!\nWhat's up?")
.enable_thinking(true)
.reasoning_format(COMMON_REASONING_FORMAT_NONE)
.expect_content("<think>I'm\nthinking\n</think>\nHello, world!\nWhat's up?")
.expect_content("<think>\nI'm\nthinking\n</think>\nHello, world!\nWhat's up?")
.run();
tst.test("I'm\nthinking\n</think>\nHello, world!\nWhat's up?")

View File

@ -788,6 +788,24 @@ static void test_quantifiers() {
"0xFF 0x12 0xAB 0x00 0x00 0x00",
}
);
test_grammar(
"segfault",
// Grammar
R"""(
root ::= ( [x]* )*
)""",
// Passing strings
{
"",
"x",
"xx"
},
// Failing strings
{
"y",
"yy"
}
);
}
static void test_failure_missing_root() {

View File

@ -145,6 +145,10 @@ int main()
root ::= "a"{,}"
)""");
verify_failure(R"""(
root ::= (((((([^x]*){0,99}){0,99}){0,99}){0,99}){0,99}){0,99}
)""");
verify_failure(R"""(
root ::= "a"{,10}"
)""");

View File

@ -123,25 +123,27 @@ int main()
std::vector<std::vector<llama_grammar_element>> expected_stacks = {
{
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_CHAR, 40},
},
{
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_CHAR, 97},
},
{
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61},
@ -149,26 +151,24 @@ int main()
{LLAMA_GRETYPE_CHAR, 40},
},
{
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_CHAR, 97},
},
{
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_CHAR, 40},
}};
auto index = 0;
@ -195,9 +195,9 @@ int main()
}
std::vector<llama_grammar_candidate> next_candidates;
next_candidates.resize(24);
next_candidates.resize(23);
for (size_t i = 0; i < 24; ++i)
for (size_t i = 0; i < 23; ++i)
{
uint32_t *cp = new uint32_t[2]; // dynamically allocate memory for code_point
cp[0] = 37 + i;
@ -210,7 +210,6 @@ int main()
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
@ -268,6 +267,7 @@ int main()
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
@ -287,13 +287,11 @@ int main()
{20, 57},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
@ -351,6 +349,7 @@ int main()
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
@ -370,7 +369,6 @@ int main()
{20, 57},
{21, 58},
{22, 59},
{23, 60},
},
};

View File

@ -16,8 +16,7 @@ import random
import unicodedata
from pathlib import Path
from typing import Any, Iterator, cast
from typing_extensions import Buffer
from typing import Any, Iterator
import cffi
from transformers import AutoTokenizer, PreTrainedTokenizer
@ -114,7 +113,7 @@ class LibLlamaModel:
while num < 0 and len(self.text_buff) < (16 << 20):
self.text_buff = self.ffi.new("uint8_t[]", -2 * num)
num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special)
return str(cast(Buffer, self.ffi.buffer(self.text_buff, num)), encoding="utf-8", errors="replace") # replace errors with '\uFFFD'
return str(self.ffi.buffer(self.text_buff, num), encoding="utf-8", errors="replace") # replace errors with '\uFFFD' # pyright: ignore[reportArgumentType]
class Tokenizer:
@ -438,7 +437,7 @@ def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLl
decode_errors = 0
MAX_ERRORS = 10
logger.info("%s: %s" % (generator.__qualname__, "ini"))
logger.info("%s: %s" % (getattr(generator, "__qualname__", ""), "ini"))
for text in generator:
# print(repr(text), text.encode())
# print(repr(text), hex(ord(text[0])), text.encode())
@ -477,7 +476,7 @@ def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLl
break
t_total = time.perf_counter() - t_start
logger.info(f"{generator.__qualname__}: end, {t_encode1=:.3f} {t_encode2=:.3f} {t_decode1=:.3f} {t_decode2=:.3f} {t_total=:.3f}")
logger.info(f"{getattr(generator, '__qualname__', '')}: end, {t_encode1=:.3f} {t_encode2=:.3f} {t_decode1=:.3f} {t_decode2=:.3f} {t_total=:.3f}")
def main(argv: list[str] | None = None):

View File

@ -83,7 +83,7 @@
| `-m, --model FNAME` | model path to load<br/>(env: LLAMA_ARG_MODEL) |
| `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
| `-dr, --docker-repo [<repo>/]<model>[:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.<br/>example: gemma3<br/>(default: unused)<br/>(env: LLAMA_ARG_DOCKER_REPO) |
| `-hf, -hfr, --hf-repo <user>/<model>[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.<br/>mmproj is also downloaded automatically if available. to disable, add --no-mmproj<br/>example: unsloth/phi-4-GGUF:q4_k_m<br/>(default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
| `-hf, -hfr, --hf-repo <user>/<model>[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.<br/>mmproj is also downloaded automatically if available. to disable, add --no-mmproj<br/>example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M<br/>(default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
| `-hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_HFD_REPO) |
| `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
| `-hfv, -hfrv, --hf-repo-v <user>/<model>[:quant]` | Hugging Face model repository for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_REPO_V) |

View File

@ -166,7 +166,7 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
| `-m, --model FNAME` | model path to load<br/>(env: LLAMA_ARG_MODEL) |
| `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
| `-dr, --docker-repo [<repo>/]<model>[:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.<br/>example: gemma3<br/>(default: unused)<br/>(env: LLAMA_ARG_DOCKER_REPO) |
| `-hf, -hfr, --hf-repo <user>/<model>[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.<br/>mmproj is also downloaded automatically if available. to disable, add --no-mmproj<br/>example: unsloth/phi-4-GGUF:q4_k_m<br/>(default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
| `-hf, -hfr, --hf-repo <user>/<model>[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.<br/>mmproj is also downloaded automatically if available. to disable, add --no-mmproj<br/>example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M<br/>(default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
| `-hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_HFD_REPO) |
| `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
| `-hfv, -hfrv, --hf-repo-v <user>/<model>[:quant]` | Hugging Face model repository for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_REPO_V) |

View File

@ -418,7 +418,7 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
printf(" -hf, -hfr, --hf-repo <user>/<model>[:quant] Hugging Face model repository; quant is optional, case-insensitive\n");
printf(" default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n");
printf(" example: unsloth/phi-4-GGUF:Q4_K_M\n");
printf(" example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M\n");
printf(" (default: unused)\n");
printf(" -hff, --hf-file <file> Hugging Face model file. If specified, it will override the quant in --hf-repo\n");
printf(" (default: unused)\n");

View File

@ -100,7 +100,7 @@ For the full list of features, please refer to [server's changelog](https://gith
| `-m, --model FNAME` | model path to load<br/>(env: LLAMA_ARG_MODEL) |
| `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
| `-dr, --docker-repo [<repo>/]<model>[:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.<br/>example: gemma3<br/>(default: unused)<br/>(env: LLAMA_ARG_DOCKER_REPO) |
| `-hf, -hfr, --hf-repo <user>/<model>[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.<br/>mmproj is also downloaded automatically if available. to disable, add --no-mmproj<br/>example: unsloth/phi-4-GGUF:q4_k_m<br/>(default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
| `-hf, -hfr, --hf-repo <user>/<model>[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.<br/>mmproj is also downloaded automatically if available. to disable, add --no-mmproj<br/>example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M<br/>(default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
| `-hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_HFD_REPO) |
| `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
| `-hfv, -hfrv, --hf-repo-v <user>/<model>[:quant]` | Hugging Face model repository for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_REPO_V) |

View File

@ -285,7 +285,7 @@ def start_server_background(args):
}
server_process = subprocess.Popen(
args,
**pkwargs) # pyright: ignore[reportArgumentType, reportCallIssue]
**pkwargs) # pyright: ignore[reportArgumentType, reportCallIssue] # ty: ignore[no-matching-overload]
def server_log(in_stream, out_stream):
for line in iter(in_stream.readline, b''):

View File

@ -415,6 +415,7 @@ task_params server_task::params_from_json_cmpl(
params.chat_parser_params.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
params.chat_parser_params.generation_prompt = json_value(data, "generation_prompt", std::string());
params.sampling.generation_prompt = params.chat_parser_params.generation_prompt;
SRV_DBG("Generation prompt: '%s'\n", params.chat_parser_params.generation_prompt.c_str());
params.chat_parser_params.parse_tool_calls = json_value(data, "parse_tool_calls", false);
if (data.contains("chat_parser")) {
params.chat_parser_params.parser.load(data.at("chat_parser").get<std::string>());

View File

@ -9,6 +9,7 @@ sys.path.insert(0, str(path))
from utils import *
from enum import Enum
from typing import TypedDict
server: ServerProcess
@ -29,56 +30,73 @@ class CompletionMode(Enum):
NORMAL = "normal"
STREAMED = "streamed"
TEST_TOOL = {
"type":"function",
"function": {
"name": "test",
"description": "",
"parameters": {
"type": "object",
"properties": {
"success": {"type": "boolean", "const": True},
},
"required": ["success"]
}
}
}
class ToolParameters(TypedDict):
type: str
properties: dict[str, dict]
required: list[str]
PYTHON_TOOL = {
"type": "function",
"function": {
"name": "python",
"description": "Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.",
"parameters": {
"type": "object",
"properties": {
class ToolFunction(TypedDict):
name: str
description: str
parameters: ToolParameters
class ToolDefinition(TypedDict):
type: str
function: ToolFunction
TEST_TOOL = ToolDefinition(
type = "function",
function = ToolFunction(
name = "test",
description = "",
parameters = ToolParameters(
type = "object",
properties = {
"success": {
"type": "boolean",
"const": True,
},
},
required = ["success"],
),
),
)
PYTHON_TOOL = ToolDefinition(
type = "function",
function = ToolFunction(
name = "python",
description = "Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.",
parameters = ToolParameters(
type = "object",
properties = {
"code": {
"type": "string",
"description": "The code to run in the ipython interpreter."
}
"description": "The code to run in the ipython interpreter.",
},
},
"required": ["code"]
}
}
}
required = ["code"],
),
),
)
WEATHER_TOOL = {
"type":"function",
"function":{
"name":"get_current_weather",
"description":"Get the current weather in a given location",
"parameters":{
"type":"object",
"properties":{
"location":{
"type":"string",
"description":"The city and country/state, e.g. 'San Francisco, CA', or 'Paris, France'"
}
},
"required":["location"]
}
}
}
WEATHER_TOOL = ToolDefinition(
type = "function",
function = ToolFunction(
name = "get_current_weather",
description = "Get the current weather in a given location",
parameters = ToolParameters(
type = "object",
properties = {
"location": {
"type": "string",
"description": "The city and country/state, e.g. 'San Francisco, CA', or 'Paris, France'",
},
},
required = ["location"],
),
),
)
def do_test_completion_with_required_tool_tiny(server: ServerProcess, tool: dict, argument_key: str | None, n_predict, **kwargs):
body = server.make_any_request("POST", "/v1/chat/completions", data={

View File

@ -127,7 +127,7 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
fullHeightCodeBlocks:
'Always display code blocks at their full natural height, overriding any height limits.',
showRawModelNames:
'Display full raw model identifiers (e.g. "unsloth/Qwen3.5-27B-GGUF:BF16") instead of parsed names with badges.',
'Display full raw model identifiers (e.g. "ggml-org/GLM-4.7-Flash-GGUF:Q8_0") instead of parsed names with badges.',
mcpServers:
'Configure MCP servers as a JSON list. Use the form in the MCP Client settings section to edit.',
mcpServerUsageStats:

View File

@ -457,7 +457,7 @@ class ModelsStore {
/**
* Select a model by its model name (used for syncing with conversation model)
* @param modelName - Model name to select (e.g., "unsloth/gemma-3-12b-it-GGUF:latest")
* @param modelName - Model name to select (e.g., "ggml-org/GLM-4.7-Flash-GGUF")
*/
selectModelByName(modelName: string): void {
const option = this.models.find((model) => model.model === modelName);

30
ty.toml Normal file
View File

@ -0,0 +1,30 @@
[environment]
extra-paths = ["./gguf-py", "./examples/model-conversion/scripts", "./tools/server/tests"]
python-version = "3.10"
[rules]
deprecated = "warn"
[src]
exclude = [
"./tools/mtmd/legacy-models/**",
]
[[overrides]]
include = [
"./tools/server/tests/**",
]
[overrides.rules]
unresolved-reference = "ignore"
unresolved-import = "ignore"
unresolved-attribute = "ignore"
[[overrides]]
include = [
"./examples/pydantic_models_to_grammar.py",
]
[overrides.rules]
unsupported-operator = "ignore"
not-subscriptable = "ignore"