Merge remote-tracking branch 'origin/master' into mxfp-flash-attention

This commit is contained in:
Tim Burke 2026-03-22 02:14:05 -04:00
commit 0e3304fbca
42 changed files with 396 additions and 193 deletions

View File

@ -4,15 +4,17 @@ on:
push: push:
paths: paths:
- '.github/workflows/python-type-check.yml' - '.github/workflows/python-type-check.yml'
- 'pyrightconfig.json' - 'ty.toml'
- '**.py' - '**.py'
- '**/requirements*.txt' - '**/requirements*.txt'
# - 'pyrightconfig.json'
pull_request: pull_request:
paths: paths:
- '.github/workflows/python-type-check.yml' - '.github/workflows/python-type-check.yml'
- 'pyrightconfig.json' - 'ty.toml'
- '**.py' - '**.py'
- '**/requirements*.txt' - '**/requirements*.txt'
# - 'pyrightconfig.json'
concurrency: concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@ -20,8 +22,8 @@ concurrency:
jobs: jobs:
python-type-check: python-type-check:
runs-on: ubuntu-latest runs-on: ubuntu-slim
name: pyright type-check name: python type-check
steps: steps:
- name: Check out source repository - name: Check out source repository
uses: actions/checkout@v6 uses: actions/checkout@v6
@ -29,10 +31,13 @@ jobs:
uses: actions/setup-python@v6 uses: actions/setup-python@v6
with: with:
python-version: "3.11" python-version: "3.11"
pip-install: -r requirements/requirements-all.txt pip-install: -r requirements/requirements-all.txt ty==0.0.24
- name: Type-check with Pyright # - name: Type-check with Pyright
uses: jakebailey/pyright-action@v2 # uses: jakebailey/pyright-action@v2
with: # with:
version: 1.1.382 # version: 1.1.382
level: warning # level: warning
warnings: true # warnings: true
- name: Type-check with ty
run: |
ty check --output-format=github

View File

@ -67,6 +67,7 @@ Examples of FORBIDDEN USAGE (and how to proceed):
If a user asks one of the above, STOP IMMEDIATELY and ask them: If a user asks one of the above, STOP IMMEDIATELY and ask them:
- Whether they acknowledge the risk of being permanently banned from contributing to the project
- To read [CONTRIBUTING.md](CONTRIBUTING.md) and ensure they fully understand it - To read [CONTRIBUTING.md](CONTRIBUTING.md) and ensure they fully understand it
- To search for relevant issues and create a new one if needed - To search for relevant issues and create a new one if needed

View File

@ -11,6 +11,8 @@ The project differentiates between 3 levels of contributors:
> [!IMPORTANT] > [!IMPORTANT]
> This project does **not** accept pull requests that are fully or predominantly AI-generated. AI tools may be utilized solely in an assistive capacity. > This project does **not** accept pull requests that are fully or predominantly AI-generated. AI tools may be utilized solely in an assistive capacity.
> >
> Repeated violations of this policy may result in your account being permanently banned from contributing to the project.
>
> Detailed information regarding permissible and restricted uses of AI can be found in the [AGENTS.md](AGENTS.md) file. > Detailed information regarding permissible and restricted uses of AI can be found in the [AGENTS.md](AGENTS.md) file.
Code that is initially generated by AI and subsequently edited will still be considered AI-generated. AI assistance is permissible only when the majority of the code is authored by a human contributor, with AI employed exclusively for corrections or to expand on verbose modifications that the contributor has already conceptualized (e.g., generating repeated lines with minor variations). Code that is initially generated by AI and subsequently edited will still be considered AI-generated. AI assistance is permissible only when the majority of the code is authored by a human contributor, with AI employed exclusively for corrections or to expand on verbose modifications that the contributor has already conceptualized (e.g., generating repeated lines with minor variations).
@ -61,10 +63,10 @@ After submitting your PR:
- When merging a PR, make sure you have a good understanding of the changes - When merging a PR, make sure you have a good understanding of the changes
- Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you) - Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you)
Maintainers reserve the right to decline review or close pull requests for any reason, particularly under any of the following conditions: Maintainers reserve the right to decline review or close pull requests for any reason, without any questions, particularly under any of the following conditions:
- The proposed change is already mentioned in the roadmap or an existing issue, and it has been assigned to someone. - The proposed change is already mentioned in the roadmap or an existing issue, and it has been assigned to someone.
- The pull request duplicates an existing one. - The pull request duplicates an existing one.
- The contributor fails to adhere to this contributing guide. - The contributor fails to adhere to this contributing guide or the AI policy.
# Coding guidelines # Coding guidelines

View File

@ -2595,7 +2595,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]", {"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
"Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n" "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
"mmproj is also downloaded automatically if available. to disable, add --no-mmproj\n" "mmproj is also downloaded automatically if available. to disable, add --no-mmproj\n"
"example: unsloth/phi-4-GGUF:q4_k_m\n" "example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M\n"
"(default: unused)", "(default: unused)",
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.model.hf_repo = value; params.model.hf_repo = value;

View File

@ -188,6 +188,21 @@ diff_split calculate_diff_split(const std::string & left, const std::string & ri
result.suffix = ""; result.suffix = "";
// pick prefix = all as representation // pick prefix = all as representation
} }
// When left has no unique content (result.left is empty), left is entirely
// shared with right. The simultaneous prefix/suffix segment matching can
// incorrectly consume trailing segments of left as suffix when those same
// segments also appear at the end of right (e.g. "\n" at the end of both
// the shared content and the generation prompt). This rotates the diff.
// Fix: if left is a prefix of right, enforce that directly.
if (result.left.empty() && !result.right.empty() &&
left.size() <= right.size() &&
right.substr(0, left.size()) == left) {
result.prefix = left;
result.suffix = "";
result.right = right.substr(left.size());
}
return result; return result;
} }

View File

@ -31,10 +31,10 @@ import gguf
from gguf.vocab import MistralTokenizerType, MistralVocab from gguf.vocab import MistralTokenizerType, MistralVocab
try: try:
from mistral_common.tokens.tokenizers.base import TokenizerVersion # pyright: ignore[reportMissingImports] from mistral_common.tokens.tokenizers.base import TokenizerVersion # type: ignore[import-not-found]
from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # pyright: ignore[reportMissingImports] from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # type: ignore[import-not-found]
from mistral_common.tokens.tokenizers.tekken import Tekkenizer # pyright: ignore[reportMissingImports] from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found]
from mistral_common.tokens.tokenizers.sentencepiece import ( # pyright: ignore[reportMissingImports] from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found]
SentencePieceTokenizer, SentencePieceTokenizer,
) )
@ -45,9 +45,9 @@ except ImportError:
_MISTRAL_COMMON_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) _MISTRAL_COMMON_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
_mistral_common_installed = False _mistral_common_installed = False
TokenizerVersion = None TokenizerVersion: Any = None
Tekkenizer = None Tekkenizer: Any = None
SentencePieceTokenizer = None SentencePieceTokenizer: Any = None
_mistral_import_error_msg = ( _mistral_import_error_msg = (
"Mistral format requires `mistral-common` to be installed. Please run " "Mistral format requires `mistral-common` to be installed. Please run "
"`pip install mistral-common[image,audio]` to install it." "`pip install mistral-common[image,audio]` to install it."
@ -145,6 +145,7 @@ class ModelBase:
self.model_name = model_name self.model_name = model_name
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
self._is_nvfp4 = False self._is_nvfp4 = False
self._is_mxfp4 = False
# Apply heuristics to figure out typical tensor encoding based on first tensor's dtype # Apply heuristics to figure out typical tensor encoding based on first tensor's dtype
# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie. # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
@ -220,7 +221,7 @@ class ModelBase:
if weight_map is None or not isinstance(weight_map, dict): if weight_map is None or not isinstance(weight_map, dict):
raise ValueError(f"Can't load 'weight_map' from {index_name!r}") raise ValueError(f"Can't load 'weight_map' from {index_name!r}")
tensor_names_from_index.update(weight_map.keys()) tensor_names_from_index.update(weight_map.keys())
part_dict: dict[str, None] = dict.fromkeys(weight_map.values(), None) part_dict: dict[str, None] = dict.fromkeys(weight_map.values(), None) # ty: ignore[invalid-assignment]
part_names = sorted(part_dict.keys()) part_names = sorted(part_dict.keys())
else: else:
weight_map = {} weight_map = {}
@ -712,6 +713,7 @@ class ModelBase:
def prepare_tensors(self): def prepare_tensors(self):
# detect NVFP4 quantization (ModelOpt format) # detect NVFP4 quantization (ModelOpt format)
quant_algo = (self.hparams.get("quantization_config") or {}).get("quant_algo") quant_algo = (self.hparams.get("quantization_config") or {}).get("quant_algo")
quant_method = (self.hparams.get("quantization_config") or {}).get("quant_method")
quant_layers = (self.hparams.get("quantization_config") or {}).get("quantized_layers") or {} quant_layers = (self.hparams.get("quantization_config") or {}).get("quantized_layers") or {}
quant_config_file = self.dir_model / "hf_quant_config.json" quant_config_file = self.dir_model / "hf_quant_config.json"
@ -728,6 +730,7 @@ class ModelBase:
quant_algo = "NVFP4" quant_algo = "NVFP4"
self._is_nvfp4 = quant_algo == "NVFP4" self._is_nvfp4 = quant_algo == "NVFP4"
self._is_mxfp4 = quant_method == "mxfp4"
# NVFP4 weights are repacked and written directly to gguf_writer. # NVFP4 weights are repacked and written directly to gguf_writer.
# This must run before dequant_model so NVFP4 tensors are removed # This must run before dequant_model so NVFP4 tensors are removed
@ -876,6 +879,12 @@ class ModelBase:
if self.metadata.name is None: if self.metadata.name is None:
self.metadata.name = self.dir_model.name self.metadata.name = self.dir_model.name
if self.ftype in (gguf.LlamaFileType.ALL_F32, gguf.LlamaFileType.MOSTLY_F16, gguf.LlamaFileType.MOSTLY_BF16):
if self._is_nvfp4:
self.ftype = gguf.LlamaFileType.MOSTLY_NVFP4
elif self._is_mxfp4:
self.ftype = gguf.LlamaFileType.MOSTLY_MXFP4_MOE
# Generate parameter weight class (useful for leader boards) if not yet determined # Generate parameter weight class (useful for leader boards) if not yet determined
if self.metadata.size_label is None and total_params > 0: if self.metadata.size_label is None and total_params > 0:
self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count) self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count)
@ -5882,7 +5891,7 @@ class InternLM2Model(TextModel):
logger.error(f'Error: Missing {tokenizer_path}') logger.error(f'Error: Missing {tokenizer_path}')
sys.exit(1) sys.exit(1)
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
@ -6203,7 +6212,7 @@ class BertModel(TextModel):
vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size) vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size)
else: else:
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
@ -8880,7 +8889,7 @@ class T5Model(TextModel):
if not tokenizer_path.is_file(): if not tokenizer_path.is_file():
raise FileNotFoundError(f"File not found: {tokenizer_path}") raise FileNotFoundError(f"File not found: {tokenizer_path}")
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
# some models like Pile-T5 family use BPE tokenizer instead of Unigram # some models like Pile-T5 family use BPE tokenizer instead of Unigram
@ -9017,7 +9026,7 @@ class T5EncoderModel(TextModel):
if not tokenizer_path.is_file(): if not tokenizer_path.is_file():
raise FileNotFoundError(f"File not found: {tokenizer_path}") raise FileNotFoundError(f"File not found: {tokenizer_path}")
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
# some models like Pile-T5 family use BPE tokenizer instead of Unigram # some models like Pile-T5 family use BPE tokenizer instead of Unigram
@ -11125,8 +11134,7 @@ class GptOssModel(TextModel):
# TODO: remove once MXFP4 is supported more generally # TODO: remove once MXFP4 is supported more generally
def dequant_model(self): def dequant_model(self):
quant_config = self.hparams.get("quantization_config") if self._is_mxfp4:
if quant_config is not None and quant_config.get("quant_method") == "mxfp4":
return return
return super().dequant_model() return super().dequant_model()
@ -12279,6 +12287,7 @@ class LazyTorchTensor(gguf.LazyBase):
kwargs = {} kwargs = {}
if func is torch.Tensor.numpy: if func is torch.Tensor.numpy:
assert len(args)
return args[0].numpy() return args[0].numpy()
return cls._wrap_fn(func)(*args, **kwargs) return cls._wrap_fn(func)(*args, **kwargs)

View File

@ -112,11 +112,11 @@ class Tensor:
(n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12]) (n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}' assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
assert name_len < 4096, 'Absurd tensor name length' assert name_len < 4096, 'Absurd tensor name length'
quant = gguf.GGML_QUANT_SIZES.get(dtype) self.dtype = gguf.GGMLQuantizationType(dtype)
quant = gguf.GGML_QUANT_SIZES.get(self.dtype)
assert quant is not None, 'Unknown tensor type' assert quant is not None, 'Unknown tensor type'
(blksize, tysize) = quant (blksize, tysize) = quant
offset += 12 offset += 12
self.dtype= gguf.GGMLQuantizationType(dtype)
self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)]) self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
offset += 4 * n_dims offset += 4 * n_dims
self.name = bytes(data[offset:offset + name_len]) self.name = bytes(data[offset:offset + name_len])

View File

@ -199,10 +199,13 @@ class LoraTorchTensor:
kwargs = {} kwargs = {}
if func is torch.permute: if func is torch.permute:
assert len(args)
return type(args[0]).permute(*args, **kwargs) return type(args[0]).permute(*args, **kwargs)
elif func is torch.reshape: elif func is torch.reshape:
assert len(args)
return type(args[0]).reshape(*args, **kwargs) return type(args[0]).reshape(*args, **kwargs)
elif func is torch.stack: elif func is torch.stack:
assert len(args)
assert isinstance(args[0], Sequence) assert isinstance(args[0], Sequence)
dim = kwargs.get("dim", 0) dim = kwargs.get("dim", 0)
assert dim == 0 assert dim == 0
@ -211,6 +214,7 @@ class LoraTorchTensor:
torch.stack([b._lora_B for b in args[0]], dim), torch.stack([b._lora_B for b in args[0]], dim),
) )
elif func is torch.cat: elif func is torch.cat:
assert len(args)
assert isinstance(args[0], Sequence) assert isinstance(args[0], Sequence)
dim = kwargs.get("dim", 0) dim = kwargs.get("dim", 0)
assert dim == 0 assert dim == 0
@ -362,7 +366,7 @@ if __name__ == '__main__':
logger.error(f"Model {hparams['architectures'][0]} is not supported") logger.error(f"Model {hparams['architectures'][0]} is not supported")
sys.exit(1) sys.exit(1)
class LoraModel(model_class): class LoraModel(model_class): # ty: ignore[unsupported-base]
model_arch = model_class.model_arch model_arch = model_class.model_arch
lora_alpha: float lora_alpha: float

View File

@ -28,9 +28,6 @@ def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
return f'({result})?' if min_items == 0 else result return f'({result})?' if min_items == 0 else result
def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], out: list, decimals_left: int = 16, top_level: bool = True): def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], out: list, decimals_left: int = 16, top_level: bool = True):
has_min = min_value != None
has_max = max_value != None
def digit_range(from_char: str, to_char: str): def digit_range(from_char: str, to_char: str):
out.append("[") out.append("[")
if from_char == to_char: if from_char == to_char:
@ -106,7 +103,7 @@ def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], ou
out.append(to_str[i]) out.append(to_str[i])
out.append("]") out.append("]")
if has_min and has_max: if min_value is not None and max_value is not None:
if min_value < 0 and max_value < 0: if min_value < 0 and max_value < 0:
out.append("\"-\" (") out.append("\"-\" (")
_generate_min_max_int(-max_value, -min_value, out, decimals_left, top_level=True) _generate_min_max_int(-max_value, -min_value, out, decimals_left, top_level=True)
@ -133,7 +130,7 @@ def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], ou
less_decimals = max(decimals_left - 1, 1) less_decimals = max(decimals_left - 1, 1)
if has_min: if min_value is not None:
if min_value < 0: if min_value < 0:
out.append("\"-\" (") out.append("\"-\" (")
_generate_min_max_int(None, -min_value, out, decimals_left, top_level=False) _generate_min_max_int(None, -min_value, out, decimals_left, top_level=False)
@ -177,7 +174,7 @@ def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], ou
more_digits(length - 1, less_decimals) more_digits(length - 1, less_decimals)
return return
if has_max: if max_value is not None:
if max_value >= 0: if max_value >= 0:
if top_level: if top_level:
out.append("\"-\" [1-9] ") out.append("\"-\" [1-9] ")

View File

@ -64,7 +64,7 @@ def load_model_and_tokenizer(model_path, use_sentence_transformers=False, device
print("Using SentenceTransformer to apply all numbered layers") print("Using SentenceTransformer to apply all numbered layers")
model = SentenceTransformer(model_path) model = SentenceTransformer(model_path)
tokenizer = model.tokenizer tokenizer = model.tokenizer
config = model[0].auto_model.config # type: ignore config = model[0].auto_model.config
else: else:
tokenizer = AutoTokenizer.from_pretrained(model_path) tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
@ -108,8 +108,8 @@ def load_model_and_tokenizer(model_path, use_sentence_transformers=False, device
print(f"Model file: {type(model).__module__}") print(f"Model file: {type(model).__module__}")
# Verify the model is using the correct sliding window # Verify the model is using the correct sliding window
if hasattr(model.config, 'sliding_window'): # type: ignore if hasattr(model.config, 'sliding_window'):
print(f"Model's sliding_window: {model.config.sliding_window}") # type: ignore print(f"Model's sliding_window: {model.config.sliding_window}")
else: else:
print("Model config does not have sliding_window attribute") print("Model config does not have sliding_window attribute")
@ -152,7 +152,7 @@ def main():
device = next(model.parameters()).device device = next(model.parameters()).device
else: else:
# For SentenceTransformer, get device from the underlying model # For SentenceTransformer, get device from the underlying model
device = next(model[0].auto_model.parameters()).device # type: ignore device = next(model[0].auto_model.parameters()).device
model_name = os.path.basename(model_path) model_name = os.path.basename(model_path)
@ -177,7 +177,7 @@ def main():
print(f"{token_id:6d} -> '{token_str}'") print(f"{token_id:6d} -> '{token_str}'")
print(f"Embeddings shape (after all SentenceTransformer layers): {all_embeddings.shape}") print(f"Embeddings shape (after all SentenceTransformer layers): {all_embeddings.shape}")
print(f"Embedding dimension: {all_embeddings.shape[1] if len(all_embeddings.shape) > 1 else all_embeddings.shape[0]}") # type: ignore print(f"Embedding dimension: {all_embeddings.shape[1] if len(all_embeddings.shape) > 1 else all_embeddings.shape[0]}")
else: else:
# Standard approach: use base model output only # Standard approach: use base model output only
encoded = tokenizer( encoded = tokenizer(
@ -205,12 +205,12 @@ def main():
print(f"Embedding dimension: {all_embeddings.shape[1]}") print(f"Embedding dimension: {all_embeddings.shape[1]}")
if len(all_embeddings.shape) == 1: if len(all_embeddings.shape) == 1:
n_embd = all_embeddings.shape[0] # type: ignore n_embd = all_embeddings.shape[0]
n_embd_count = 1 n_embd_count = 1
all_embeddings = all_embeddings.reshape(1, -1) all_embeddings = all_embeddings.reshape(1, -1)
else: else:
n_embd = all_embeddings.shape[1] # type: ignore n_embd = all_embeddings.shape[1]
n_embd_count = all_embeddings.shape[0] # type: ignore n_embd_count = all_embeddings.shape[0]
print() print()

View File

@ -2,7 +2,7 @@
import argparse import argparse
import sys import sys
from common import compare_tokens # type: ignore from common import compare_tokens # type: ignore[import-not-found]
def parse_arguments(): def parse_arguments():

View File

@ -6,7 +6,7 @@ import re
from copy import copy from copy import copy
from enum import Enum from enum import Enum
from inspect import getdoc, isclass from inspect import getdoc, isclass
from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin, get_type_hints from typing import TYPE_CHECKING, Any, Callable, Optional, Union, get_args, get_origin, get_type_hints
from docstring_parser import parse from docstring_parser import parse
from pydantic import BaseModel, create_model from pydantic import BaseModel, create_model
@ -1158,7 +1158,7 @@ def create_dynamic_model_from_function(func: Callable[..., Any]):
# Assert that the parameter has a type annotation # Assert that the parameter has a type annotation
if param.annotation == inspect.Parameter.empty: if param.annotation == inspect.Parameter.empty:
raise TypeError(f"Parameter '{param.name}' in function '{func.__name__}' lacks a type annotation") raise TypeError(f"""Parameter '{param.name}' in function '{getattr(func, "__name__", "")}' lacks a type annotation""")
# Find the parameter's description in the docstring # Find the parameter's description in the docstring
param_doc = next((d for d in docstring.params if d.arg_name == param.name), None) param_doc = next((d for d in docstring.params if d.arg_name == param.name), None)
@ -1166,7 +1166,7 @@ def create_dynamic_model_from_function(func: Callable[..., Any]):
# Assert that the parameter has a description # Assert that the parameter has a description
if not param_doc or not param_doc.description: if not param_doc or not param_doc.description:
raise ValueError( raise ValueError(
f"Parameter '{param.name}' in function '{func.__name__}' lacks a description in the docstring") f"""Parameter '{param.name}' in function '{getattr(func, "__name__", "")}' lacks a description in the docstring""")
# Add parameter details to the schema # Add parameter details to the schema
param_docs.append((param.name, param_doc)) param_docs.append((param.name, param_doc))
@ -1177,7 +1177,7 @@ def create_dynamic_model_from_function(func: Callable[..., Any]):
dynamic_fields[param.name] = ( dynamic_fields[param.name] = (
param.annotation if param.annotation != inspect.Parameter.empty else str, default_value) param.annotation if param.annotation != inspect.Parameter.empty else str, default_value)
# Creating the dynamic model # Creating the dynamic model
dynamic_model = create_model(f"{func.__name__}", **dynamic_fields) dynamic_model = create_model(f"{getattr(func, '__name__')}", **dynamic_fields)
for name, param_doc in param_docs: for name, param_doc in param_docs:
dynamic_model.model_fields[name].description = param_doc.description dynamic_model.model_fields[name].description = param_doc.description
@ -1285,7 +1285,7 @@ def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name:
if items != {}: if items != {}:
array = {"properties": items} array = {"properties": items}
array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items") array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items")
fields[field_name] = (List[array_type], ...) fields[field_name] = (list[array_type], ...) # ty: ignore[invalid-type-form]
else: else:
fields[field_name] = (list, ...) fields[field_name] = (list, ...)
elif field_type == "object": elif field_type == "object":

View File

@ -3194,6 +3194,7 @@ class tinyBLAS_PPC {
private: private:
__attribute__((always_inline))
inline void save_acc(acc_t * ACC, int64_t ii, int64_t jj) { inline void save_acc(acc_t * ACC, int64_t ii, int64_t jj) {
vec_t vec_C[4]; vec_t vec_C[4];
__builtin_mma_disassemble_acc(vec_C, ACC); __builtin_mma_disassemble_acc(vec_C, ACC);
@ -3204,6 +3205,7 @@ class tinyBLAS_PPC {
} }
} }
__attribute__((always_inline))
inline void add_save_acc(acc_t * ACC, int64_t ii, int64_t jj) { inline void add_save_acc(acc_t * ACC, int64_t ii, int64_t jj) {
vec_t vec_C[4]; vec_t vec_C[4];
__builtin_mma_disassemble_acc(vec_C, ACC); __builtin_mma_disassemble_acc(vec_C, ACC);

View File

@ -1162,12 +1162,18 @@ ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rp
return nullptr; return nullptr;
} }
// Fix: Prevent division by zero if blck_size is 0 (e.g., deprecated types)
if (ggml_blck_size((enum ggml_type)tensor->type) == 0) {
GGML_LOG_ERROR("[%s] invalid tensor type received (blck_size is 0): %u\n", __func__, tensor->type);
return nullptr;
}
ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type, ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type,
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
// ggml_new_tensor_4d might fail if dimensions are invalid, although less likely to crash than invalid type // ggml_new_tensor_4d might fail if dimensions are invalid, although less likely to crash than invalid type
if (result == nullptr) { if (result == nullptr) {
GGML_LOG_ERROR("[%s] ggml_new_tensor_4d failed for type %u\\n", __func__, tensor->type); GGML_LOG_ERROR("[%s] ggml_new_tensor_4d failed for type %u\n", __func__, tensor->type);
return nullptr; return nullptr;
} }

View File

@ -16048,6 +16048,7 @@ static uint32_t ggml_vk_intel_shader_core_count(const vk::PhysicalDevice& vkdev)
case 0xE20C: // B570 case 0xE20C: // B570
return 18; return 18;
case 0xE20B: // B580 case 0xE20B: // B580
case 0xE211: // Pro B60
return 20; return 20;
default: default:
return 0; return 0;

View File

@ -3869,6 +3869,8 @@ class LlamaFileType(IntEnum):
# MOSTLY_Q4_0_8_8 = 35 # removed from gguf files, use Q4_0 and runtime repack # MOSTLY_Q4_0_8_8 = 35 # removed from gguf files, use Q4_0 and runtime repack
MOSTLY_TQ1_0 = 36 # except 1d tensors MOSTLY_TQ1_0 = 36 # except 1d tensors
MOSTLY_TQ2_0 = 37 # except 1d tensors MOSTLY_TQ2_0 = 37 # except 1d tensors
MOSTLY_MXFP4_MOE = 38 # except 1d tensors
MOSTLY_NVFP4 = 39 # except 1d tensors
GUESSED = 1024 # not specified in the model file GUESSED = 1024 # not specified in the model file

View File

@ -1300,7 +1300,7 @@ class GGUFWriter:
else: else:
raise ValueError("Invalid GGUF metadata value type or value") raise ValueError("Invalid GGUF metadata value type or value")
return kv_data return bytes(kv_data)
@staticmethod @staticmethod
def format_n_bytes_to_str(num: int) -> str: def format_n_bytes_to_str(num: int) -> str:

View File

@ -138,7 +138,7 @@ class LazyBase(ABC, metaclass=LazyMeta):
if isinstance(meta_noop, tuple): if isinstance(meta_noop, tuple):
dtype, shape = meta_noop dtype, shape = meta_noop
assert callable(shape) assert callable(shape)
res = cls.meta_with_dtype_and_shape(dtype, shape(res.shape)) res = cls.meta_with_dtype_and_shape(dtype, shape(res.shape)) # ty: ignore[call-top-callable]
else: else:
res = cls.meta_with_dtype_and_shape(meta_noop, res.shape) res = cls.meta_with_dtype_and_shape(meta_noop, res.shape)

View File

@ -91,11 +91,11 @@ class __Quant(ABC):
def __init_subclass__(cls, qtype: GGMLQuantizationType) -> None: def __init_subclass__(cls, qtype: GGMLQuantizationType) -> None:
cls.qtype = qtype cls.qtype = qtype
cls.block_size, cls.type_size = GGML_QUANT_SIZES[qtype] cls.block_size, cls.type_size = GGML_QUANT_SIZES[qtype]
cls.__quantize_lazy = LazyNumpyTensor._wrap_fn( cls.__quantize_lazy: Any = LazyNumpyTensor._wrap_fn(
cls.__quantize_array, cls.__quantize_array,
meta_noop=(np.uint8, cls.__shape_to_bytes) meta_noop=(np.uint8, cls.__shape_to_bytes)
) )
cls.__dequantize_lazy = LazyNumpyTensor._wrap_fn( cls.__dequantize_lazy: Any = LazyNumpyTensor._wrap_fn(
cls.__dequantize_array, cls.__dequantize_array,
meta_noop=(np.float32, cls.__shape_from_bytes) meta_noop=(np.float32, cls.__shape_from_bytes)
) )

View File

@ -11,33 +11,33 @@ from typing import Any, Callable, Sequence, Mapping, Iterable, Protocol, ClassVa
try: try:
from sentencepiece import SentencePieceProcessor from sentencepiece import SentencePieceProcessor
except ImportError: except ImportError:
SentencePieceProcessor = None SentencePieceProcessor: Any = None
try: try:
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer # pyright: ignore[reportMissingImports] from mistral_common.tokens.tokenizers.mistral import MistralTokenizer # type: ignore[import-not-found]
from mistral_common.tokens.tokenizers.tekken import Tekkenizer # pyright: ignore[reportMissingImports] from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found]
from mistral_common.tokens.tokenizers.utils import ( # pyright: ignore[reportMissingImports] from mistral_common.tokens.tokenizers.utils import ( # type: ignore[import-not-found]
_filter_valid_tokenizer_files, _filter_valid_tokenizer_files,
) )
from mistral_common.tokens.tokenizers.sentencepiece import ( # pyright: ignore[reportMissingImports] from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found]
SentencePieceTokenizer, SentencePieceTokenizer,
) )
except ImportError: except ImportError:
_mistral_common_installed = False _mistral_common_installed = False
MistralTokenizer = None MistralTokenizer: Any = None
Tekkenizer = None Tekkenizer: Any = None
SentencePieceTokenizer = None SentencePieceTokenizer: Any = None
_filter_valid_tokenizer_files = None _filter_valid_tokenizer_files: Any = None
else: else:
_mistral_common_installed = True _mistral_common_installed = True
try: try:
from mistral_common.tokens.tokenizers.utils import ( # pyright: ignore[reportMissingImports] from mistral_common.tokens.tokenizers.utils import ( # type: ignore[import-not-found]
get_one_valid_tokenizer_file, get_one_valid_tokenizer_file,
) )
except ImportError: except ImportError:
# We still want the conversion to work with older mistral-common versions. # We still want the conversion to work with older mistral-common versions.
get_one_valid_tokenizer_file = None get_one_valid_tokenizer_file: Any = None
import gguf import gguf
@ -703,7 +703,7 @@ class MistralVocab(Vocab):
tokenizer_file_path = base_path / tokenizer_file tokenizer_file_path = base_path / tokenizer_file
self.tokenizer = MistralTokenizer.from_file( self.tokenizer: Any = MistralTokenizer.from_file(
tokenizer_file_path tokenizer_file_path
).instruct_tokenizer.tokenizer ).instruct_tokenizer.tokenizer
self.tokenizer_type = ( self.tokenizer_type = (

View File

@ -1,5 +1,5 @@
{ {
"extraPaths": ["gguf-py", "examples/model-conversion/scripts"], "extraPaths": ["gguf-py", "examples/model-conversion/scripts", "examples/model-conversion/scripts/utils"],
"pythonVersion": "3.9", "pythonVersion": "3.9",
"pythonPlatform": "All", "pythonPlatform": "All",
"reportUnusedImport": "warning", "reportUnusedImport": "warning",

View File

@ -684,6 +684,7 @@ else:
sys.exit(1) sys.exit(1)
assert isinstance(hexsha8_baseline, str)
name_baseline = bench_data.get_commit_name(hexsha8_baseline) name_baseline = bench_data.get_commit_name(hexsha8_baseline)
hexsha8_compare = name_compare = None hexsha8_compare = name_compare = None
@ -717,6 +718,7 @@ else:
parser.print_help() parser.print_help()
sys.exit(1) sys.exit(1)
assert isinstance(hexsha8_compare, str)
name_compare = bench_data.get_commit_name(hexsha8_compare) name_compare = bench_data.get_commit_name(hexsha8_compare)
# Get tool-specific configuration # Get tool-specific configuration

View File

@ -241,10 +241,10 @@ class CodeEditor(QPlainTextEdit):
if not self.isReadOnly(): if not self.isReadOnly():
selection = QTextEdit.ExtraSelection() selection = QTextEdit.ExtraSelection()
line_color = QColorConstants.Yellow.lighter(160) line_color = QColorConstants.Yellow.lighter(160)
selection.format.setBackground(line_color) # pyright: ignore[reportAttributeAccessIssue] selection.format.setBackground(line_color) # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
selection.format.setProperty(QTextFormat.Property.FullWidthSelection, True) # pyright: ignore[reportAttributeAccessIssue] selection.format.setProperty(QTextFormat.Property.FullWidthSelection, True) # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
selection.cursor = self.textCursor() # pyright: ignore[reportAttributeAccessIssue] selection.cursor = self.textCursor() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
selection.cursor.clearSelection() # pyright: ignore[reportAttributeAccessIssue] selection.cursor.clearSelection() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
extra_selections.append(selection) extra_selections.append(selection)
self.setExtraSelections(extra_selections) self.setExtraSelections(extra_selections)
@ -262,8 +262,8 @@ class CodeEditor(QPlainTextEdit):
) )
extra = QTextEdit.ExtraSelection() extra = QTextEdit.ExtraSelection()
extra.format.setBackground(color.lighter(160)) # pyright: ignore[reportAttributeAccessIssue] extra.format.setBackground(color.lighter(160)) # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
extra.cursor = cursor # pyright: ignore[reportAttributeAccessIssue] extra.cursor = cursor # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
self.setExtraSelections(self.extraSelections() + [extra]) self.setExtraSelections(self.extraSelections() + [extra])
@ -274,8 +274,8 @@ class CodeEditor(QPlainTextEdit):
cursor.select(QTextCursor.SelectionType.LineUnderCursor) cursor.select(QTextCursor.SelectionType.LineUnderCursor)
extra = QTextEdit.ExtraSelection() extra = QTextEdit.ExtraSelection()
extra.format.setBackground(color.lighter(160)) # pyright: ignore[reportAttributeAccessIssue] extra.format.setBackground(color.lighter(160)) # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
extra.cursor = cursor # pyright: ignore[reportAttributeAccessIssue] extra.cursor = cursor # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
self.setExtraSelections(self.extraSelections() + [extra]) self.setExtraSelections(self.extraSelections() + [extra])
@ -395,8 +395,8 @@ class JinjaTester(QMainWindow):
ensure_ascii=ensure_ascii, ensure_ascii=ensure_ascii,
) )
) )
env.globals["strftime_now"] = lambda format: datetime.now().strftime(format) env.globals["strftime_now"] = lambda format: datetime.now().strftime(format) # ty: ignore[invalid-assignment]
env.globals["raise_exception"] = raise_exception env.globals["raise_exception"] = raise_exception # ty: ignore[invalid-assignment]
try: try:
template = env.from_string(template_str) template = env.from_string(template_str)
output = template.render(context) output = template.render(context)

View File

@ -189,6 +189,7 @@ def benchmark(
data: list[dict] = [] data: list[dict] = []
assert isinstance(prompts, list)
for i, p in enumerate(prompts): for i, p in enumerate(prompts):
if seed_offset >= 0: if seed_offset >= 0:
random.seed(3 * (seed_offset + 1000 * i) + 1) random.seed(3 * (seed_offset + 1000 * i) + 1)

View File

@ -1347,8 +1347,11 @@ int llama_context::encode(const llama_batch & batch_inp) {
const llama_seq_id seq_id = ubatch.seq_id_unq[s]; const llama_seq_id seq_id = ubatch.seq_id_unq[s];
const int32_t seq_idx = ubatch.seq_idx[seq_id]; const int32_t seq_idx = ubatch.seq_idx[seq_id];
embd_seq_out[seq_id].resize(n_embd); // use n_embd_out (not n_embd_inp) - the pooled embedding has the model's
ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_idx)*sizeof(float), n_embd*sizeof(float)); // output dimension, which differs from input dimension for deepstack models (e.g. qwen3vl)
const uint32_t n_embd_out = hparams.n_embd_out();
embd_seq_out[seq_id].resize(n_embd_out);
ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd_out*seq_idx)*sizeof(float), n_embd_out*sizeof(float));
} }
} break; } break;
case LLAMA_POOLING_TYPE_RANK: case LLAMA_POOLING_TYPE_RANK:
@ -1769,12 +1772,16 @@ int llama_context::decode(const llama_batch & batch_inp) {
// extract sequence embeddings (cleared before processing each batch) // extract sequence embeddings (cleared before processing each batch)
auto & embd_seq_out = embd_seq; auto & embd_seq_out = embd_seq;
// use n_embd_out (not n_embd_inp) - the pooled embedding has the model's
// output dimension, which differs from input dimension for deepstack models (e.g. qwen3vl)
const uint32_t n_embd_out = hparams.n_embd_out();
for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) { for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
const llama_seq_id seq_id = ubatch.seq_id_unq[s]; const llama_seq_id seq_id = ubatch.seq_id_unq[s];
const int32_t seq_idx = ubatch.seq_idx[seq_id]; const int32_t seq_idx = ubatch.seq_idx[seq_id];
embd_seq_out[seq_id].resize(n_embd); embd_seq_out[seq_id].resize(n_embd_out);
ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_idx)*sizeof(float), n_embd*sizeof(float)); ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd_out*seq_idx)*sizeof(float), n_embd_out*sizeof(float));
} }
} break; } break;
case LLAMA_POOLING_TYPE_RANK: case LLAMA_POOLING_TYPE_RANK:

View File

@ -7,6 +7,7 @@
#include <cmath> #include <cmath>
#include <algorithm> #include <algorithm>
#include <cstdint> #include <cstdint>
#include <set>
#include <stdexcept> #include <stdexcept>
#define MAX_REPETITION_THRESHOLD 2000 #define MAX_REPETITION_THRESHOLD 2000
@ -454,6 +455,7 @@ const char * llama_grammar_parser::parse_sequence(
bool is_nested) { bool is_nested) {
size_t last_sym_start = rule.size(); size_t last_sym_start = rule.size();
const char * pos = src; const char * pos = src;
uint64_t n_prev_rules = 1;
// use UINT64_MAX as the empty value because we aligned to the proper uint64_t type so -1 can't be used // use UINT64_MAX as the empty value because we aligned to the proper uint64_t type so -1 can't be used
// (though it's technically the same as -1 now) // (though it's technically the same as -1 now)
@ -481,6 +483,18 @@ const char * llama_grammar_parser::parse_sequence(
// S' ::= S | // S' ::= S |
llama_grammar_rule prev_rule(rule.begin() + last_sym_start, rule.end()); llama_grammar_rule prev_rule(rule.begin() + last_sym_start, rule.end());
// Calculate the total number of rules that will be generated by this repetition
uint64_t total_rules = 1; // Start with 1 for the original rule
if (!no_max && max_times > 0) {
total_rules = max_times;
} else if (min_times > 0) {
total_rules = min_times;
}
if (n_prev_rules * total_rules >= MAX_REPETITION_THRESHOLD) {
throw std::runtime_error("number of rules that are going to be repeated multiplied by the new repetition exceeds sane defaults, please reduce the number of repetitions or rule complexity");
}
if (min_times == 0) { if (min_times == 0) {
rule.resize(last_sym_start); rule.resize(last_sym_start);
} else { } else {
@ -508,12 +522,15 @@ const char * llama_grammar_parser::parse_sequence(
if (n_opt > 0) { if (n_opt > 0) {
rule.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id}); rule.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
} }
n_prev_rules *= total_rules;
GGML_ASSERT(n_prev_rules >= 1);
}; };
while (*pos) { while (*pos) {
if (*pos == '"') { // literal string if (*pos == '"') { // literal string
pos++; pos++;
last_sym_start = rule.size(); last_sym_start = rule.size();
n_prev_rules = 1;
while (*pos != '"') { while (*pos != '"') {
if (!*pos) { if (!*pos) {
throw std::runtime_error("unexpected end of input"); throw std::runtime_error("unexpected end of input");
@ -531,6 +548,7 @@ const char * llama_grammar_parser::parse_sequence(
start_type = LLAMA_GRETYPE_CHAR_NOT; start_type = LLAMA_GRETYPE_CHAR_NOT;
} }
last_sym_start = rule.size(); last_sym_start = rule.size();
n_prev_rules = 1;
while (*pos != ']') { while (*pos != ']') {
if (!*pos) { if (!*pos) {
throw std::runtime_error("unexpected end of input"); throw std::runtime_error("unexpected end of input");
@ -561,6 +579,7 @@ const char * llama_grammar_parser::parse_sequence(
auto token_pair = parse_token(vocab, pos); auto token_pair = parse_token(vocab, pos);
const char * token_end = token_pair.second; const char * token_end = token_pair.second;
last_sym_start = rule.size(); last_sym_start = rule.size();
n_prev_rules = 1;
rule.push_back({type, token_pair.first}); rule.push_back({type, token_pair.first});
pos = parse_space(token_end, is_nested); pos = parse_space(token_end, is_nested);
} else if (is_word_char(*pos)) { // rule reference } else if (is_word_char(*pos)) { // rule reference
@ -568,12 +587,15 @@ const char * llama_grammar_parser::parse_sequence(
uint32_t ref_rule_id = get_symbol_id(pos, name_end - pos); uint32_t ref_rule_id = get_symbol_id(pos, name_end - pos);
pos = parse_space(name_end, is_nested); pos = parse_space(name_end, is_nested);
last_sym_start = rule.size(); last_sym_start = rule.size();
n_prev_rules = 1;
rule.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id}); rule.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
} else if (*pos == '(') { // grouping } else if (*pos == '(') { // grouping
// parse nested alternates into synthesized rule // parse nested alternates into synthesized rule
pos = parse_space(pos + 1, true); pos = parse_space(pos + 1, true);
uint32_t n_rules_before = symbol_ids.size();
uint32_t sub_rule_id = generate_symbol_id(rule_name); uint32_t sub_rule_id = generate_symbol_id(rule_name);
pos = parse_alternates(pos, rule_name, sub_rule_id, true); pos = parse_alternates(pos, rule_name, sub_rule_id, true);
n_prev_rules = std::max(1u, (uint32_t)symbol_ids.size() - n_rules_before);
last_sym_start = rule.size(); last_sym_start = rule.size();
// output reference to synthesized rule // output reference to synthesized rule
rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id}); rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
@ -583,6 +605,7 @@ const char * llama_grammar_parser::parse_sequence(
pos = parse_space(pos + 1, is_nested); pos = parse_space(pos + 1, is_nested);
} else if (*pos == '.') { // any char } else if (*pos == '.') { // any char
last_sym_start = rule.size(); last_sym_start = rule.size();
n_prev_rules = 1;
rule.push_back({LLAMA_GRETYPE_CHAR_ANY, 0}); rule.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
pos = parse_space(pos + 1, is_nested); pos = parse_space(pos + 1, is_nested);
} else if (*pos == '*') { } else if (*pos == '*') {
@ -830,32 +853,54 @@ static bool llama_grammar_match_token(
static void llama_grammar_advance_stack( static void llama_grammar_advance_stack(
const llama_grammar_rules & rules, const llama_grammar_rules & rules,
const llama_grammar_stack & stack, const llama_grammar_stack & stack,
llama_grammar_stacks & new_stacks) { llama_grammar_stacks & new_stacks) {
if (stack.empty()) { std::vector<llama_grammar_stack> todo;
if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) { todo.push_back(stack);
new_stacks.emplace_back(stack);
auto stack_cmp = [](const llama_grammar_stack & a, const llama_grammar_stack & b) {
return std::lexicographical_compare(a.begin(), a.end(), b.begin(), b.end(),
[](const llama_grammar_element * pa, const llama_grammar_element * pb) {
return pa < pb; // Compare pointer addresses
}
);
};
std::set<llama_grammar_stack, decltype(stack_cmp)> seen(stack_cmp);
while (!todo.empty()) {
llama_grammar_stack curr_stack = std::move(todo.back());
todo.pop_back();
if (seen.find( curr_stack) != seen.end()) {
continue;
} }
return; seen.insert(curr_stack);
}
const llama_grammar_element * pos = stack.back(); if (curr_stack.empty()) {
if (std::find(new_stacks.begin(), new_stacks.end(), curr_stack) == new_stacks.end()) {
new_stacks.emplace_back(std::move(curr_stack));
}
continue;
}
switch (pos->type) { const llama_grammar_element * pos = curr_stack.back();
switch (pos->type) {
case LLAMA_GRETYPE_RULE_REF: { case LLAMA_GRETYPE_RULE_REF: {
const size_t rule_id = static_cast<size_t>(pos->value); const size_t rule_id = static_cast<size_t>(pos->value);
const llama_grammar_element * subpos = rules[rule_id].data(); const llama_grammar_element * subpos = rules[rule_id].data();
do { do {
// init new stack without the top (pos) // init new stack without the top (pos)
llama_grammar_stack new_stack(stack.begin(), stack.end() - 1); llama_grammar_stack next_stack(curr_stack.begin(), curr_stack.end() - 1);
if (!llama_grammar_is_end_of_sequence(pos + 1)) { if (!llama_grammar_is_end_of_sequence(pos + 1)) {
// if this rule ref is followed by another element, add that to stack // if this rule ref is followed by another element, add that to stack
new_stack.push_back(pos + 1); next_stack.push_back(pos + 1);
} }
if (!llama_grammar_is_end_of_sequence(subpos)) { if (!llama_grammar_is_end_of_sequence(subpos)) {
// if alternate is nonempty, add to stack // if alternate is nonempty, add to stack
new_stack.push_back(subpos); next_stack.push_back(subpos);
} }
llama_grammar_advance_stack(rules, new_stack, new_stacks); todo.push_back(std::move(next_stack));
while (!llama_grammar_is_end_of_sequence(subpos)) { while (!llama_grammar_is_end_of_sequence(subpos)) {
// scan to end of alternate def // scan to end of alternate def
subpos++; subpos++;
@ -874,9 +919,9 @@ static void llama_grammar_advance_stack(
case LLAMA_GRETYPE_CHAR_ANY: case LLAMA_GRETYPE_CHAR_ANY:
case LLAMA_GRETYPE_TOKEN: case LLAMA_GRETYPE_TOKEN:
case LLAMA_GRETYPE_TOKEN_NOT: case LLAMA_GRETYPE_TOKEN_NOT:
if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) { if (std::find(new_stacks.begin(), new_stacks.end(), curr_stack) == new_stacks.end()) {
// only add the stack if it's not a duplicate of one we already have // only add the stack if it's not a duplicate of one we already have
new_stacks.emplace_back(stack); new_stacks.emplace_back(std::move(curr_stack));
} }
break; break;
default: default:
@ -884,6 +929,7 @@ static void llama_grammar_advance_stack(
// (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on // (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
// those // those
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
}
} }
} }

View File

@ -22,6 +22,7 @@ static void test_calculate_diff_split_no_common(testing & t);
static void test_calculate_diff_split_single_char(testing & t); static void test_calculate_diff_split_single_char(testing & t);
static void test_calculate_diff_split_overlaps(testing & t); static void test_calculate_diff_split_overlaps(testing & t);
static void test_calculate_diff_split_tag_boundaries(testing & t); static void test_calculate_diff_split_tag_boundaries(testing & t);
static void test_calculate_diff_split_generation_prompt(testing & t);
static void test_calculate_diff_split(testing & t); static void test_calculate_diff_split(testing & t);
static void test_until_common_prefix_basic(testing & t); static void test_until_common_prefix_basic(testing & t);
@ -179,6 +180,7 @@ static void test_calculate_diff_split(testing & t) {
t.test("calculate_diff_split single char", test_calculate_diff_split_single_char); t.test("calculate_diff_split single char", test_calculate_diff_split_single_char);
t.test("calculate_diff_split overlaps", test_calculate_diff_split_overlaps); t.test("calculate_diff_split overlaps", test_calculate_diff_split_overlaps);
t.test("calculate_diff_split tag boundaries", test_calculate_diff_split_tag_boundaries); t.test("calculate_diff_split tag boundaries", test_calculate_diff_split_tag_boundaries);
t.test("calculate_diff_split generation prompt", test_calculate_diff_split_generation_prompt);
} }
static void test_calculate_diff_split_basic(testing & t) { static void test_calculate_diff_split_basic(testing & t) {
@ -502,6 +504,39 @@ static void test_calculate_diff_split_tag_boundaries(testing & t) {
} }
} }
// Regression test for calculate_diff_split when `left` is a strict prefix of
// `right` and the appended part is a ChatML generation prompt.
//
// Both the shared text and the appended generation prompt end with '\n'. That
// shared trailing newline can be claimed by the suffix matcher, which rotates
// the diff result. The expected result keeps the whole generation prompt in
// `right`, with empty `left` and `suffix`.
static void test_calculate_diff_split_generation_prompt(testing & t) {
    // The text appended to the conversation in every case below.
    const std::string generation_prompt = "<|im_start|>assistant\n<think>\n";

    // Case 1: minimal reproduction - a single user turn ending with '\n'.
    {
        std::string conversation = "<|im_start|>user\nHello<|im_end|>\n";
        std::string with_prompt  = conversation + generation_prompt;

        diff_split split = calculate_diff_split(conversation, with_prompt);

        t.assert_equal("chatml prefix", conversation, split.prefix);
        t.assert_equal("chatml left", "", split.left);
        t.assert_equal("chatml right should be generation prompt",
                       generation_prompt, split.right);
        t.assert_equal("chatml suffix", "", split.suffix);
    }

    // Case 2: more realistic - a longer conversation whose last turn is a tool_response.
    {
        std::string conversation =
            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
            "<|im_start|>user\nSearch for files<|im_end|>\n"
            "<|im_start|>assistant\n<think>\nLet me search.\n</think>\n\n"
            "<tool_call>\n<function=search>\n</function>\n</tool_call><|im_end|>\n"
            "<|im_start|>user\n<tool_response>\nNo files found\n</tool_response><|im_end|>\n";
        std::string with_prompt = conversation + generation_prompt;

        diff_split split = calculate_diff_split(conversation, with_prompt);

        // Only the differing parts are checked here; prefix and suffix are covered by case 1.
        t.assert_equal("tool_response left", "", split.left);
        t.assert_equal("tool_response right should be generation prompt",
                       generation_prompt, split.right);
    }
}
static void test_until_common_prefix(testing & t) { static void test_until_common_prefix(testing & t) {
t.test("until_common_prefix basic", test_until_common_prefix_basic); t.test("until_common_prefix basic", test_until_common_prefix_basic);
} }

View File

@ -1337,7 +1337,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
tst.test("I'm\nthinking\n</think>\nHello, world!\nWhat's up?") tst.test("I'm\nthinking\n</think>\nHello, world!\nWhat's up?")
.enable_thinking(true) .enable_thinking(true)
.reasoning_format(COMMON_REASONING_FORMAT_NONE) .reasoning_format(COMMON_REASONING_FORMAT_NONE)
.expect_content("<think>I'm\nthinking\n</think>\nHello, world!\nWhat's up?") .expect_content("<think>\nI'm\nthinking\n</think>\nHello, world!\nWhat's up?")
.run(); .run();
tst.test("I'm\nthinking\n</think>\nHello, world!\nWhat's up?") tst.test("I'm\nthinking\n</think>\nHello, world!\nWhat's up?")

View File

@ -788,6 +788,24 @@ static void test_quantifiers() {
"0xFF 0x12 0xAB 0x00 0x00 0x00", "0xFF 0x12 0xAB 0x00 0x00 0x00",
} }
); );
test_grammar(
"segfault",
// Grammar
R"""(
root ::= ( [x]* )*
)""",
// Passing strings
{
"",
"x",
"xx"
},
// Failing strings
{
"y",
"yy"
}
);
} }
static void test_failure_missing_root() { static void test_failure_missing_root() {

View File

@ -145,6 +145,10 @@ int main()
root ::= "a"{,}" root ::= "a"{,}"
)"""); )""");
verify_failure(R"""(
root ::= (((((([^x]*){0,99}){0,99}){0,99}){0,99}){0,99}){0,99}
)""");
verify_failure(R"""( verify_failure(R"""(
root ::= "a"{,10}" root ::= "a"{,10}"
)"""); )""");

View File

@ -123,25 +123,27 @@ int main()
std::vector<std::vector<llama_grammar_element>> expected_stacks = { std::vector<std::vector<llama_grammar_element>> expected_stacks = {
{ {
{LLAMA_GRETYPE_RULE_REF, 5}, {LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_CHAR, 40},
},
{
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_CHAR, 61}, {LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7}, {LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_CHAR, 97}, {LLAMA_GRETYPE_CHAR, 97},
}, },
{
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{ {
{LLAMA_GRETYPE_RULE_REF, 5}, {LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61}, {LLAMA_GRETYPE_CHAR, 61},
@ -149,26 +151,24 @@ int main()
{LLAMA_GRETYPE_CHAR, 40}, {LLAMA_GRETYPE_CHAR, 40},
}, },
{ {
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61}, {LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7}, {LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_CHAR, 97}, {LLAMA_GRETYPE_CHAR, 97},
},
{
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_CHAR, 40},
}}; }};
auto index = 0; auto index = 0;
@ -195,9 +195,9 @@ int main()
} }
std::vector<llama_grammar_candidate> next_candidates; std::vector<llama_grammar_candidate> next_candidates;
next_candidates.resize(24); next_candidates.resize(23);
for (size_t i = 0; i < 24; ++i) for (size_t i = 0; i < 23; ++i)
{ {
uint32_t *cp = new uint32_t[2]; // dynamically allocate memory for code_point uint32_t *cp = new uint32_t[2]; // dynamically allocate memory for code_point
cp[0] = 37 + i; cp[0] = 37 + i;
@ -210,7 +210,6 @@ int main()
{0, 37}, {0, 37},
{1, 38}, {1, 38},
{2, 39}, {2, 39},
{3, 40},
{4, 41}, {4, 41},
{5, 42}, {5, 42},
{6, 43}, {6, 43},
@ -268,6 +267,7 @@ int main()
{0, 37}, {0, 37},
{1, 38}, {1, 38},
{2, 39}, {2, 39},
{3, 40},
{4, 41}, {4, 41},
{5, 42}, {5, 42},
{6, 43}, {6, 43},
@ -287,13 +287,11 @@ int main()
{20, 57}, {20, 57},
{21, 58}, {21, 58},
{22, 59}, {22, 59},
{23, 60},
}, },
{ {
{0, 37}, {0, 37},
{1, 38}, {1, 38},
{2, 39}, {2, 39},
{3, 40},
{4, 41}, {4, 41},
{5, 42}, {5, 42},
{6, 43}, {6, 43},
@ -351,6 +349,7 @@ int main()
{0, 37}, {0, 37},
{1, 38}, {1, 38},
{2, 39}, {2, 39},
{3, 40},
{4, 41}, {4, 41},
{5, 42}, {5, 42},
{6, 43}, {6, 43},
@ -370,7 +369,6 @@ int main()
{20, 57}, {20, 57},
{21, 58}, {21, 58},
{22, 59}, {22, 59},
{23, 60},
}, },
}; };

View File

@ -16,8 +16,7 @@ import random
import unicodedata import unicodedata
from pathlib import Path from pathlib import Path
from typing import Any, Iterator, cast from typing import Any, Iterator
from typing_extensions import Buffer
import cffi import cffi
from transformers import AutoTokenizer, PreTrainedTokenizer from transformers import AutoTokenizer, PreTrainedTokenizer
@ -114,7 +113,7 @@ class LibLlamaModel:
while num < 0 and len(self.text_buff) < (16 << 20): while num < 0 and len(self.text_buff) < (16 << 20):
self.text_buff = self.ffi.new("uint8_t[]", -2 * num) self.text_buff = self.ffi.new("uint8_t[]", -2 * num)
num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special) num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special)
return str(cast(Buffer, self.ffi.buffer(self.text_buff, num)), encoding="utf-8", errors="replace") # replace errors with '\uFFFD' return str(self.ffi.buffer(self.text_buff, num), encoding="utf-8", errors="replace") # replace errors with '\uFFFD' # pyright: ignore[reportArgumentType]
class Tokenizer: class Tokenizer:
@ -438,7 +437,7 @@ def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLl
decode_errors = 0 decode_errors = 0
MAX_ERRORS = 10 MAX_ERRORS = 10
logger.info("%s: %s" % (generator.__qualname__, "ini")) logger.info("%s: %s" % (getattr(generator, "__qualname__", ""), "ini"))
for text in generator: for text in generator:
# print(repr(text), text.encode()) # print(repr(text), text.encode())
# print(repr(text), hex(ord(text[0])), text.encode()) # print(repr(text), hex(ord(text[0])), text.encode())
@ -477,7 +476,7 @@ def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLl
break break
t_total = time.perf_counter() - t_start t_total = time.perf_counter() - t_start
logger.info(f"{generator.__qualname__}: end, {t_encode1=:.3f} {t_encode2=:.3f} {t_decode1=:.3f} {t_decode2=:.3f} {t_total=:.3f}") logger.info(f"{getattr(generator, '__qualname__', '')}: end, {t_encode1=:.3f} {t_encode2=:.3f} {t_decode1=:.3f} {t_decode2=:.3f} {t_total=:.3f}")
def main(argv: list[str] | None = None): def main(argv: list[str] | None = None):

View File

@ -83,7 +83,7 @@
| `-m, --model FNAME` | model path to load<br/>(env: LLAMA_ARG_MODEL) | | `-m, --model FNAME` | model path to load<br/>(env: LLAMA_ARG_MODEL) |
| `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) | | `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
| `-dr, --docker-repo [<repo>/]<model>[:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.<br/>example: gemma3<br/>(default: unused)<br/>(env: LLAMA_ARG_DOCKER_REPO) | | `-dr, --docker-repo [<repo>/]<model>[:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.<br/>example: gemma3<br/>(default: unused)<br/>(env: LLAMA_ARG_DOCKER_REPO) |
| `-hf, -hfr, --hf-repo <user>/<model>[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.<br/>mmproj is also downloaded automatically if available. to disable, add --no-mmproj<br/>example: unsloth/phi-4-GGUF:q4_k_m<br/>(default: unused)<br/>(env: LLAMA_ARG_HF_REPO) | | `-hf, -hfr, --hf-repo <user>/<model>[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.<br/>mmproj is also downloaded automatically if available. to disable, add --no-mmproj<br/>example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M<br/>(default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
| `-hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_HFD_REPO) | | `-hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_HFD_REPO) |
| `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) | | `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
| `-hfv, -hfrv, --hf-repo-v <user>/<model>[:quant]` | Hugging Face model repository for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_REPO_V) | | `-hfv, -hfrv, --hf-repo-v <user>/<model>[:quant]` | Hugging Face model repository for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_REPO_V) |

View File

@ -166,7 +166,7 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
| `-m, --model FNAME` | model path to load<br/>(env: LLAMA_ARG_MODEL) | | `-m, --model FNAME` | model path to load<br/>(env: LLAMA_ARG_MODEL) |
| `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) | | `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
| `-dr, --docker-repo [<repo>/]<model>[:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.<br/>example: gemma3<br/>(default: unused)<br/>(env: LLAMA_ARG_DOCKER_REPO) | | `-dr, --docker-repo [<repo>/]<model>[:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.<br/>example: gemma3<br/>(default: unused)<br/>(env: LLAMA_ARG_DOCKER_REPO) |
| `-hf, -hfr, --hf-repo <user>/<model>[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.<br/>mmproj is also downloaded automatically if available. to disable, add --no-mmproj<br/>example: unsloth/phi-4-GGUF:q4_k_m<br/>(default: unused)<br/>(env: LLAMA_ARG_HF_REPO) | | `-hf, -hfr, --hf-repo <user>/<model>[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.<br/>mmproj is also downloaded automatically if available. to disable, add --no-mmproj<br/>example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M<br/>(default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
| `-hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_HFD_REPO) | | `-hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_HFD_REPO) |
| `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) | | `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
| `-hfv, -hfrv, --hf-repo-v <user>/<model>[:quant]` | Hugging Face model repository for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_REPO_V) | | `-hfv, -hfrv, --hf-repo-v <user>/<model>[:quant]` | Hugging Face model repository for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_REPO_V) |

View File

@ -418,7 +418,7 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
printf(" -hf, -hfr, --hf-repo <user>/<model>[:quant] Hugging Face model repository; quant is optional, case-insensitive\n"); printf(" -hf, -hfr, --hf-repo <user>/<model>[:quant] Hugging Face model repository; quant is optional, case-insensitive\n");
printf(" default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"); printf(" default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n");
printf(" example: unsloth/phi-4-GGUF:Q4_K_M\n"); printf(" example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M\n");
printf(" (default: unused)\n"); printf(" (default: unused)\n");
printf(" -hff, --hf-file <file> Hugging Face model file. If specified, it will override the quant in --hf-repo\n"); printf(" -hff, --hf-file <file> Hugging Face model file. If specified, it will override the quant in --hf-repo\n");
printf(" (default: unused)\n"); printf(" (default: unused)\n");

View File

@ -100,7 +100,7 @@ For the full list of features, please refer to [server's changelog](https://gith
| `-m, --model FNAME` | model path to load<br/>(env: LLAMA_ARG_MODEL) | | `-m, --model FNAME` | model path to load<br/>(env: LLAMA_ARG_MODEL) |
| `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) | | `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
| `-dr, --docker-repo [<repo>/]<model>[:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.<br/>example: gemma3<br/>(default: unused)<br/>(env: LLAMA_ARG_DOCKER_REPO) | | `-dr, --docker-repo [<repo>/]<model>[:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.<br/>example: gemma3<br/>(default: unused)<br/>(env: LLAMA_ARG_DOCKER_REPO) |
| `-hf, -hfr, --hf-repo <user>/<model>[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.<br/>mmproj is also downloaded automatically if available. to disable, add --no-mmproj<br/>example: unsloth/phi-4-GGUF:q4_k_m<br/>(default: unused)<br/>(env: LLAMA_ARG_HF_REPO) | | `-hf, -hfr, --hf-repo <user>/<model>[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.<br/>mmproj is also downloaded automatically if available. to disable, add --no-mmproj<br/>example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M<br/>(default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
| `-hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_HFD_REPO) | | `-hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_HFD_REPO) |
| `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) | | `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
| `-hfv, -hfrv, --hf-repo-v <user>/<model>[:quant]` | Hugging Face model repository for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_REPO_V) | | `-hfv, -hfrv, --hf-repo-v <user>/<model>[:quant]` | Hugging Face model repository for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_REPO_V) |

View File

@ -285,7 +285,7 @@ def start_server_background(args):
} }
server_process = subprocess.Popen( server_process = subprocess.Popen(
args, args,
**pkwargs) # pyright: ignore[reportArgumentType, reportCallIssue] **pkwargs) # pyright: ignore[reportArgumentType, reportCallIssue] # ty: ignore[no-matching-overload]
def server_log(in_stream, out_stream): def server_log(in_stream, out_stream):
for line in iter(in_stream.readline, b''): for line in iter(in_stream.readline, b''):

View File

@ -415,6 +415,7 @@ task_params server_task::params_from_json_cmpl(
params.chat_parser_params.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY); params.chat_parser_params.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
params.chat_parser_params.generation_prompt = json_value(data, "generation_prompt", std::string()); params.chat_parser_params.generation_prompt = json_value(data, "generation_prompt", std::string());
params.sampling.generation_prompt = params.chat_parser_params.generation_prompt; params.sampling.generation_prompt = params.chat_parser_params.generation_prompt;
SRV_DBG("Generation prompt: '%s'\n", params.chat_parser_params.generation_prompt.c_str());
params.chat_parser_params.parse_tool_calls = json_value(data, "parse_tool_calls", false); params.chat_parser_params.parse_tool_calls = json_value(data, "parse_tool_calls", false);
if (data.contains("chat_parser")) { if (data.contains("chat_parser")) {
params.chat_parser_params.parser.load(data.at("chat_parser").get<std::string>()); params.chat_parser_params.parser.load(data.at("chat_parser").get<std::string>());

View File

@ -9,6 +9,7 @@ sys.path.insert(0, str(path))
from utils import * from utils import *
from enum import Enum from enum import Enum
from typing import TypedDict
server: ServerProcess server: ServerProcess
@ -29,56 +30,73 @@ class CompletionMode(Enum):
NORMAL = "normal" NORMAL = "normal"
STREAMED = "streamed" STREAMED = "streamed"
TEST_TOOL = { class ToolParameters(TypedDict):
"type":"function", type: str
"function": { properties: dict[str, dict]
"name": "test", required: list[str]
"description": "",
"parameters": {
"type": "object",
"properties": {
"success": {"type": "boolean", "const": True},
},
"required": ["success"]
}
}
}
PYTHON_TOOL = { class ToolFunction(TypedDict):
"type": "function", name: str
"function": { description: str
"name": "python", parameters: ToolParameters
"description": "Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.",
"parameters": { class ToolDefinition(TypedDict):
"type": "object", type: str
"properties": { function: ToolFunction
TEST_TOOL = ToolDefinition(
type = "function",
function = ToolFunction(
name = "test",
description = "",
parameters = ToolParameters(
type = "object",
properties = {
"success": {
"type": "boolean",
"const": True,
},
},
required = ["success"],
),
),
)
PYTHON_TOOL = ToolDefinition(
type = "function",
function = ToolFunction(
name = "python",
description = "Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.",
parameters = ToolParameters(
type = "object",
properties = {
"code": { "code": {
"type": "string", "type": "string",
"description": "The code to run in the ipython interpreter." "description": "The code to run in the ipython interpreter.",
} },
}, },
"required": ["code"] required = ["code"],
} ),
} ),
} )
WEATHER_TOOL = { WEATHER_TOOL = ToolDefinition(
"type":"function", type = "function",
"function":{ function = ToolFunction(
"name":"get_current_weather", name = "get_current_weather",
"description":"Get the current weather in a given location", description = "Get the current weather in a given location",
"parameters":{ parameters = ToolParameters(
"type":"object", type = "object",
"properties":{ properties = {
"location":{ "location": {
"type":"string", "type": "string",
"description":"The city and country/state, e.g. 'San Francisco, CA', or 'Paris, France'" "description": "The city and country/state, e.g. 'San Francisco, CA', or 'Paris, France'",
} },
}, },
"required":["location"] required = ["location"],
} ),
} ),
} )
def do_test_completion_with_required_tool_tiny(server: ServerProcess, tool: dict, argument_key: str | None, n_predict, **kwargs): def do_test_completion_with_required_tool_tiny(server: ServerProcess, tool: dict, argument_key: str | None, n_predict, **kwargs):
body = server.make_any_request("POST", "/v1/chat/completions", data={ body = server.make_any_request("POST", "/v1/chat/completions", data={

View File

@ -127,7 +127,7 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
fullHeightCodeBlocks: fullHeightCodeBlocks:
'Always display code blocks at their full natural height, overriding any height limits.', 'Always display code blocks at their full natural height, overriding any height limits.',
showRawModelNames: showRawModelNames:
'Display full raw model identifiers (e.g. "unsloth/Qwen3.5-27B-GGUF:BF16") instead of parsed names with badges.', 'Display full raw model identifiers (e.g. "ggml-org/GLM-4.7-Flash-GGUF:Q8_0") instead of parsed names with badges.',
mcpServers: mcpServers:
'Configure MCP servers as a JSON list. Use the form in the MCP Client settings section to edit.', 'Configure MCP servers as a JSON list. Use the form in the MCP Client settings section to edit.',
mcpServerUsageStats: mcpServerUsageStats:

View File

@ -457,7 +457,7 @@ class ModelsStore {
/** /**
* Select a model by its model name (used for syncing with conversation model) * Select a model by its model name (used for syncing with conversation model)
* @param modelName - Model name to select (e.g., "unsloth/gemma-3-12b-it-GGUF:latest") * @param modelName - Model name to select (e.g., "ggml-org/GLM-4.7-Flash-GGUF")
*/ */
selectModelByName(modelName: string): void { selectModelByName(modelName: string): void {
const option = this.models.find((model) => model.model === modelName); const option = this.models.find((model) => model.model === modelName);

30
ty.toml Normal file
View File

@ -0,0 +1,30 @@
# Configuration for the `ty` Python type checker.
[environment]
# Additional in-repo directories made available to module resolution, so that
# imports of code living under these paths can be resolved.
extra-paths = ["./gguf-py", "./examples/model-conversion/scripts", "./tools/server/tests"]
# Python version the checker should assume for the code base.
# NOTE(review): presumably the minimum supported version rather than the one
# used to run the checker - confirm against the CI workflow.
python-version = "3.10"
[rules]
# Report the `deprecated` rule at warning level.
deprecated = "warn"
[src]
# Paths left out of type checking entirely.
exclude = [
"./tools/mtmd/legacy-models/**",
]
# Override 1: relax name/import/attribute resolution rules for the server tests.
# NOTE(review): presumably needed because the tests use wildcard imports
# (`from utils import *`) - confirm.
[[overrides]]
include = [
"./tools/server/tests/**",
]
[overrides.rules]
unresolved-reference = "ignore"
unresolved-import = "ignore"
unresolved-attribute = "ignore"
# Override 2: silence operator/subscript diagnostics for a single example script.
[[overrides]]
include = [
"./examples/pydantic_models_to_grammar.py",
]
[overrides.rules]
unsupported-operator = "ignore"
not-subscriptable = "ignore"