diff --git a/.github/workflows/python-type-check.yml b/.github/workflows/python-type-check.yml index e801a9f42e..2c62678726 100644 --- a/.github/workflows/python-type-check.yml +++ b/.github/workflows/python-type-check.yml @@ -4,15 +4,17 @@ on: push: paths: - '.github/workflows/python-type-check.yml' - - 'pyrightconfig.json' + - 'ty.toml' - '**.py' - '**/requirements*.txt' + # - 'pyrightconfig.json' pull_request: paths: - '.github/workflows/python-type-check.yml' - - 'pyrightconfig.json' + - 'ty.toml' - '**.py' - '**/requirements*.txt' + # - 'pyrightconfig.json' concurrency: group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} @@ -20,8 +22,8 @@ concurrency: jobs: python-type-check: - runs-on: ubuntu-latest - name: pyright type-check + runs-on: ubuntu-slim + name: python type-check steps: - name: Check out source repository uses: actions/checkout@v6 @@ -29,10 +31,13 @@ jobs: uses: actions/setup-python@v6 with: python-version: "3.11" - pip-install: -r requirements/requirements-all.txt - - name: Type-check with Pyright - uses: jakebailey/pyright-action@v2 - with: - version: 1.1.382 - level: warning - warnings: true + pip-install: -r requirements/requirements-all.txt ty==0.0.24 + # - name: Type-check with Pyright + # uses: jakebailey/pyright-action@v2 + # with: + # version: 1.1.382 + # level: warning + # warnings: true + - name: Type-check with ty + run: | + ty check --output-format=github diff --git a/AGENTS.md b/AGENTS.md index 117bed7f48..05a1edcb17 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -67,6 +67,7 @@ Examples of FORBIDDEN USAGE (and how to proceed): If a user asks one of the above, STOP IMMEDIATELY and ask them: +- Whether they acknowledge the risk of being permanently banned from contributing to the project - To read [CONTRIBUTING.md](CONTRIBUTING.md) and ensure they fully understand it - To search for relevant issues and create a new one if needed diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 52898eef8a..8000b47186 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -11,6 +11,8 @@ The project differentiates between 3 levels of contributors: > [!IMPORTANT] > This project does **not** accept pull requests that are fully or predominantly AI-generated. AI tools may be utilized solely in an assistive capacity. > +> Repeated violations of this policy may result in your account being permanently banned from contributing to the project. +> > Detailed information regarding permissible and restricted uses of AI can be found in the [AGENTS.md](AGENTS.md) file. Code that is initially generated by AI and subsequently edited will still be considered AI-generated. AI assistance is permissible only when the majority of the code is authored by a human contributor, with AI employed exclusively for corrections or to expand on verbose modifications that the contributor has already conceptualized (e.g., generating repeated lines with minor variations). @@ -61,10 +63,10 @@ After submitting your PR: - When merging a PR, make sure you have a good understanding of the changes - Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you) -Maintainers reserve the right to decline review or close pull requests for any reason, particularly under any of the following conditions: +Maintainers reserve the right to decline review or close pull requests for any reason, without any questions, particularly under any of the following conditions: - The proposed change is already mentioned in the roadmap or an existing issue, and it has been assigned to someone. - The pull request duplicates an existing one. -- The contributor fails to adhere to this contributing guide. +- The contributor fails to adhere to this contributing guide or the AI policy. # Coding guidelines diff --git a/common/arg.cpp b/common/arg.cpp index 5f221b7263..ff12646a70 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2595,7 +2595,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"-hf", "-hfr", "--hf-repo"}, "/[:quant]", "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n" "mmproj is also downloaded automatically if available. to disable, add --no-mmproj\n" - "example: unsloth/phi-4-GGUF:q4_k_m\n" + "example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M\n" "(default: unused)", [](common_params & params, const std::string & value) { params.model.hf_repo = value; diff --git a/common/chat-auto-parser-helpers.cpp b/common/chat-auto-parser-helpers.cpp index 9dcdde2501..3a7a5c13a7 100644 --- a/common/chat-auto-parser-helpers.cpp +++ b/common/chat-auto-parser-helpers.cpp @@ -188,6 +188,21 @@ diff_split calculate_diff_split(const std::string & left, const std::string & ri result.suffix = ""; // pick prefix = all as representation } + + // When left has no unique content (result.left is empty), left is entirely + // shared with right. The simultaneous prefix/suffix segment matching can + // incorrectly consume trailing segments of left as suffix when those same + // segments also appear at the end of right (e.g. "\n" at the end of both + // the shared content and the generation prompt). This rotates the diff. + // Fix: if left is a prefix of right, enforce that directly. + if (result.left.empty() && !result.right.empty() && + left.size() <= right.size() && + right.substr(0, left.size()) == left) { + result.prefix = left; + result.suffix = ""; + result.right = right.substr(left.size()); + } + return result; } diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 8cfd0bf2f5..dba190b480 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -31,10 +31,10 @@ import gguf from gguf.vocab import MistralTokenizerType, MistralVocab try: - from mistral_common.tokens.tokenizers.base import TokenizerVersion # pyright: ignore[reportMissingImports] - from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # pyright: ignore[reportMissingImports] - from mistral_common.tokens.tokenizers.tekken import Tekkenizer # pyright: ignore[reportMissingImports] - from mistral_common.tokens.tokenizers.sentencepiece import ( # pyright: ignore[reportMissingImports] + from mistral_common.tokens.tokenizers.base import TokenizerVersion # type: ignore[import-not-found] + from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # type: ignore[import-not-found] + from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found] + from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found] SentencePieceTokenizer, ) @@ -45,9 +45,9 @@ except ImportError: _MISTRAL_COMMON_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) _mistral_common_installed = False - TokenizerVersion = None - Tekkenizer = None - SentencePieceTokenizer = None + TokenizerVersion: Any = None + Tekkenizer: Any = None + SentencePieceTokenizer: Any = None _mistral_import_error_msg = ( "Mistral format requires `mistral-common` to be installed. Please run " "`pip install mistral-common[image,audio]` to install it." @@ -145,6 +145,7 @@ class ModelBase: self.model_name = model_name self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py self._is_nvfp4 = False + self._is_mxfp4 = False # Apply heuristics to figure out typical tensor encoding based on first tensor's dtype # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie. @@ -220,7 +221,7 @@ class ModelBase: if weight_map is None or not isinstance(weight_map, dict): raise ValueError(f"Can't load 'weight_map' from {index_name!r}") tensor_names_from_index.update(weight_map.keys()) - part_dict: dict[str, None] = dict.fromkeys(weight_map.values(), None) + part_dict: dict[str, None] = dict.fromkeys(weight_map.values(), None) # ty: ignore[invalid-assignment] part_names = sorted(part_dict.keys()) else: weight_map = {} @@ -712,6 +713,7 @@ class ModelBase: def prepare_tensors(self): # detect NVFP4 quantization (ModelOpt format) quant_algo = (self.hparams.get("quantization_config") or {}).get("quant_algo") + quant_method = (self.hparams.get("quantization_config") or {}).get("quant_method") quant_layers = (self.hparams.get("quantization_config") or {}).get("quantized_layers") or {} quant_config_file = self.dir_model / "hf_quant_config.json" @@ -728,6 +730,7 @@ class ModelBase: quant_algo = "NVFP4" self._is_nvfp4 = quant_algo == "NVFP4" + self._is_mxfp4 = quant_method == "mxfp4" # NVFP4 weights are repacked and written directly to gguf_writer. # This must run before dequant_model so NVFP4 tensors are removed @@ -876,6 +879,12 @@ class ModelBase: if self.metadata.name is None: self.metadata.name = self.dir_model.name + if self.ftype in (gguf.LlamaFileType.ALL_F32, gguf.LlamaFileType.MOSTLY_F16, gguf.LlamaFileType.MOSTLY_BF16): + if self._is_nvfp4: + self.ftype = gguf.LlamaFileType.MOSTLY_NVFP4 + elif self._is_mxfp4: + self.ftype = gguf.LlamaFileType.MOSTLY_MXFP4_MOE + # Generate parameter weight class (useful for leader boards) if not yet determined if self.metadata.size_label is None and total_params > 0: self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count) @@ -5882,7 +5891,7 @@ class InternLM2Model(TextModel): logger.error(f'Error: Missing {tokenizer_path}') sys.exit(1) - sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] + sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix @@ -6203,7 +6212,7 @@ class BertModel(TextModel): vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size) else: - sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] + sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM @@ -8880,7 +8889,7 @@ class T5Model(TextModel): if not tokenizer_path.is_file(): raise FileNotFoundError(f"File not found: {tokenizer_path}") - sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] + sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) # some models like Pile-T5 family use BPE tokenizer instead of Unigram @@ -9017,7 +9026,7 @@ class T5EncoderModel(TextModel): if not tokenizer_path.is_file(): raise FileNotFoundError(f"File not found: {tokenizer_path}") - sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] + sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) # some models like Pile-T5 family use BPE tokenizer instead of Unigram @@ -11125,8 +11134,7 @@ class GptOssModel(TextModel): # TODO: remove once MXFP4 is supported more generally def dequant_model(self): - quant_config = self.hparams.get("quantization_config") - if quant_config is not None and quant_config.get("quant_method") == "mxfp4": + if self._is_mxfp4: return return super().dequant_model() @@ -12279,6 +12287,7 @@ class LazyTorchTensor(gguf.LazyBase): kwargs = {} if func is torch.Tensor.numpy: + assert len(args) return args[0].numpy() return cls._wrap_fn(func)(*args, **kwargs) diff --git a/convert_llama_ggml_to_gguf.py b/convert_llama_ggml_to_gguf.py index 29b14e98dd..52827e6690 100755 --- a/convert_llama_ggml_to_gguf.py +++ b/convert_llama_ggml_to_gguf.py @@ -112,11 +112,11 @@ class Tensor: (n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12]) assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}' assert name_len < 4096, 'Absurd tensor name length' - quant = gguf.GGML_QUANT_SIZES.get(dtype) + self.dtype = gguf.GGMLQuantizationType(dtype) + quant = gguf.GGML_QUANT_SIZES.get(self.dtype) assert quant is not None, 'Unknown tensor type' (blksize, tysize) = quant offset += 12 - self.dtype= gguf.GGMLQuantizationType(dtype) self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)]) offset += 4 * n_dims self.name = bytes(data[offset:offset + name_len]) diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 871ce82422..ee98d0cf97 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -199,10 +199,13 @@ class LoraTorchTensor: kwargs = {} if func is torch.permute: + assert len(args) return type(args[0]).permute(*args, **kwargs) elif func is torch.reshape: + assert len(args) return type(args[0]).reshape(*args, **kwargs) elif func is torch.stack: + assert len(args) assert isinstance(args[0], Sequence) dim = kwargs.get("dim", 0) assert dim == 0 @@ -211,6 +214,7 @@ class LoraTorchTensor: torch.stack([b._lora_B for b in args[0]], dim), ) elif func is torch.cat: + assert len(args) assert isinstance(args[0], Sequence) dim = kwargs.get("dim", 0) assert dim == 0 @@ -362,7 +366,7 @@ if __name__ == '__main__': logger.error(f"Model {hparams['architectures'][0]} is not supported") sys.exit(1) - class LoraModel(model_class): + class LoraModel(model_class): # ty: ignore[unsupported-base] model_arch = model_class.model_arch lora_alpha: float diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py index 018ba49b24..077fcfacac 100755 --- a/examples/json_schema_to_grammar.py +++ b/examples/json_schema_to_grammar.py @@ -28,9 +28,6 @@ def _build_repetition(item_rule, min_items, max_items, separator_rule=None): return f'({result})?' if min_items == 0 else result def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], out: list, decimals_left: int = 16, top_level: bool = True): - has_min = min_value != None - has_max = max_value != None - def digit_range(from_char: str, to_char: str): out.append("[") if from_char == to_char: @@ -106,7 +103,7 @@ def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], ou out.append(to_str[i]) out.append("]") - if has_min and has_max: + if min_value is not None and max_value is not None: if min_value < 0 and max_value < 0: out.append("\"-\" (") _generate_min_max_int(-max_value, -min_value, out, decimals_left, top_level=True) @@ -133,7 +130,7 @@ def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], ou less_decimals = max(decimals_left - 1, 1) - if has_min: + if min_value is not None: if min_value < 0: out.append("\"-\" (") _generate_min_max_int(None, -min_value, out, decimals_left, top_level=False) @@ -177,7 +174,7 @@ def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], ou more_digits(length - 1, less_decimals) return - if has_max: + if max_value is not None: if max_value >= 0: if top_level: out.append("\"-\" [1-9] ") diff --git a/examples/model-conversion/scripts/embedding/run-original-model.py b/examples/model-conversion/scripts/embedding/run-original-model.py index 0802cbcf4a..614c1a86b9 100755 --- a/examples/model-conversion/scripts/embedding/run-original-model.py +++ b/examples/model-conversion/scripts/embedding/run-original-model.py @@ -64,7 +64,7 @@ def load_model_and_tokenizer(model_path, use_sentence_transformers=False, device print("Using SentenceTransformer to apply all numbered layers") model = SentenceTransformer(model_path) tokenizer = model.tokenizer - config = model[0].auto_model.config # type: ignore + config = model[0].auto_model.config else: tokenizer = AutoTokenizer.from_pretrained(model_path) config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) @@ -108,8 +108,8 @@ def load_model_and_tokenizer(model_path, use_sentence_transformers=False, device print(f"Model file: {type(model).__module__}") # Verify the model is using the correct sliding window - if hasattr(model.config, 'sliding_window'): # type: ignore - print(f"Model's sliding_window: {model.config.sliding_window}") # type: ignore + if hasattr(model.config, 'sliding_window'): + print(f"Model's sliding_window: {model.config.sliding_window}") else: print("Model config does not have sliding_window attribute") @@ -152,7 +152,7 @@ def main(): device = next(model.parameters()).device else: # For SentenceTransformer, get device from the underlying model - device = next(model[0].auto_model.parameters()).device # type: ignore + device = next(model[0].auto_model.parameters()).device model_name = os.path.basename(model_path) @@ -177,7 +177,7 @@ def main(): print(f"{token_id:6d} -> '{token_str}'") print(f"Embeddings shape (after all SentenceTransformer layers): {all_embeddings.shape}") - print(f"Embedding dimension: {all_embeddings.shape[1] if len(all_embeddings.shape) > 1 else all_embeddings.shape[0]}") # type: ignore + print(f"Embedding dimension: {all_embeddings.shape[1] if len(all_embeddings.shape) > 1 else all_embeddings.shape[0]}") else: # Standard approach: use base model output only encoded = tokenizer( @@ -205,12 +205,12 @@ def main(): print(f"Embedding dimension: {all_embeddings.shape[1]}") if len(all_embeddings.shape) == 1: - n_embd = all_embeddings.shape[0] # type: ignore + n_embd = all_embeddings.shape[0] n_embd_count = 1 all_embeddings = all_embeddings.reshape(1, -1) else: - n_embd = all_embeddings.shape[1] # type: ignore - n_embd_count = all_embeddings.shape[0] # type: ignore + n_embd = all_embeddings.shape[1] + n_embd_count = all_embeddings.shape[0] print() diff --git a/examples/model-conversion/scripts/utils/compare_tokens.py b/examples/model-conversion/scripts/utils/compare_tokens.py index a286cb5683..62826ec7a6 100755 --- a/examples/model-conversion/scripts/utils/compare_tokens.py +++ b/examples/model-conversion/scripts/utils/compare_tokens.py @@ -2,7 +2,7 @@ import argparse import sys -from common import compare_tokens # type: ignore +from common import compare_tokens # type: ignore[import-not-found] def parse_arguments(): diff --git a/examples/pydantic_models_to_grammar.py b/examples/pydantic_models_to_grammar.py index 93e5dcb6c3..0cdd0b5709 100644 --- a/examples/pydantic_models_to_grammar.py +++ b/examples/pydantic_models_to_grammar.py @@ -6,7 +6,7 @@ import re from copy import copy from enum import Enum from inspect import getdoc, isclass -from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin, get_type_hints +from typing import TYPE_CHECKING, Any, Callable, Optional, Union, get_args, get_origin, get_type_hints from docstring_parser import parse from pydantic import BaseModel, create_model @@ -1158,7 +1158,7 @@ def create_dynamic_model_from_function(func: Callable[..., Any]): # Assert that the parameter has a type annotation if param.annotation == inspect.Parameter.empty: - raise TypeError(f"Parameter '{param.name}' in function '{func.__name__}' lacks a type annotation") + raise TypeError(f"""Parameter '{param.name}' in function '{getattr(func, "__name__", "")}' lacks a type annotation""") # Find the parameter's description in the docstring param_doc = next((d for d in docstring.params if d.arg_name == param.name), None) @@ -1166,7 +1166,7 @@ def create_dynamic_model_from_function(func: Callable[..., Any]): # Assert that the parameter has a description if not param_doc or not param_doc.description: raise ValueError( - f"Parameter '{param.name}' in function '{func.__name__}' lacks a description in the docstring") + f"""Parameter '{param.name}' in function '{getattr(func, "__name__", "")}' lacks a description in the docstring""") # Add parameter details to the schema param_docs.append((param.name, param_doc)) @@ -1177,7 +1177,7 @@ def create_dynamic_model_from_function(func: Callable[..., Any]): dynamic_fields[param.name] = ( param.annotation if param.annotation != inspect.Parameter.empty else str, default_value) # Creating the dynamic model - dynamic_model = create_model(f"{func.__name__}", **dynamic_fields) + dynamic_model = create_model(f"{getattr(func, '__name__')}", **dynamic_fields) for name, param_doc in param_docs: dynamic_model.model_fields[name].description = param_doc.description @@ -1285,7 +1285,7 @@ def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name: if items != {}: array = {"properties": items} array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items") - fields[field_name] = (List[array_type], ...) + fields[field_name] = (list[array_type], ...) # ty: ignore[invalid-type-form] else: fields[field_name] = (list, ...) elif field_type == "object": diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp index c89e5076f2..63ceb635de 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp @@ -3194,6 +3194,7 @@ class tinyBLAS_PPC { private: + __attribute__((always_inline)) inline void save_acc(acc_t * ACC, int64_t ii, int64_t jj) { vec_t vec_C[4]; __builtin_mma_disassemble_acc(vec_C, ACC); @@ -3204,6 +3205,7 @@ class tinyBLAS_PPC { } } + __attribute__((always_inline)) inline void add_save_acc(acc_t * ACC, int64_t ii, int64_t jj) { vec_t vec_C[4]; __builtin_mma_disassemble_acc(vec_C, ACC); diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp index d7c8ad8c16..5d8defad20 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp @@ -1162,12 +1162,18 @@ ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rp return nullptr; } + // Fix: Prevent division by zero if blck_size is 0 (e.g., deprecated types) + if (ggml_blck_size((enum ggml_type)tensor->type) == 0) { + GGML_LOG_ERROR("[%s] invalid tensor type received (blck_size is 0): %u\n", __func__, tensor->type); + return nullptr; + } + ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); // ggml_new_tensor_4d might fail if dimensions are invalid, although less likely to crash than invalid type if (result == nullptr) { - GGML_LOG_ERROR("[%s] ggml_new_tensor_4d failed for type %u\\n", __func__, tensor->type); + GGML_LOG_ERROR("[%s] ggml_new_tensor_4d failed for type %u\n", __func__, tensor->type); return nullptr; } diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 566958b3a9..221e6fa04e 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -16048,6 +16048,7 @@ static uint32_t ggml_vk_intel_shader_core_count(const vk::PhysicalDevice& vkdev) case 0xE20C: // B570 return 18; case 0xE20B: // B580 + case 0xE211: // Pro B60 return 20; default: return 0; diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 0a032e9039..c5f92c7700 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -3869,6 +3869,8 @@ class LlamaFileType(IntEnum): # MOSTLY_Q4_0_8_8 = 35 # removed from gguf files, use Q4_0 and runtime repack MOSTLY_TQ1_0 = 36 # except 1d tensors MOSTLY_TQ2_0 = 37 # except 1d tensors + MOSTLY_MXFP4_MOE = 38 # except 1d tensors + MOSTLY_NVFP4 = 39 # except 1d tensors GUESSED = 1024 # not specified in the model file diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 57f9fd1a52..5f653d386d 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -1300,7 +1300,7 @@ class GGUFWriter: else: raise ValueError("Invalid GGUF metadata value type or value") - return kv_data + return bytes(kv_data) @staticmethod def format_n_bytes_to_str(num: int) -> str: diff --git a/gguf-py/gguf/lazy.py b/gguf-py/gguf/lazy.py index c126f09c50..acbc79258a 100644 --- a/gguf-py/gguf/lazy.py +++ b/gguf-py/gguf/lazy.py @@ -138,7 +138,7 @@ class LazyBase(ABC, metaclass=LazyMeta): if isinstance(meta_noop, tuple): dtype, shape = meta_noop assert callable(shape) - res = cls.meta_with_dtype_and_shape(dtype, shape(res.shape)) + res = cls.meta_with_dtype_and_shape(dtype, shape(res.shape)) # ty: ignore[call-top-callable] else: res = cls.meta_with_dtype_and_shape(meta_noop, res.shape) diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py index 1cd519981a..1d9d9ab7d7 100644 --- a/gguf-py/gguf/quants.py +++ b/gguf-py/gguf/quants.py @@ -91,11 +91,11 @@ class __Quant(ABC): def __init_subclass__(cls, qtype: GGMLQuantizationType) -> None: cls.qtype = qtype cls.block_size, cls.type_size = GGML_QUANT_SIZES[qtype] - cls.__quantize_lazy = LazyNumpyTensor._wrap_fn( + cls.__quantize_lazy: Any = LazyNumpyTensor._wrap_fn( cls.__quantize_array, meta_noop=(np.uint8, cls.__shape_to_bytes) ) - cls.__dequantize_lazy = LazyNumpyTensor._wrap_fn( + cls.__dequantize_lazy: Any = LazyNumpyTensor._wrap_fn( cls.__dequantize_array, meta_noop=(np.float32, cls.__shape_from_bytes) ) diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index 028e5748e4..e4ab5e1e4b 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -11,33 +11,33 @@ from typing import Any, Callable, Sequence, Mapping, Iterable, Protocol, ClassVa try: from sentencepiece import SentencePieceProcessor except ImportError: - SentencePieceProcessor = None + SentencePieceProcessor: Any = None try: - from mistral_common.tokens.tokenizers.mistral import MistralTokenizer # pyright: ignore[reportMissingImports] - from mistral_common.tokens.tokenizers.tekken import Tekkenizer # pyright: ignore[reportMissingImports] - from mistral_common.tokens.tokenizers.utils import ( # pyright: ignore[reportMissingImports] + from mistral_common.tokens.tokenizers.mistral import MistralTokenizer # type: ignore[import-not-found] + from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found] + from mistral_common.tokens.tokenizers.utils import ( # type: ignore[import-not-found] _filter_valid_tokenizer_files, ) - from mistral_common.tokens.tokenizers.sentencepiece import ( # pyright: ignore[reportMissingImports] + from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found] SentencePieceTokenizer, ) except ImportError: _mistral_common_installed = False - MistralTokenizer = None - Tekkenizer = None - SentencePieceTokenizer = None - _filter_valid_tokenizer_files = None + MistralTokenizer: Any = None + Tekkenizer: Any = None + SentencePieceTokenizer: Any = None + _filter_valid_tokenizer_files: Any = None else: _mistral_common_installed = True try: - from mistral_common.tokens.tokenizers.utils import ( # pyright: ignore[reportMissingImports] + from mistral_common.tokens.tokenizers.utils import ( # type: ignore[import-not-found] get_one_valid_tokenizer_file, ) except ImportError: # We still want the conversion to work with older mistral-common versions. - get_one_valid_tokenizer_file = None + get_one_valid_tokenizer_file: Any = None import gguf @@ -703,7 +703,7 @@ class MistralVocab(Vocab): tokenizer_file_path = base_path / tokenizer_file - self.tokenizer = MistralTokenizer.from_file( + self.tokenizer: Any = MistralTokenizer.from_file( tokenizer_file_path ).instruct_tokenizer.tokenizer self.tokenizer_type = ( diff --git a/pyrightconfig.json b/pyrightconfig.json index a7bc007bdc..14d84fdbe7 100644 --- a/pyrightconfig.json +++ b/pyrightconfig.json @@ -1,5 +1,5 @@ { - "extraPaths": ["gguf-py", "examples/model-conversion/scripts"], + "extraPaths": ["gguf-py", "examples/model-conversion/scripts", "examples/model-conversion/scripts/utils"], "pythonVersion": "3.9", "pythonPlatform": "All", "reportUnusedImport": "warning", diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py index 14e75117c4..f43d24ebf1 100755 --- a/scripts/compare-llama-bench.py +++ b/scripts/compare-llama-bench.py @@ -684,6 +684,7 @@ else: sys.exit(1) +assert isinstance(hexsha8_baseline, str) name_baseline = bench_data.get_commit_name(hexsha8_baseline) hexsha8_compare = name_compare = None @@ -717,6 +718,7 @@ else: parser.print_help() sys.exit(1) +assert isinstance(hexsha8_compare, str) name_compare = bench_data.get_commit_name(hexsha8_compare) # Get tool-specific configuration diff --git a/scripts/jinja/jinja-tester.py b/scripts/jinja/jinja-tester.py index a489305ee7..4f79b8da3d 100755 --- a/scripts/jinja/jinja-tester.py +++ b/scripts/jinja/jinja-tester.py @@ -241,10 +241,10 @@ class CodeEditor(QPlainTextEdit): if not self.isReadOnly(): selection = QTextEdit.ExtraSelection() line_color = QColorConstants.Yellow.lighter(160) - selection.format.setBackground(line_color) # pyright: ignore[reportAttributeAccessIssue] - selection.format.setProperty(QTextFormat.Property.FullWidthSelection, True) # pyright: ignore[reportAttributeAccessIssue] - selection.cursor = self.textCursor() # pyright: ignore[reportAttributeAccessIssue] - selection.cursor.clearSelection() # pyright: ignore[reportAttributeAccessIssue] + selection.format.setBackground(line_color) # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + selection.format.setProperty(QTextFormat.Property.FullWidthSelection, True) # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + selection.cursor = self.textCursor() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + selection.cursor.clearSelection() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] extra_selections.append(selection) self.setExtraSelections(extra_selections) @@ -262,8 +262,8 @@ class CodeEditor(QPlainTextEdit): ) extra = QTextEdit.ExtraSelection() - extra.format.setBackground(color.lighter(160)) # pyright: ignore[reportAttributeAccessIssue] - extra.cursor = cursor # pyright: ignore[reportAttributeAccessIssue] + extra.format.setBackground(color.lighter(160)) # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + extra.cursor = cursor # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] self.setExtraSelections(self.extraSelections() + [extra]) @@ -274,8 +274,8 @@ class CodeEditor(QPlainTextEdit): cursor.select(QTextCursor.SelectionType.LineUnderCursor) extra = QTextEdit.ExtraSelection() - extra.format.setBackground(color.lighter(160)) # pyright: ignore[reportAttributeAccessIssue] - extra.cursor = cursor # pyright: ignore[reportAttributeAccessIssue] + extra.format.setBackground(color.lighter(160)) # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + extra.cursor = cursor # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] self.setExtraSelections(self.extraSelections() + [extra]) @@ -395,8 +395,8 @@ class JinjaTester(QMainWindow): ensure_ascii=ensure_ascii, ) ) - env.globals["strftime_now"] = lambda format: datetime.now().strftime(format) - env.globals["raise_exception"] = raise_exception + env.globals["strftime_now"] = lambda format: datetime.now().strftime(format) # ty: ignore[invalid-assignment] + env.globals["raise_exception"] = raise_exception # ty: ignore[invalid-assignment] try: template = env.from_string(template_str) output = template.render(context) diff --git a/scripts/server-bench.py b/scripts/server-bench.py index 202c35a486..1b557a495a 100755 --- a/scripts/server-bench.py +++ b/scripts/server-bench.py @@ -189,6 +189,7 @@ def benchmark( data: list[dict] = [] + assert isinstance(prompts, list) for i, p in enumerate(prompts): if seed_offset >= 0: random.seed(3 * (seed_offset + 1000 * i) + 1) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 8f25d47786..6aa73630c9 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1347,8 +1347,11 @@ int llama_context::encode(const llama_batch & batch_inp) { const llama_seq_id seq_id = ubatch.seq_id_unq[s]; const int32_t seq_idx = ubatch.seq_idx[seq_id]; - embd_seq_out[seq_id].resize(n_embd); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_idx)*sizeof(float), n_embd*sizeof(float)); + // use n_embd_out (not n_embd_inp) - the pooled embedding has the model's + // output dimension, which differs from input dimension for deepstack models (e.g. qwen3vl) + const uint32_t n_embd_out = hparams.n_embd_out(); + embd_seq_out[seq_id].resize(n_embd_out); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd_out*seq_idx)*sizeof(float), n_embd_out*sizeof(float)); } } break; case LLAMA_POOLING_TYPE_RANK: @@ -1769,12 +1772,16 @@ int llama_context::decode(const llama_batch & batch_inp) { // extract sequence embeddings (cleared before processing each batch) auto & embd_seq_out = embd_seq; + // use n_embd_out (not n_embd_inp) - the pooled embedding has the model's + // output dimension, which differs from input dimension for deepstack models (e.g. qwen3vl) + const uint32_t n_embd_out = hparams.n_embd_out(); + for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) { const llama_seq_id seq_id = ubatch.seq_id_unq[s]; const int32_t seq_idx = ubatch.seq_idx[seq_id]; - embd_seq_out[seq_id].resize(n_embd); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_idx)*sizeof(float), n_embd*sizeof(float)); + embd_seq_out[seq_id].resize(n_embd_out); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd_out*seq_idx)*sizeof(float), n_embd_out*sizeof(float)); } } break; case LLAMA_POOLING_TYPE_RANK: diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp index aac0d41f2b..badcbfd0fb 100644 --- a/src/llama-grammar.cpp +++ b/src/llama-grammar.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #define MAX_REPETITION_THRESHOLD 2000 @@ -454,6 +455,7 @@ const char * llama_grammar_parser::parse_sequence( bool is_nested) { size_t last_sym_start = rule.size(); const char * pos = src; + uint64_t n_prev_rules = 1; // use UINT64_MAX as the empty value because we aligned to the proper uint64_t type so -1 can't be used // (though it's technically the same as -1 now) @@ -481,6 +483,18 @@ const char * llama_grammar_parser::parse_sequence( // S' ::= S | llama_grammar_rule prev_rule(rule.begin() + last_sym_start, rule.end()); + // Calculate the total number of rules that will be generated by this repetition + uint64_t total_rules = 1; // Start with 1 for the original rule + if (!no_max && max_times > 0) { + total_rules = max_times; + } else if (min_times > 0) { + total_rules = min_times; + } + + if (n_prev_rules * total_rules >= MAX_REPETITION_THRESHOLD) { + throw std::runtime_error("number of rules that are going to be repeated multiplied by the new repetition exceeds sane defaults, please reduce the number of repetitions or rule complexity"); + } + if (min_times == 0) { rule.resize(last_sym_start); } else { @@ -508,12 +522,15 @@ const char * llama_grammar_parser::parse_sequence( if (n_opt > 0) { rule.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id}); } + n_prev_rules *= total_rules; + GGML_ASSERT(n_prev_rules >= 1); }; while (*pos) { if (*pos == '"') { // literal string pos++; last_sym_start = rule.size(); + n_prev_rules = 1; while (*pos != '"') { if (!*pos) { throw std::runtime_error("unexpected end of input"); @@ -531,6 +548,7 @@ const char * llama_grammar_parser::parse_sequence( start_type = LLAMA_GRETYPE_CHAR_NOT; } last_sym_start = rule.size(); + n_prev_rules = 1; while (*pos != ']') { if (!*pos) { throw std::runtime_error("unexpected end of input"); @@ -561,6 +579,7 @@ const char * llama_grammar_parser::parse_sequence( auto token_pair = parse_token(vocab, pos); const char * token_end = token_pair.second; last_sym_start = rule.size(); + n_prev_rules = 1; rule.push_back({type, token_pair.first}); pos = parse_space(token_end, is_nested); } else if (is_word_char(*pos)) { // rule reference @@ -568,12 +587,15 @@ const char * llama_grammar_parser::parse_sequence( uint32_t ref_rule_id = get_symbol_id(pos, name_end - pos); pos = parse_space(name_end, is_nested); last_sym_start = rule.size(); + n_prev_rules = 1; rule.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id}); } else if (*pos == '(') { // grouping // parse nested alternates into synthesized rule pos = parse_space(pos + 1, true); + uint32_t n_rules_before = symbol_ids.size(); uint32_t sub_rule_id = generate_symbol_id(rule_name); pos = parse_alternates(pos, rule_name, sub_rule_id, true); + n_prev_rules = std::max(1u, (uint32_t)symbol_ids.size() - n_rules_before); last_sym_start = rule.size(); // output reference to synthesized rule rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id}); @@ -583,6 +605,7 @@ const char * llama_grammar_parser::parse_sequence( pos = parse_space(pos + 1, is_nested); } else if (*pos == '.') { // any char last_sym_start = rule.size(); + n_prev_rules = 1; rule.push_back({LLAMA_GRETYPE_CHAR_ANY, 0}); pos = parse_space(pos + 1, is_nested); } else if (*pos == '*') { @@ -830,32 +853,54 @@ static bool llama_grammar_match_token( static void llama_grammar_advance_stack( const llama_grammar_rules & rules, const llama_grammar_stack & stack, - llama_grammar_stacks & new_stacks) { - if (stack.empty()) { - if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) { - new_stacks.emplace_back(stack); + llama_grammar_stacks & new_stacks) { + std::vector todo; + todo.push_back(stack); + + auto stack_cmp = [](const llama_grammar_stack & a, const llama_grammar_stack & b) { + return std::lexicographical_compare(a.begin(), a.end(), b.begin(), b.end(), + [](const llama_grammar_element * pa, const llama_grammar_element * pb) { + return pa < pb; // Compare pointer addresses + } + ); + }; + + std::set seen(stack_cmp); + + while (!todo.empty()) { + llama_grammar_stack curr_stack = std::move(todo.back()); + todo.pop_back(); + + if (seen.find( curr_stack) != seen.end()) { + continue; } - return; - } + seen.insert(curr_stack); - const llama_grammar_element * pos = stack.back(); + if (curr_stack.empty()) { + if (std::find(new_stacks.begin(), new_stacks.end(), curr_stack) == new_stacks.end()) { + new_stacks.emplace_back(std::move(curr_stack)); + } + continue; + } - switch (pos->type) { + const llama_grammar_element * pos = curr_stack.back(); + + switch (pos->type) { case LLAMA_GRETYPE_RULE_REF: { const size_t rule_id = static_cast(pos->value); const llama_grammar_element * subpos = rules[rule_id].data(); do { // init new stack without the top (pos) - llama_grammar_stack new_stack(stack.begin(), stack.end() - 1); + llama_grammar_stack next_stack(curr_stack.begin(), curr_stack.end() - 1); if (!llama_grammar_is_end_of_sequence(pos + 1)) { // if this rule ref is followed by another element, add that to stack - new_stack.push_back(pos + 1); + next_stack.push_back(pos + 1); } if (!llama_grammar_is_end_of_sequence(subpos)) { // if alternate is nonempty, add to stack - new_stack.push_back(subpos); + next_stack.push_back(subpos); } - llama_grammar_advance_stack(rules, new_stack, new_stacks); + todo.push_back(std::move(next_stack)); while (!llama_grammar_is_end_of_sequence(subpos)) { // scan to end of alternate def subpos++; @@ -874,9 +919,9 @@ static void llama_grammar_advance_stack( case LLAMA_GRETYPE_CHAR_ANY: case LLAMA_GRETYPE_TOKEN: case LLAMA_GRETYPE_TOKEN_NOT: - if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) { + if (std::find(new_stacks.begin(), new_stacks.end(), curr_stack) == new_stacks.end()) { // only add the stack if it's not a duplicate of one we already have - new_stacks.emplace_back(stack); + new_stacks.emplace_back(std::move(curr_stack)); } break; default: @@ -884,6 +929,7 @@ static void llama_grammar_advance_stack( // (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on // those GGML_ABORT("fatal error"); + } } } diff --git a/tests/test-chat-auto-parser.cpp b/tests/test-chat-auto-parser.cpp index 6abf71d6cf..0ba51ba235 100644 --- a/tests/test-chat-auto-parser.cpp +++ b/tests/test-chat-auto-parser.cpp @@ -22,6 +22,7 @@ static void test_calculate_diff_split_no_common(testing & t); static void test_calculate_diff_split_single_char(testing & t); static void test_calculate_diff_split_overlaps(testing & t); static void test_calculate_diff_split_tag_boundaries(testing & t); +static void test_calculate_diff_split_generation_prompt(testing & t); static void test_calculate_diff_split(testing & t); static void test_until_common_prefix_basic(testing & t); @@ -179,6 +180,7 @@ static void test_calculate_diff_split(testing & t) { t.test("calculate_diff_split single char", test_calculate_diff_split_single_char); t.test("calculate_diff_split overlaps", test_calculate_diff_split_overlaps); t.test("calculate_diff_split tag boundaries", test_calculate_diff_split_tag_boundaries); + t.test("calculate_diff_split generation prompt", test_calculate_diff_split_generation_prompt); } static void test_calculate_diff_split_basic(testing & t) { @@ -502,6 +504,39 @@ static void test_calculate_diff_split_tag_boundaries(testing & t) { } } +static void test_calculate_diff_split_generation_prompt(testing & t) { + // ChatML thinking template: left is a prefix of right, generation_prompt is the appended part. + // The trailing \n in left matches the trailing \n in the generation_prompt, causing + // the suffix matcher to steal it and rotate the diff result. + { + // Simplified reproduction: left ends with \n, right = left + "<|im_start|>assistant\n\n" + std::string left = "<|im_start|>user\nHello<|im_end|>\n"; + std::string right = left + "<|im_start|>assistant\n\n"; + diff_split result = calculate_diff_split(left, right); + t.assert_equal("chatml prefix", left, result.prefix); + t.assert_equal("chatml left", "", result.left); + t.assert_equal("chatml right should be generation prompt", + "<|im_start|>assistant\n\n", result.right); + t.assert_equal("chatml suffix", "", result.suffix); + } + + { + // More realistic: longer conversation ending with tool_response + std::string common = + "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + "<|im_start|>user\nSearch for files<|im_end|>\n" + "<|im_start|>assistant\n\nLet me search.\n\n\n" + "\n\n\n<|im_end|>\n" + "<|im_start|>user\n\nNo files found\n<|im_end|>\n"; + std::string left = common; + std::string right = common + "<|im_start|>assistant\n\n"; + diff_split result = calculate_diff_split(left, right); + t.assert_equal("tool_response left", "", result.left); + t.assert_equal("tool_response right should be generation prompt", + "<|im_start|>assistant\n\n", result.right); + } +} + static void test_until_common_prefix(testing & t) { t.test("until_common_prefix basic", test_until_common_prefix_basic); } diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index faac9e7306..575d240791 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -1337,7 +1337,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) { tst.test("I'm\nthinking\n\nHello, world!\nWhat's up?") .enable_thinking(true) .reasoning_format(COMMON_REASONING_FORMAT_NONE) - .expect_content("I'm\nthinking\n\nHello, world!\nWhat's up?") + .expect_content("\nI'm\nthinking\n\nHello, world!\nWhat's up?") .run(); tst.test("I'm\nthinking\n\nHello, world!\nWhat's up?") diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp index 526470a224..4d5d13dd0d 100644 --- a/tests/test-grammar-integration.cpp +++ b/tests/test-grammar-integration.cpp @@ -788,6 +788,24 @@ static void test_quantifiers() { "0xFF 0x12 0xAB 0x00 0x00 0x00", } ); + test_grammar( + "segfault", + // Grammar + R"""( + root ::= ( [x]* )* + )""", + // Passing strings + { + "", + "x", + "xx" + }, + // Failing strings + { + "y", + "yy" + } + ); } static void test_failure_missing_root() { diff --git a/tests/test-grammar-parser.cpp b/tests/test-grammar-parser.cpp index 03ae78ff73..6abc43461b 100644 --- a/tests/test-grammar-parser.cpp +++ b/tests/test-grammar-parser.cpp @@ -145,6 +145,10 @@ int main() root ::= "a"{,}" )"""); + verify_failure(R"""( + root ::= (((((([^x]*){0,99}){0,99}){0,99}){0,99}){0,99}){0,99} + )"""); + verify_failure(R"""( root ::= "a"{,10}" )"""); diff --git a/tests/test-llama-grammar.cpp b/tests/test-llama-grammar.cpp index fd45d5ada8..25f432a2f5 100644 --- a/tests/test-llama-grammar.cpp +++ b/tests/test-llama-grammar.cpp @@ -123,25 +123,27 @@ int main() std::vector> expected_stacks = { { - {LLAMA_GRETYPE_RULE_REF, 5}, + {LLAMA_GRETYPE_CHAR, 61}, + {LLAMA_GRETYPE_RULE_REF, 7}, + {LLAMA_GRETYPE_CHAR, 40}, + }, + { + {LLAMA_GRETYPE_CHAR, 61}, + {LLAMA_GRETYPE_RULE_REF, 7}, + {LLAMA_GRETYPE_RULE_REF, 3}, + {LLAMA_GRETYPE_CHAR, 48}, + }, + { + {LLAMA_GRETYPE_CHAR, 61}, + {LLAMA_GRETYPE_RULE_REF, 7}, + {LLAMA_GRETYPE_RULE_REF, 3}, + {LLAMA_GRETYPE_CHAR, 48}, + }, + { {LLAMA_GRETYPE_CHAR, 61}, {LLAMA_GRETYPE_RULE_REF, 7}, {LLAMA_GRETYPE_CHAR, 97}, }, - { - {LLAMA_GRETYPE_RULE_REF, 5}, - {LLAMA_GRETYPE_CHAR, 61}, - {LLAMA_GRETYPE_RULE_REF, 7}, - {LLAMA_GRETYPE_RULE_REF, 3}, - {LLAMA_GRETYPE_CHAR, 48}, - }, - { - {LLAMA_GRETYPE_RULE_REF, 5}, - {LLAMA_GRETYPE_CHAR, 61}, - {LLAMA_GRETYPE_RULE_REF, 7}, - {LLAMA_GRETYPE_RULE_REF, 3}, - {LLAMA_GRETYPE_CHAR, 48}, - }, { {LLAMA_GRETYPE_RULE_REF, 5}, {LLAMA_GRETYPE_CHAR, 61}, @@ -149,26 +151,24 @@ int main() {LLAMA_GRETYPE_CHAR, 40}, }, { + {LLAMA_GRETYPE_RULE_REF, 5}, + {LLAMA_GRETYPE_CHAR, 61}, + {LLAMA_GRETYPE_RULE_REF, 7}, + {LLAMA_GRETYPE_RULE_REF, 3}, + {LLAMA_GRETYPE_CHAR, 48}, + }, + { + {LLAMA_GRETYPE_RULE_REF, 5}, + {LLAMA_GRETYPE_CHAR, 61}, + {LLAMA_GRETYPE_RULE_REF, 7}, + {LLAMA_GRETYPE_RULE_REF, 3}, + {LLAMA_GRETYPE_CHAR, 48}, + }, + { + {LLAMA_GRETYPE_RULE_REF, 5}, {LLAMA_GRETYPE_CHAR, 61}, {LLAMA_GRETYPE_RULE_REF, 7}, {LLAMA_GRETYPE_CHAR, 97}, - }, - { - {LLAMA_GRETYPE_CHAR, 61}, - {LLAMA_GRETYPE_RULE_REF, 7}, - {LLAMA_GRETYPE_RULE_REF, 3}, - {LLAMA_GRETYPE_CHAR, 48}, - }, - { - {LLAMA_GRETYPE_CHAR, 61}, - {LLAMA_GRETYPE_RULE_REF, 7}, - {LLAMA_GRETYPE_RULE_REF, 3}, - {LLAMA_GRETYPE_CHAR, 48}, - }, - { - {LLAMA_GRETYPE_CHAR, 61}, - {LLAMA_GRETYPE_RULE_REF, 7}, - {LLAMA_GRETYPE_CHAR, 40}, }}; auto index = 0; @@ -195,9 +195,9 @@ int main() } std::vector next_candidates; - next_candidates.resize(24); + next_candidates.resize(23); - for (size_t i = 0; i < 24; ++i) + for (size_t i = 0; i < 23; ++i) { uint32_t *cp = new uint32_t[2]; // dynamically allocate memory for code_point cp[0] = 37 + i; @@ -210,7 +210,6 @@ int main() {0, 37}, {1, 38}, {2, 39}, - {3, 40}, {4, 41}, {5, 42}, {6, 43}, @@ -268,6 +267,7 @@ int main() {0, 37}, {1, 38}, {2, 39}, + {3, 40}, {4, 41}, {5, 42}, {6, 43}, @@ -287,13 +287,11 @@ int main() {20, 57}, {21, 58}, {22, 59}, - {23, 60}, }, { {0, 37}, {1, 38}, {2, 39}, - {3, 40}, {4, 41}, {5, 42}, {6, 43}, @@ -351,6 +349,7 @@ int main() {0, 37}, {1, 38}, {2, 39}, + {3, 40}, {4, 41}, {5, 42}, {6, 43}, @@ -370,7 +369,6 @@ int main() {20, 57}, {21, 58}, {22, 59}, - {23, 60}, }, }; diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py index 93e697607e..25af4ee63b 100644 --- a/tests/test-tokenizer-random.py +++ b/tests/test-tokenizer-random.py @@ -16,8 +16,7 @@ import random import unicodedata from pathlib import Path -from typing import Any, Iterator, cast -from typing_extensions import Buffer +from typing import Any, Iterator import cffi from transformers import AutoTokenizer, PreTrainedTokenizer @@ -114,7 +113,7 @@ class LibLlamaModel: while num < 0 and len(self.text_buff) < (16 << 20): self.text_buff = self.ffi.new("uint8_t[]", -2 * num) num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special) - return str(cast(Buffer, self.ffi.buffer(self.text_buff, num)), encoding="utf-8", errors="replace") # replace errors with '\uFFFD' + return str(self.ffi.buffer(self.text_buff, num), encoding="utf-8", errors="replace") # replace errors with '\uFFFD' # pyright: ignore[reportArgumentType] class Tokenizer: @@ -438,7 +437,7 @@ def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLl decode_errors = 0 MAX_ERRORS = 10 - logger.info("%s: %s" % (generator.__qualname__, "ini")) + logger.info("%s: %s" % (getattr(generator, "__qualname__", ""), "ini")) for text in generator: # print(repr(text), text.encode()) # print(repr(text), hex(ord(text[0])), text.encode()) @@ -477,7 +476,7 @@ def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLl break t_total = time.perf_counter() - t_start - logger.info(f"{generator.__qualname__}: end, {t_encode1=:.3f} {t_encode2=:.3f} {t_decode1=:.3f} {t_decode2=:.3f} {t_total=:.3f}") + logger.info(f"{getattr(generator, '__qualname__', '')}: end, {t_encode1=:.3f} {t_encode2=:.3f} {t_decode1=:.3f} {t_decode2=:.3f} {t_total=:.3f}") def main(argv: list[str] | None = None): diff --git a/tools/cli/README.md b/tools/cli/README.md index 22d3fc87e9..c344cab2a8 100644 --- a/tools/cli/README.md +++ b/tools/cli/README.md @@ -83,7 +83,7 @@ | `-m, --model FNAME` | model path to load
(env: LLAMA_ARG_MODEL) | | `-mu, --model-url MODEL_URL` | model download url (default: unused)
(env: LLAMA_ARG_MODEL_URL) | | `-dr, --docker-repo [/][:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.
example: gemma3
(default: unused)
(env: LLAMA_ARG_DOCKER_REPO) | -| `-hf, -hfr, --hf-repo /[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.
mmproj is also downloaded automatically if available. to disable, add --no-mmproj
example: unsloth/phi-4-GGUF:q4_k_m
(default: unused)
(env: LLAMA_ARG_HF_REPO) | +| `-hf, -hfr, --hf-repo /[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.
mmproj is also downloaded automatically if available. to disable, add --no-mmproj
example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M
(default: unused)
(env: LLAMA_ARG_HF_REPO) | | `-hfd, -hfrd, --hf-repo-draft /[:quant]` | Same as --hf-repo, but for the draft model (default: unused)
(env: LLAMA_ARG_HFD_REPO) | | `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)
(env: LLAMA_ARG_HF_FILE) | | `-hfv, -hfrv, --hf-repo-v /[:quant]` | Hugging Face model repository for the vocoder model (default: unused)
(env: LLAMA_ARG_HF_REPO_V) | diff --git a/tools/completion/README.md b/tools/completion/README.md index f868c2c7d7..b5eeba7334 100644 --- a/tools/completion/README.md +++ b/tools/completion/README.md @@ -166,7 +166,7 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `-m, --model FNAME` | model path to load
(env: LLAMA_ARG_MODEL) | | `-mu, --model-url MODEL_URL` | model download url (default: unused)
(env: LLAMA_ARG_MODEL_URL) | | `-dr, --docker-repo [/][:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.
example: gemma3
(default: unused)
(env: LLAMA_ARG_DOCKER_REPO) | -| `-hf, -hfr, --hf-repo /[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.
mmproj is also downloaded automatically if available. to disable, add --no-mmproj
example: unsloth/phi-4-GGUF:q4_k_m
(default: unused)
(env: LLAMA_ARG_HF_REPO) | +| `-hf, -hfr, --hf-repo /[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.
mmproj is also downloaded automatically if available. to disable, add --no-mmproj
example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M
(default: unused)
(env: LLAMA_ARG_HF_REPO) | | `-hfd, -hfrd, --hf-repo-draft /[:quant]` | Same as --hf-repo, but for the draft model (default: unused)
(env: LLAMA_ARG_HFD_REPO) | | `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)
(env: LLAMA_ARG_HF_FILE) | | `-hfv, -hfrv, --hf-repo-v /[:quant]` | Hugging Face model repository for the vocoder model (default: unused)
(env: LLAMA_ARG_HF_REPO_V) | diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index 10bf6d8ac1..00c6536589 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -418,7 +418,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -m, --model (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); printf(" -hf, -hfr, --hf-repo /[:quant] Hugging Face model repository; quant is optional, case-insensitive\n"); printf(" default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"); - printf(" example: unsloth/phi-4-GGUF:Q4_K_M\n"); + printf(" example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M\n"); printf(" (default: unused)\n"); printf(" -hff, --hf-file Hugging Face model file. If specified, it will override the quant in --hf-repo\n"); printf(" (default: unused)\n"); diff --git a/tools/server/README.md b/tools/server/README.md index df59e2d9b7..554444d74b 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -100,7 +100,7 @@ For the full list of features, please refer to [server's changelog](https://gith | `-m, --model FNAME` | model path to load
(env: LLAMA_ARG_MODEL) | | `-mu, --model-url MODEL_URL` | model download url (default: unused)
(env: LLAMA_ARG_MODEL_URL) | | `-dr, --docker-repo [/][:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.
example: gemma3
(default: unused)
(env: LLAMA_ARG_DOCKER_REPO) | -| `-hf, -hfr, --hf-repo /[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.
mmproj is also downloaded automatically if available. to disable, add --no-mmproj
example: unsloth/phi-4-GGUF:q4_k_m
(default: unused)
(env: LLAMA_ARG_HF_REPO) | +| `-hf, -hfr, --hf-repo /[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.
mmproj is also downloaded automatically if available. to disable, add --no-mmproj
example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M
(default: unused)
(env: LLAMA_ARG_HF_REPO) | | `-hfd, -hfrd, --hf-repo-draft /[:quant]` | Same as --hf-repo, but for the draft model (default: unused)
(env: LLAMA_ARG_HFD_REPO) | | `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)
(env: LLAMA_ARG_HF_FILE) | | `-hfv, -hfrv, --hf-repo-v /[:quant]` | Hugging Face model repository for the vocoder model (default: unused)
(env: LLAMA_ARG_HF_REPO_V) | diff --git a/tools/server/bench/bench.py b/tools/server/bench/bench.py index 0c57a2df04..c816816eaf 100644 --- a/tools/server/bench/bench.py +++ b/tools/server/bench/bench.py @@ -285,7 +285,7 @@ def start_server_background(args): } server_process = subprocess.Popen( args, - **pkwargs) # pyright: ignore[reportArgumentType, reportCallIssue] + **pkwargs) # pyright: ignore[reportArgumentType, reportCallIssue] # ty: ignore[no-matching-overload] def server_log(in_stream, out_stream): for line in iter(in_stream.readline, b''): diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 39d232c2e4..7d543b9292 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -415,6 +415,7 @@ task_params server_task::params_from_json_cmpl( params.chat_parser_params.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY); params.chat_parser_params.generation_prompt = json_value(data, "generation_prompt", std::string()); params.sampling.generation_prompt = params.chat_parser_params.generation_prompt; + SRV_DBG("Generation prompt: '%s'\n", params.chat_parser_params.generation_prompt.c_str()); params.chat_parser_params.parse_tool_calls = json_value(data, "parse_tool_calls", false); if (data.contains("chat_parser")) { params.chat_parser_params.parser.load(data.at("chat_parser").get()); diff --git a/tools/server/tests/unit/test_tool_call.py b/tools/server/tests/unit/test_tool_call.py index ba41cd44ea..b1a5ab9da4 100755 --- a/tools/server/tests/unit/test_tool_call.py +++ b/tools/server/tests/unit/test_tool_call.py @@ -9,6 +9,7 @@ sys.path.insert(0, str(path)) from utils import * from enum import Enum +from typing import TypedDict server: ServerProcess @@ -29,56 +30,73 @@ class CompletionMode(Enum): NORMAL = "normal" STREAMED = "streamed" -TEST_TOOL = { - "type":"function", - "function": { - "name": "test", - "description": "", - "parameters": { - "type": "object", - "properties": { - "success": {"type": "boolean", "const": True}, - }, - "required": ["success"] - } - } -} +class ToolParameters(TypedDict): + type: str + properties: dict[str, dict] + required: list[str] -PYTHON_TOOL = { - "type": "function", - "function": { - "name": "python", - "description": "Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.", - "parameters": { - "type": "object", - "properties": { +class ToolFunction(TypedDict): + name: str + description: str + parameters: ToolParameters + +class ToolDefinition(TypedDict): + type: str + function: ToolFunction + +TEST_TOOL = ToolDefinition( + type = "function", + function = ToolFunction( + name = "test", + description = "", + parameters = ToolParameters( + type = "object", + properties = { + "success": { + "type": "boolean", + "const": True, + }, + }, + required = ["success"], + ), + ), +) + +PYTHON_TOOL = ToolDefinition( + type = "function", + function = ToolFunction( + name = "python", + description = "Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.", + parameters = ToolParameters( + type = "object", + properties = { "code": { "type": "string", - "description": "The code to run in the ipython interpreter." - } + "description": "The code to run in the ipython interpreter.", + }, }, - "required": ["code"] - } - } -} + required = ["code"], + ), + ), +) -WEATHER_TOOL = { - "type":"function", - "function":{ - "name":"get_current_weather", - "description":"Get the current weather in a given location", - "parameters":{ - "type":"object", - "properties":{ - "location":{ - "type":"string", - "description":"The city and country/state, e.g. 'San Francisco, CA', or 'Paris, France'" - } - }, - "required":["location"] - } - } -} +WEATHER_TOOL = ToolDefinition( + type = "function", + function = ToolFunction( + name = "get_current_weather", + description = "Get the current weather in a given location", + parameters = ToolParameters( + type = "object", + properties = { + "location": { + "type": "string", + "description": "The city and country/state, e.g. 'San Francisco, CA', or 'Paris, France'", + }, + }, + required = ["location"], + ), + ), +) def do_test_completion_with_required_tool_tiny(server: ServerProcess, tool: dict, argument_key: str | None, n_predict, **kwargs): body = server.make_any_request("POST", "/v1/chat/completions", data={ diff --git a/tools/server/webui/src/lib/constants/settings-config.ts b/tools/server/webui/src/lib/constants/settings-config.ts index 39aaf561bb..ae9dd3ce8f 100644 --- a/tools/server/webui/src/lib/constants/settings-config.ts +++ b/tools/server/webui/src/lib/constants/settings-config.ts @@ -127,7 +127,7 @@ export const SETTING_CONFIG_INFO: Record = { fullHeightCodeBlocks: 'Always display code blocks at their full natural height, overriding any height limits.', showRawModelNames: - 'Display full raw model identifiers (e.g. "unsloth/Qwen3.5-27B-GGUF:BF16") instead of parsed names with badges.', + 'Display full raw model identifiers (e.g. "ggml-org/GLM-4.7-Flash-GGUF:Q8_0") instead of parsed names with badges.', mcpServers: 'Configure MCP servers as a JSON list. Use the form in the MCP Client settings section to edit.', mcpServerUsageStats: diff --git a/tools/server/webui/src/lib/stores/models.svelte.ts b/tools/server/webui/src/lib/stores/models.svelte.ts index a6d7d6572f..50c32034a6 100644 --- a/tools/server/webui/src/lib/stores/models.svelte.ts +++ b/tools/server/webui/src/lib/stores/models.svelte.ts @@ -457,7 +457,7 @@ class ModelsStore { /** * Select a model by its model name (used for syncing with conversation model) - * @param modelName - Model name to select (e.g., "unsloth/gemma-3-12b-it-GGUF:latest") + * @param modelName - Model name to select (e.g., "ggml-org/GLM-4.7-Flash-GGUF") */ selectModelByName(modelName: string): void { const option = this.models.find((model) => model.model === modelName); diff --git a/ty.toml b/ty.toml new file mode 100644 index 0000000000..bcd23db9b8 --- /dev/null +++ b/ty.toml @@ -0,0 +1,30 @@ +[environment] +extra-paths = ["./gguf-py", "./examples/model-conversion/scripts", "./tools/server/tests"] +python-version = "3.10" + +[rules] +deprecated = "warn" + +[src] +exclude = [ + "./tools/mtmd/legacy-models/**", +] + +[[overrides]] +include = [ + "./tools/server/tests/**", +] + +[overrides.rules] +unresolved-reference = "ignore" +unresolved-import = "ignore" +unresolved-attribute = "ignore" + +[[overrides]] +include = [ + "./examples/pydantic_models_to_grammar.py", +] + +[overrides.rules] +unsupported-operator = "ignore" +not-subscriptable = "ignore"