From e6ec21e62f45a247d3a3d2cb8f164141853b1a9b Mon Sep 17 00:00:00 2001 From: shalinib-ibm Date: Sat, 21 Mar 2026 04:41:45 +0530 Subject: [PATCH 01/10] ggml-cpu: add always_inline to tinyBLAS_PPC accumulator saves (#20791) Explicitly mark save_acc and add_save_acc with always_inline in tinyBLAS_PPC. This ensures the compiler keeps MMA accumulator disassembly within the kernel's register context, preventing unnecessary stack spills. Signed-off-by: Shalini Salomi Bodapati --- ggml/src/ggml-cpu/llamafile/sgemm.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp index c89e5076f2..63ceb635de 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp @@ -3194,6 +3194,7 @@ class tinyBLAS_PPC { private: + __attribute__((always_inline)) inline void save_acc(acc_t * ACC, int64_t ii, int64_t jj) { vec_t vec_C[4]; __builtin_mma_disassemble_acc(vec_C, ACC); @@ -3204,6 +3205,7 @@ class tinyBLAS_PPC { } } + __attribute__((always_inline)) inline void add_save_acc(acc_t * ACC, int64_t ii, int64_t jj) { vec_t vec_C[4]; __builtin_mma_disassemble_acc(vec_C, ACC); From b1c70e2e5419ced91eec570b9aabc050afe185e1 Mon Sep 17 00:00:00 2001 From: "Piotr Wilkin (ilintar)" Date: Sat, 21 Mar 2026 00:19:04 +0100 Subject: [PATCH 02/10] common/parser: fix nasty bug causing subtle corruption of generation prompt (#20825) --- common/chat-auto-parser-helpers.cpp | 15 +++++++++++++ tests/test-chat-auto-parser.cpp | 35 +++++++++++++++++++++++++++++ tests/test-chat.cpp | 2 +- tools/server/server-task.cpp | 1 + 4 files changed, 52 insertions(+), 1 deletion(-) diff --git a/common/chat-auto-parser-helpers.cpp b/common/chat-auto-parser-helpers.cpp index 9dcdde2501..3a7a5c13a7 100644 --- a/common/chat-auto-parser-helpers.cpp +++ b/common/chat-auto-parser-helpers.cpp @@ -188,6 +188,21 @@ diff_split calculate_diff_split(const std::string & left, const std::string & ri result.suffix = ""; // pick 
prefix = all as representation } + + // When left has no unique content (result.left is empty), left is entirely + // shared with right. The simultaneous prefix/suffix segment matching can + // incorrectly consume trailing segments of left as suffix when those same + // segments also appear at the end of right (e.g. "\n" at the end of both + // the shared content and the generation prompt). This rotates the diff. + // Fix: if left is a prefix of right, enforce that directly. + if (result.left.empty() && !result.right.empty() && + left.size() <= right.size() && + right.substr(0, left.size()) == left) { + result.prefix = left; + result.suffix = ""; + result.right = right.substr(left.size()); + } + return result; } diff --git a/tests/test-chat-auto-parser.cpp b/tests/test-chat-auto-parser.cpp index 6abf71d6cf..0ba51ba235 100644 --- a/tests/test-chat-auto-parser.cpp +++ b/tests/test-chat-auto-parser.cpp @@ -22,6 +22,7 @@ static void test_calculate_diff_split_no_common(testing & t); static void test_calculate_diff_split_single_char(testing & t); static void test_calculate_diff_split_overlaps(testing & t); static void test_calculate_diff_split_tag_boundaries(testing & t); +static void test_calculate_diff_split_generation_prompt(testing & t); static void test_calculate_diff_split(testing & t); static void test_until_common_prefix_basic(testing & t); @@ -179,6 +180,7 @@ static void test_calculate_diff_split(testing & t) { t.test("calculate_diff_split single char", test_calculate_diff_split_single_char); t.test("calculate_diff_split overlaps", test_calculate_diff_split_overlaps); t.test("calculate_diff_split tag boundaries", test_calculate_diff_split_tag_boundaries); + t.test("calculate_diff_split generation prompt", test_calculate_diff_split_generation_prompt); } static void test_calculate_diff_split_basic(testing & t) { @@ -502,6 +504,39 @@ static void test_calculate_diff_split_tag_boundaries(testing & t) { } } +static void 
test_calculate_diff_split_generation_prompt(testing & t) { + // ChatML thinking template: left is a prefix of right, generation_prompt is the appended part. + // The trailing \n in left matches the trailing \n in the generation_prompt, causing + // the suffix matcher to steal it and rotate the diff result. + { + // Simplified reproduction: left ends with \n, right = left + "<|im_start|>assistant\n\n" + std::string left = "<|im_start|>user\nHello<|im_end|>\n"; + std::string right = left + "<|im_start|>assistant\n\n"; + diff_split result = calculate_diff_split(left, right); + t.assert_equal("chatml prefix", left, result.prefix); + t.assert_equal("chatml left", "", result.left); + t.assert_equal("chatml right should be generation prompt", + "<|im_start|>assistant\n\n", result.right); + t.assert_equal("chatml suffix", "", result.suffix); + } + + { + // More realistic: longer conversation ending with tool_response + std::string common = + "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + "<|im_start|>user\nSearch for files<|im_end|>\n" + "<|im_start|>assistant\n\nLet me search.\n\n\n" + "\n\n\n<|im_end|>\n" + "<|im_start|>user\n\nNo files found\n<|im_end|>\n"; + std::string left = common; + std::string right = common + "<|im_start|>assistant\n\n"; + diff_split result = calculate_diff_split(left, right); + t.assert_equal("tool_response left", "", result.left); + t.assert_equal("tool_response right should be generation prompt", + "<|im_start|>assistant\n\n", result.right); + } +} + static void test_until_common_prefix(testing & t) { t.test("until_common_prefix basic", test_until_common_prefix_basic); } diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index faac9e7306..575d240791 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -1337,7 +1337,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) { tst.test("I'm\nthinking\n\nHello, world!\nWhat's up?") .enable_thinking(true) .reasoning_format(COMMON_REASONING_FORMAT_NONE) - 
.expect_content("I'm\nthinking\n\nHello, world!\nWhat's up?") + .expect_content("\nI'm\nthinking\n\nHello, world!\nWhat's up?") .run(); tst.test("I'm\nthinking\n\nHello, world!\nWhat's up?") diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 39d232c2e4..7d543b9292 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -415,6 +415,7 @@ task_params server_task::params_from_json_cmpl( params.chat_parser_params.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY); params.chat_parser_params.generation_prompt = json_value(data, "generation_prompt", std::string()); params.sampling.generation_prompt = params.chat_parser_params.generation_prompt; + SRV_DBG("Generation prompt: '%s'\n", params.chat_parser_params.generation_prompt.c_str()); params.chat_parser_params.parse_tool_calls = json_value(data, "parse_tool_calls", false); if (data.contains("chat_parser")) { params.chat_parser_params.parser.load(data.at("chat_parser").get()); From cea560f483f0f03e828a6c76e78821debdecbe06 Mon Sep 17 00:00:00 2001 From: Matt Corallo <649246+TheBlueMatt@users.noreply.github.com> Date: Sat, 21 Mar 2026 04:22:51 +0000 Subject: [PATCH 03/10] Add shader count for Intel Arc Pro B60 (#20818) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 566958b3a9..221e6fa04e 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -16048,6 +16048,7 @@ static uint32_t ggml_vk_intel_shader_core_count(const vk::PhysicalDevice& vkdev) case 0xE20C: // B570 return 18; case 0xE20B: // B580 + case 0xE211: // Pro B60 return 20; default: return 0; From 29b28a98242692e352df38254c9a2bb9fa79c884 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sat, 21 Mar 2026 08:54:34 +0100 Subject: [PATCH 04/10] ci : switch from pyright to ty 
(#20826) * type fixes * switch to ty * tweak rules * tweak more rules * more tweaks * final tweak * use common import-not-found rule --- .github/workflows/python-type-check.yml | 27 +++-- convert_hf_to_gguf.py | 25 ++-- convert_llama_ggml_to_gguf.py | 4 +- convert_lora_to_gguf.py | 6 +- examples/json_schema_to_grammar.py | 9 +- .../scripts/embedding/run-original-model.py | 16 +-- .../scripts/utils/compare_tokens.py | 2 +- examples/pydantic_models_to_grammar.py | 10 +- gguf-py/gguf/gguf_writer.py | 2 +- gguf-py/gguf/lazy.py | 2 +- gguf-py/gguf/quants.py | 4 +- gguf-py/gguf/vocab.py | 24 ++-- pyrightconfig.json | 2 +- scripts/compare-llama-bench.py | 2 + scripts/jinja/jinja-tester.py | 20 ++-- scripts/server-bench.py | 1 + tests/test-tokenizer-random.py | 9 +- tools/server/bench/bench.py | 2 +- tools/server/tests/unit/test_tool_call.py | 108 ++++++++++-------- ty.toml | 30 +++++ 20 files changed, 181 insertions(+), 124 deletions(-) create mode 100644 ty.toml diff --git a/.github/workflows/python-type-check.yml b/.github/workflows/python-type-check.yml index e801a9f42e..2c62678726 100644 --- a/.github/workflows/python-type-check.yml +++ b/.github/workflows/python-type-check.yml @@ -4,15 +4,17 @@ on: push: paths: - '.github/workflows/python-type-check.yml' - - 'pyrightconfig.json' + - 'ty.toml' - '**.py' - '**/requirements*.txt' + # - 'pyrightconfig.json' pull_request: paths: - '.github/workflows/python-type-check.yml' - - 'pyrightconfig.json' + - 'ty.toml' - '**.py' - '**/requirements*.txt' + # - 'pyrightconfig.json' concurrency: group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} @@ -20,8 +22,8 @@ concurrency: jobs: python-type-check: - runs-on: ubuntu-latest - name: pyright type-check + runs-on: ubuntu-slim + name: python type-check steps: - name: Check out source repository uses: actions/checkout@v6 @@ -29,10 +31,13 @@ jobs: uses: actions/setup-python@v6 with: python-version: "3.11" - pip-install: -r requirements/requirements-all.txt 
- - name: Type-check with Pyright - uses: jakebailey/pyright-action@v2 - with: - version: 1.1.382 - level: warning - warnings: true + pip-install: -r requirements/requirements-all.txt ty==0.0.24 + # - name: Type-check with Pyright + # uses: jakebailey/pyright-action@v2 + # with: + # version: 1.1.382 + # level: warning + # warnings: true + - name: Type-check with ty + run: | + ty check --output-format=github diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 8cfd0bf2f5..087e9f926f 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -31,10 +31,10 @@ import gguf from gguf.vocab import MistralTokenizerType, MistralVocab try: - from mistral_common.tokens.tokenizers.base import TokenizerVersion # pyright: ignore[reportMissingImports] - from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # pyright: ignore[reportMissingImports] - from mistral_common.tokens.tokenizers.tekken import Tekkenizer # pyright: ignore[reportMissingImports] - from mistral_common.tokens.tokenizers.sentencepiece import ( # pyright: ignore[reportMissingImports] + from mistral_common.tokens.tokenizers.base import TokenizerVersion # type: ignore[import-not-found] + from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # type: ignore[import-not-found] + from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found] + from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found] SentencePieceTokenizer, ) @@ -45,9 +45,9 @@ except ImportError: _MISTRAL_COMMON_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) _mistral_common_installed = False - TokenizerVersion = None - Tekkenizer = None - SentencePieceTokenizer = None + TokenizerVersion: Any = None + Tekkenizer: Any = None + SentencePieceTokenizer: Any = None _mistral_import_error_msg 
= ( "Mistral format requires `mistral-common` to be installed. Please run " "`pip install mistral-common[image,audio]` to install it." @@ -220,7 +220,7 @@ class ModelBase: if weight_map is None or not isinstance(weight_map, dict): raise ValueError(f"Can't load 'weight_map' from {index_name!r}") tensor_names_from_index.update(weight_map.keys()) - part_dict: dict[str, None] = dict.fromkeys(weight_map.values(), None) + part_dict: dict[str, None] = dict.fromkeys(weight_map.values(), None) # ty: ignore[invalid-assignment] part_names = sorted(part_dict.keys()) else: weight_map = {} @@ -5882,7 +5882,7 @@ class InternLM2Model(TextModel): logger.error(f'Error: Missing {tokenizer_path}') sys.exit(1) - sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] + sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix @@ -6203,7 +6203,7 @@ class BertModel(TextModel): vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size) else: - sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] + sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM @@ -8880,7 +8880,7 @@ class T5Model(TextModel): if not tokenizer_path.is_file(): raise FileNotFoundError(f"File not found: {tokenizer_path}") - sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] + sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) # some models like Pile-T5 family use BPE 
tokenizer instead of Unigram @@ -9017,7 +9017,7 @@ class T5EncoderModel(TextModel): if not tokenizer_path.is_file(): raise FileNotFoundError(f"File not found: {tokenizer_path}") - sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] + sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) # some models like Pile-T5 family use BPE tokenizer instead of Unigram @@ -12279,6 +12279,7 @@ class LazyTorchTensor(gguf.LazyBase): kwargs = {} if func is torch.Tensor.numpy: + assert len(args) return args[0].numpy() return cls._wrap_fn(func)(*args, **kwargs) diff --git a/convert_llama_ggml_to_gguf.py b/convert_llama_ggml_to_gguf.py index 29b14e98dd..52827e6690 100755 --- a/convert_llama_ggml_to_gguf.py +++ b/convert_llama_ggml_to_gguf.py @@ -112,11 +112,11 @@ class Tensor: (n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12]) assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}' assert name_len < 4096, 'Absurd tensor name length' - quant = gguf.GGML_QUANT_SIZES.get(dtype) + self.dtype = gguf.GGMLQuantizationType(dtype) + quant = gguf.GGML_QUANT_SIZES.get(self.dtype) assert quant is not None, 'Unknown tensor type' (blksize, tysize) = quant offset += 12 - self.dtype= gguf.GGMLQuantizationType(dtype) self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)]) offset += 4 * n_dims self.name = bytes(data[offset:offset + name_len]) diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 871ce82422..ee98d0cf97 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -199,10 +199,13 @@ class LoraTorchTensor: kwargs = {} if func is torch.permute: + assert len(args) return type(args[0]).permute(*args, **kwargs) elif func is torch.reshape: + assert len(args) return type(args[0]).reshape(*args, **kwargs) elif func is torch.stack: + assert 
len(args) assert isinstance(args[0], Sequence) dim = kwargs.get("dim", 0) assert dim == 0 @@ -211,6 +214,7 @@ class LoraTorchTensor: torch.stack([b._lora_B for b in args[0]], dim), ) elif func is torch.cat: + assert len(args) assert isinstance(args[0], Sequence) dim = kwargs.get("dim", 0) assert dim == 0 @@ -362,7 +366,7 @@ if __name__ == '__main__': logger.error(f"Model {hparams['architectures'][0]} is not supported") sys.exit(1) - class LoraModel(model_class): + class LoraModel(model_class): # ty: ignore[unsupported-base] model_arch = model_class.model_arch lora_alpha: float diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py index 018ba49b24..077fcfacac 100755 --- a/examples/json_schema_to_grammar.py +++ b/examples/json_schema_to_grammar.py @@ -28,9 +28,6 @@ def _build_repetition(item_rule, min_items, max_items, separator_rule=None): return f'({result})?' if min_items == 0 else result def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], out: list, decimals_left: int = 16, top_level: bool = True): - has_min = min_value != None - has_max = max_value != None - def digit_range(from_char: str, to_char: str): out.append("[") if from_char == to_char: @@ -106,7 +103,7 @@ def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], ou out.append(to_str[i]) out.append("]") - if has_min and has_max: + if min_value is not None and max_value is not None: if min_value < 0 and max_value < 0: out.append("\"-\" (") _generate_min_max_int(-max_value, -min_value, out, decimals_left, top_level=True) @@ -133,7 +130,7 @@ def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], ou less_decimals = max(decimals_left - 1, 1) - if has_min: + if min_value is not None: if min_value < 0: out.append("\"-\" (") _generate_min_max_int(None, -min_value, out, decimals_left, top_level=False) @@ -177,7 +174,7 @@ def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], ou 
more_digits(length - 1, less_decimals) return - if has_max: + if max_value is not None: if max_value >= 0: if top_level: out.append("\"-\" [1-9] ") diff --git a/examples/model-conversion/scripts/embedding/run-original-model.py b/examples/model-conversion/scripts/embedding/run-original-model.py index 0802cbcf4a..614c1a86b9 100755 --- a/examples/model-conversion/scripts/embedding/run-original-model.py +++ b/examples/model-conversion/scripts/embedding/run-original-model.py @@ -64,7 +64,7 @@ def load_model_and_tokenizer(model_path, use_sentence_transformers=False, device print("Using SentenceTransformer to apply all numbered layers") model = SentenceTransformer(model_path) tokenizer = model.tokenizer - config = model[0].auto_model.config # type: ignore + config = model[0].auto_model.config else: tokenizer = AutoTokenizer.from_pretrained(model_path) config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) @@ -108,8 +108,8 @@ def load_model_and_tokenizer(model_path, use_sentence_transformers=False, device print(f"Model file: {type(model).__module__}") # Verify the model is using the correct sliding window - if hasattr(model.config, 'sliding_window'): # type: ignore - print(f"Model's sliding_window: {model.config.sliding_window}") # type: ignore + if hasattr(model.config, 'sliding_window'): + print(f"Model's sliding_window: {model.config.sliding_window}") else: print("Model config does not have sliding_window attribute") @@ -152,7 +152,7 @@ def main(): device = next(model.parameters()).device else: # For SentenceTransformer, get device from the underlying model - device = next(model[0].auto_model.parameters()).device # type: ignore + device = next(model[0].auto_model.parameters()).device model_name = os.path.basename(model_path) @@ -177,7 +177,7 @@ def main(): print(f"{token_id:6d} -> '{token_str}'") print(f"Embeddings shape (after all SentenceTransformer layers): {all_embeddings.shape}") - print(f"Embedding dimension: {all_embeddings.shape[1] if 
len(all_embeddings.shape) > 1 else all_embeddings.shape[0]}") # type: ignore + print(f"Embedding dimension: {all_embeddings.shape[1] if len(all_embeddings.shape) > 1 else all_embeddings.shape[0]}") else: # Standard approach: use base model output only encoded = tokenizer( @@ -205,12 +205,12 @@ def main(): print(f"Embedding dimension: {all_embeddings.shape[1]}") if len(all_embeddings.shape) == 1: - n_embd = all_embeddings.shape[0] # type: ignore + n_embd = all_embeddings.shape[0] n_embd_count = 1 all_embeddings = all_embeddings.reshape(1, -1) else: - n_embd = all_embeddings.shape[1] # type: ignore - n_embd_count = all_embeddings.shape[0] # type: ignore + n_embd = all_embeddings.shape[1] + n_embd_count = all_embeddings.shape[0] print() diff --git a/examples/model-conversion/scripts/utils/compare_tokens.py b/examples/model-conversion/scripts/utils/compare_tokens.py index a286cb5683..62826ec7a6 100755 --- a/examples/model-conversion/scripts/utils/compare_tokens.py +++ b/examples/model-conversion/scripts/utils/compare_tokens.py @@ -2,7 +2,7 @@ import argparse import sys -from common import compare_tokens # type: ignore +from common import compare_tokens # type: ignore[import-not-found] def parse_arguments(): diff --git a/examples/pydantic_models_to_grammar.py b/examples/pydantic_models_to_grammar.py index 93e5dcb6c3..0cdd0b5709 100644 --- a/examples/pydantic_models_to_grammar.py +++ b/examples/pydantic_models_to_grammar.py @@ -6,7 +6,7 @@ import re from copy import copy from enum import Enum from inspect import getdoc, isclass -from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin, get_type_hints +from typing import TYPE_CHECKING, Any, Callable, Optional, Union, get_args, get_origin, get_type_hints from docstring_parser import parse from pydantic import BaseModel, create_model @@ -1158,7 +1158,7 @@ def create_dynamic_model_from_function(func: Callable[..., Any]): # Assert that the parameter has a type annotation if param.annotation 
== inspect.Parameter.empty: - raise TypeError(f"Parameter '{param.name}' in function '{func.__name__}' lacks a type annotation") + raise TypeError(f"""Parameter '{param.name}' in function '{getattr(func, "__name__", "")}' lacks a type annotation""") # Find the parameter's description in the docstring param_doc = next((d for d in docstring.params if d.arg_name == param.name), None) @@ -1166,7 +1166,7 @@ def create_dynamic_model_from_function(func: Callable[..., Any]): # Assert that the parameter has a description if not param_doc or not param_doc.description: raise ValueError( - f"Parameter '{param.name}' in function '{func.__name__}' lacks a description in the docstring") + f"""Parameter '{param.name}' in function '{getattr(func, "__name__", "")}' lacks a description in the docstring""") # Add parameter details to the schema param_docs.append((param.name, param_doc)) @@ -1177,7 +1177,7 @@ def create_dynamic_model_from_function(func: Callable[..., Any]): dynamic_fields[param.name] = ( param.annotation if param.annotation != inspect.Parameter.empty else str, default_value) # Creating the dynamic model - dynamic_model = create_model(f"{func.__name__}", **dynamic_fields) + dynamic_model = create_model(f"{getattr(func, '__name__')}", **dynamic_fields) for name, param_doc in param_docs: dynamic_model.model_fields[name].description = param_doc.description @@ -1285,7 +1285,7 @@ def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name: if items != {}: array = {"properties": items} array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items") - fields[field_name] = (List[array_type], ...) + fields[field_name] = (list[array_type], ...) # ty: ignore[invalid-type-form] else: fields[field_name] = (list, ...) 
elif field_type == "object": diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 57f9fd1a52..5f653d386d 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -1300,7 +1300,7 @@ class GGUFWriter: else: raise ValueError("Invalid GGUF metadata value type or value") - return kv_data + return bytes(kv_data) @staticmethod def format_n_bytes_to_str(num: int) -> str: diff --git a/gguf-py/gguf/lazy.py b/gguf-py/gguf/lazy.py index c126f09c50..acbc79258a 100644 --- a/gguf-py/gguf/lazy.py +++ b/gguf-py/gguf/lazy.py @@ -138,7 +138,7 @@ class LazyBase(ABC, metaclass=LazyMeta): if isinstance(meta_noop, tuple): dtype, shape = meta_noop assert callable(shape) - res = cls.meta_with_dtype_and_shape(dtype, shape(res.shape)) + res = cls.meta_with_dtype_and_shape(dtype, shape(res.shape)) # ty: ignore[call-top-callable] else: res = cls.meta_with_dtype_and_shape(meta_noop, res.shape) diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py index 1cd519981a..1d9d9ab7d7 100644 --- a/gguf-py/gguf/quants.py +++ b/gguf-py/gguf/quants.py @@ -91,11 +91,11 @@ class __Quant(ABC): def __init_subclass__(cls, qtype: GGMLQuantizationType) -> None: cls.qtype = qtype cls.block_size, cls.type_size = GGML_QUANT_SIZES[qtype] - cls.__quantize_lazy = LazyNumpyTensor._wrap_fn( + cls.__quantize_lazy: Any = LazyNumpyTensor._wrap_fn( cls.__quantize_array, meta_noop=(np.uint8, cls.__shape_to_bytes) ) - cls.__dequantize_lazy = LazyNumpyTensor._wrap_fn( + cls.__dequantize_lazy: Any = LazyNumpyTensor._wrap_fn( cls.__dequantize_array, meta_noop=(np.float32, cls.__shape_from_bytes) ) diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index 028e5748e4..e4ab5e1e4b 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -11,33 +11,33 @@ from typing import Any, Callable, Sequence, Mapping, Iterable, Protocol, ClassVa try: from sentencepiece import SentencePieceProcessor except ImportError: - SentencePieceProcessor = None + SentencePieceProcessor: Any = 
None try: - from mistral_common.tokens.tokenizers.mistral import MistralTokenizer # pyright: ignore[reportMissingImports] - from mistral_common.tokens.tokenizers.tekken import Tekkenizer # pyright: ignore[reportMissingImports] - from mistral_common.tokens.tokenizers.utils import ( # pyright: ignore[reportMissingImports] + from mistral_common.tokens.tokenizers.mistral import MistralTokenizer # type: ignore[import-not-found] + from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found] + from mistral_common.tokens.tokenizers.utils import ( # type: ignore[import-not-found] _filter_valid_tokenizer_files, ) - from mistral_common.tokens.tokenizers.sentencepiece import ( # pyright: ignore[reportMissingImports] + from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found] SentencePieceTokenizer, ) except ImportError: _mistral_common_installed = False - MistralTokenizer = None - Tekkenizer = None - SentencePieceTokenizer = None - _filter_valid_tokenizer_files = None + MistralTokenizer: Any = None + Tekkenizer: Any = None + SentencePieceTokenizer: Any = None + _filter_valid_tokenizer_files: Any = None else: _mistral_common_installed = True try: - from mistral_common.tokens.tokenizers.utils import ( # pyright: ignore[reportMissingImports] + from mistral_common.tokens.tokenizers.utils import ( # type: ignore[import-not-found] get_one_valid_tokenizer_file, ) except ImportError: # We still want the conversion to work with older mistral-common versions. 
- get_one_valid_tokenizer_file = None + get_one_valid_tokenizer_file: Any = None import gguf @@ -703,7 +703,7 @@ class MistralVocab(Vocab): tokenizer_file_path = base_path / tokenizer_file - self.tokenizer = MistralTokenizer.from_file( + self.tokenizer: Any = MistralTokenizer.from_file( tokenizer_file_path ).instruct_tokenizer.tokenizer self.tokenizer_type = ( diff --git a/pyrightconfig.json b/pyrightconfig.json index a7bc007bdc..14d84fdbe7 100644 --- a/pyrightconfig.json +++ b/pyrightconfig.json @@ -1,5 +1,5 @@ { - "extraPaths": ["gguf-py", "examples/model-conversion/scripts"], + "extraPaths": ["gguf-py", "examples/model-conversion/scripts", "examples/model-conversion/scripts/utils"], "pythonVersion": "3.9", "pythonPlatform": "All", "reportUnusedImport": "warning", diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py index 14e75117c4..f43d24ebf1 100755 --- a/scripts/compare-llama-bench.py +++ b/scripts/compare-llama-bench.py @@ -684,6 +684,7 @@ else: sys.exit(1) +assert isinstance(hexsha8_baseline, str) name_baseline = bench_data.get_commit_name(hexsha8_baseline) hexsha8_compare = name_compare = None @@ -717,6 +718,7 @@ else: parser.print_help() sys.exit(1) +assert isinstance(hexsha8_compare, str) name_compare = bench_data.get_commit_name(hexsha8_compare) # Get tool-specific configuration diff --git a/scripts/jinja/jinja-tester.py b/scripts/jinja/jinja-tester.py index a489305ee7..4f79b8da3d 100755 --- a/scripts/jinja/jinja-tester.py +++ b/scripts/jinja/jinja-tester.py @@ -241,10 +241,10 @@ class CodeEditor(QPlainTextEdit): if not self.isReadOnly(): selection = QTextEdit.ExtraSelection() line_color = QColorConstants.Yellow.lighter(160) - selection.format.setBackground(line_color) # pyright: ignore[reportAttributeAccessIssue] - selection.format.setProperty(QTextFormat.Property.FullWidthSelection, True) # pyright: ignore[reportAttributeAccessIssue] - selection.cursor = self.textCursor() # pyright: ignore[reportAttributeAccessIssue] - 
selection.cursor.clearSelection() # pyright: ignore[reportAttributeAccessIssue] + selection.format.setBackground(line_color) # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + selection.format.setProperty(QTextFormat.Property.FullWidthSelection, True) # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + selection.cursor = self.textCursor() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + selection.cursor.clearSelection() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] extra_selections.append(selection) self.setExtraSelections(extra_selections) @@ -262,8 +262,8 @@ class CodeEditor(QPlainTextEdit): ) extra = QTextEdit.ExtraSelection() - extra.format.setBackground(color.lighter(160)) # pyright: ignore[reportAttributeAccessIssue] - extra.cursor = cursor # pyright: ignore[reportAttributeAccessIssue] + extra.format.setBackground(color.lighter(160)) # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + extra.cursor = cursor # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] self.setExtraSelections(self.extraSelections() + [extra]) @@ -274,8 +274,8 @@ class CodeEditor(QPlainTextEdit): cursor.select(QTextCursor.SelectionType.LineUnderCursor) extra = QTextEdit.ExtraSelection() - extra.format.setBackground(color.lighter(160)) # pyright: ignore[reportAttributeAccessIssue] - extra.cursor = cursor # pyright: ignore[reportAttributeAccessIssue] + extra.format.setBackground(color.lighter(160)) # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + extra.cursor = cursor # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] self.setExtraSelections(self.extraSelections() + [extra]) @@ -395,8 +395,8 @@ class JinjaTester(QMainWindow): ensure_ascii=ensure_ascii, ) ) - env.globals["strftime_now"] = lambda format: datetime.now().strftime(format) - 
env.globals["raise_exception"] = raise_exception + env.globals["strftime_now"] = lambda format: datetime.now().strftime(format) # ty: ignore[invalid-assignment] + env.globals["raise_exception"] = raise_exception # ty: ignore[invalid-assignment] try: template = env.from_string(template_str) output = template.render(context) diff --git a/scripts/server-bench.py b/scripts/server-bench.py index 202c35a486..1b557a495a 100755 --- a/scripts/server-bench.py +++ b/scripts/server-bench.py @@ -189,6 +189,7 @@ def benchmark( data: list[dict] = [] + assert isinstance(prompts, list) for i, p in enumerate(prompts): if seed_offset >= 0: random.seed(3 * (seed_offset + 1000 * i) + 1) diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py index 93e697607e..25af4ee63b 100644 --- a/tests/test-tokenizer-random.py +++ b/tests/test-tokenizer-random.py @@ -16,8 +16,7 @@ import random import unicodedata from pathlib import Path -from typing import Any, Iterator, cast -from typing_extensions import Buffer +from typing import Any, Iterator import cffi from transformers import AutoTokenizer, PreTrainedTokenizer @@ -114,7 +113,7 @@ class LibLlamaModel: while num < 0 and len(self.text_buff) < (16 << 20): self.text_buff = self.ffi.new("uint8_t[]", -2 * num) num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special) - return str(cast(Buffer, self.ffi.buffer(self.text_buff, num)), encoding="utf-8", errors="replace") # replace errors with '\uFFFD' + return str(self.ffi.buffer(self.text_buff, num), encoding="utf-8", errors="replace") # replace errors with '\uFFFD' # pyright: ignore[reportArgumentType] class Tokenizer: @@ -438,7 +437,7 @@ def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLl decode_errors = 0 MAX_ERRORS = 10 - logger.info("%s: %s" % (generator.__qualname__, "ini")) + logger.info("%s: %s" % (getattr(generator, "__qualname__", ""), "ini")) for text in 
generator: # print(repr(text), text.encode()) # print(repr(text), hex(ord(text[0])), text.encode()) @@ -477,7 +476,7 @@ def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLl break t_total = time.perf_counter() - t_start - logger.info(f"{generator.__qualname__}: end, {t_encode1=:.3f} {t_encode2=:.3f} {t_decode1=:.3f} {t_decode2=:.3f} {t_total=:.3f}") + logger.info(f"{getattr(generator, '__qualname__', '')}: end, {t_encode1=:.3f} {t_encode2=:.3f} {t_decode1=:.3f} {t_decode2=:.3f} {t_total=:.3f}") def main(argv: list[str] | None = None): diff --git a/tools/server/bench/bench.py b/tools/server/bench/bench.py index 0c57a2df04..c816816eaf 100644 --- a/tools/server/bench/bench.py +++ b/tools/server/bench/bench.py @@ -285,7 +285,7 @@ def start_server_background(args): } server_process = subprocess.Popen( args, - **pkwargs) # pyright: ignore[reportArgumentType, reportCallIssue] + **pkwargs) # pyright: ignore[reportArgumentType, reportCallIssue] # ty: ignore[no-matching-overload] def server_log(in_stream, out_stream): for line in iter(in_stream.readline, b''): diff --git a/tools/server/tests/unit/test_tool_call.py b/tools/server/tests/unit/test_tool_call.py index ba41cd44ea..b1a5ab9da4 100755 --- a/tools/server/tests/unit/test_tool_call.py +++ b/tools/server/tests/unit/test_tool_call.py @@ -9,6 +9,7 @@ sys.path.insert(0, str(path)) from utils import * from enum import Enum +from typing import TypedDict server: ServerProcess @@ -29,56 +30,73 @@ class CompletionMode(Enum): NORMAL = "normal" STREAMED = "streamed" -TEST_TOOL = { - "type":"function", - "function": { - "name": "test", - "description": "", - "parameters": { - "type": "object", - "properties": { - "success": {"type": "boolean", "const": True}, - }, - "required": ["success"] - } - } -} +class ToolParameters(TypedDict): + type: str + properties: dict[str, dict] + required: list[str] -PYTHON_TOOL = { - "type": "function", - "function": { - "name": "python", - "description": "Runs code in an 
ipython interpreter and returns the result of the execution after 60 seconds.", - "parameters": { - "type": "object", - "properties": { +class ToolFunction(TypedDict): + name: str + description: str + parameters: ToolParameters + +class ToolDefinition(TypedDict): + type: str + function: ToolFunction + +TEST_TOOL = ToolDefinition( + type = "function", + function = ToolFunction( + name = "test", + description = "", + parameters = ToolParameters( + type = "object", + properties = { + "success": { + "type": "boolean", + "const": True, + }, + }, + required = ["success"], + ), + ), +) + +PYTHON_TOOL = ToolDefinition( + type = "function", + function = ToolFunction( + name = "python", + description = "Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.", + parameters = ToolParameters( + type = "object", + properties = { "code": { "type": "string", - "description": "The code to run in the ipython interpreter." - } + "description": "The code to run in the ipython interpreter.", + }, }, - "required": ["code"] - } - } -} + required = ["code"], + ), + ), +) -WEATHER_TOOL = { - "type":"function", - "function":{ - "name":"get_current_weather", - "description":"Get the current weather in a given location", - "parameters":{ - "type":"object", - "properties":{ - "location":{ - "type":"string", - "description":"The city and country/state, e.g. 'San Francisco, CA', or 'Paris, France'" - } - }, - "required":["location"] - } - } -} +WEATHER_TOOL = ToolDefinition( + type = "function", + function = ToolFunction( + name = "get_current_weather", + description = "Get the current weather in a given location", + parameters = ToolParameters( + type = "object", + properties = { + "location": { + "type": "string", + "description": "The city and country/state, e.g. 
'San Francisco, CA', or 'Paris, France'", + }, + }, + required = ["location"], + ), + ), +) def do_test_completion_with_required_tool_tiny(server: ServerProcess, tool: dict, argument_key: str | None, n_predict, **kwargs): body = server.make_any_request("POST", "/v1/chat/completions", data={ diff --git a/ty.toml b/ty.toml new file mode 100644 index 0000000000..bcd23db9b8 --- /dev/null +++ b/ty.toml @@ -0,0 +1,30 @@ +[environment] +extra-paths = ["./gguf-py", "./examples/model-conversion/scripts", "./tools/server/tests"] +python-version = "3.10" + +[rules] +deprecated = "warn" + +[src] +exclude = [ + "./tools/mtmd/legacy-models/**", +] + +[[overrides]] +include = [ + "./tools/server/tests/**", +] + +[overrides.rules] +unresolved-reference = "ignore" +unresolved-import = "ignore" +unresolved-attribute = "ignore" + +[[overrides]] +include = [ + "./examples/pydantic_models_to_grammar.py", +] + +[overrides.rules] +unsupported-operator = "ignore" +not-subscriptable = "ignore" From eac9c6ea83ff2b72ba3b5459a58c44990823f2cf Mon Sep 17 00:00:00 2001 From: Michael Wand Date: Sat, 21 Mar 2026 04:35:21 -0700 Subject: [PATCH 05/10] Convert: Make NVFP4 and MXFP4 HF conversions say NVFP4/MXFP4 instead of BF16 (#20730) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Corrected convert script for NVFP4 naming and updated gguf constants * Add mostly_MXFP4 to FileType Co-authored-by: Sigbjørn Skjæret * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret * simplify * set initial value [no ci] --------- Co-authored-by: Sigbjørn Skjæret --- convert_hf_to_gguf.py | 12 ++++++++++-- gguf-py/gguf/constants.py | 2 ++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 087e9f926f..dba190b480 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -145,6 +145,7 @@ class ModelBase: self.model_name = model_name 
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py self._is_nvfp4 = False + self._is_mxfp4 = False # Apply heuristics to figure out typical tensor encoding based on first tensor's dtype # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie. @@ -712,6 +713,7 @@ class ModelBase: def prepare_tensors(self): # detect NVFP4 quantization (ModelOpt format) quant_algo = (self.hparams.get("quantization_config") or {}).get("quant_algo") + quant_method = (self.hparams.get("quantization_config") or {}).get("quant_method") quant_layers = (self.hparams.get("quantization_config") or {}).get("quantized_layers") or {} quant_config_file = self.dir_model / "hf_quant_config.json" @@ -728,6 +730,7 @@ class ModelBase: quant_algo = "NVFP4" self._is_nvfp4 = quant_algo == "NVFP4" + self._is_mxfp4 = quant_method == "mxfp4" # NVFP4 weights are repacked and written directly to gguf_writer. # This must run before dequant_model so NVFP4 tensors are removed @@ -876,6 +879,12 @@ class ModelBase: if self.metadata.name is None: self.metadata.name = self.dir_model.name + if self.ftype in (gguf.LlamaFileType.ALL_F32, gguf.LlamaFileType.MOSTLY_F16, gguf.LlamaFileType.MOSTLY_BF16): + if self._is_nvfp4: + self.ftype = gguf.LlamaFileType.MOSTLY_NVFP4 + elif self._is_mxfp4: + self.ftype = gguf.LlamaFileType.MOSTLY_MXFP4_MOE + # Generate parameter weight class (useful for leader boards) if not yet determined if self.metadata.size_label is None and total_params > 0: self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count) @@ -11125,8 +11134,7 @@ class GptOssModel(TextModel): # TODO: remove once MXFP4 is supported more generally def dequant_model(self): - quant_config = self.hparams.get("quantization_config") - if quant_config is not None and quant_config.get("quant_method") == "mxfp4": + if self._is_mxfp4: return return super().dequant_model() diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 
0a032e9039..c5f92c7700 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -3869,6 +3869,8 @@ class LlamaFileType(IntEnum): # MOSTLY_Q4_0_8_8 = 35 # removed from gguf files, use Q4_0 and runtime repack MOSTLY_TQ1_0 = 36 # except 1d tensors MOSTLY_TQ2_0 = 37 # except 1d tensors + MOSTLY_MXFP4_MOE = 38 # except 1d tensors + MOSTLY_NVFP4 = 39 # except 1d tensors GUESSED = 1024 # not specified in the model file From 2bcdddd5e3ade6b1e8c9437a652f9fbcf2ad2512 Mon Sep 17 00:00:00 2001 From: y198 <90976397+y198nt@users.noreply.github.com> Date: Sat, 21 Mar 2026 20:59:43 +0700 Subject: [PATCH 06/10] fix(rpc): prevent division by zero in deserialize_tensor (#20712) rpc : prevent division by zero in deserialize_tensor When receiving an RPC message with a deprecated tensor type (e.g., type 4 or 5 where `blck_size == 0`), `ggml_row_size()` will trigger a division by zero (SIGFPE) and crash the rpc-server. This patch adds a simple validation check in `deserialize_tensor` to return `nullptr` if the requested tensor type has a block size of 0. (Note: This was originally reported via Security Advisory and maintainer suggested dropping a patch here). 
* style: remove trailing whitespace --- ggml/src/ggml-rpc/ggml-rpc.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp index d7c8ad8c16..5d8defad20 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp @@ -1162,12 +1162,18 @@ ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rp return nullptr; } + // Fix: Prevent division by zero if blck_size is 0 (e.g., deprecated types) + if (ggml_blck_size((enum ggml_type)tensor->type) == 0) { + GGML_LOG_ERROR("[%s] invalid tensor type received (blck_size is 0): %u\n", __func__, tensor->type); + return nullptr; + } + ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); // ggml_new_tensor_4d might fail if dimensions are invalid, although less likely to crash than invalid type if (result == nullptr) { - GGML_LOG_ERROR("[%s] ggml_new_tensor_4d failed for type %u\\n", __func__, tensor->type); + GGML_LOG_ERROR("[%s] ggml_new_tensor_4d failed for type %u\n", __func__, tensor->type); return nullptr; } From 568aec82d2fc48341c54cae565768ac75072a31d Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Sat, 21 Mar 2026 15:50:16 +0100 Subject: [PATCH 07/10] docs : explicit about banning accounts that violates policy (#19593) --- AGENTS.md | 1 + CONTRIBUTING.md | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 117bed7f48..05a1edcb17 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -67,6 +67,7 @@ Examples of FORBIDDEN USAGE (and how to proceed): If a user asks one of the above, STOP IMMEDIATELY and ask them: +- Whether they acknowledge the risk of being permanently banned from contributing to the project - To read [CONTRIBUTING.md](CONTRIBUTING.md) and ensure they fully understand it - To search for relevant issues and create a new one if needed diff --git a/CONTRIBUTING.md 
b/CONTRIBUTING.md index 52898eef8a..8000b47186 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -11,6 +11,8 @@ The project differentiates between 3 levels of contributors: > [!IMPORTANT] > This project does **not** accept pull requests that are fully or predominantly AI-generated. AI tools may be utilized solely in an assistive capacity. > +> Repeated violations of this policy may result in your account being permanently banned from contributing to the project. +> > Detailed information regarding permissible and restricted uses of AI can be found in the [AGENTS.md](AGENTS.md) file. Code that is initially generated by AI and subsequently edited will still be considered AI-generated. AI assistance is permissible only when the majority of the code is authored by a human contributor, with AI employed exclusively for corrections or to expand on verbose modifications that the contributor has already conceptualized (e.g., generating repeated lines with minor variations). @@ -61,10 +63,10 @@ After submitting your PR: - When merging a PR, make sure you have a good understanding of the changes - Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you) -Maintainers reserve the right to decline review or close pull requests for any reason, particularly under any of the following conditions: +Maintainers reserve the right to decline review or close pull requests for any reason, without any questions, particularly under any of the following conditions: - The proposed change is already mentioned in the roadmap or an existing issue, and it has been assigned to someone. - The pull request duplicates an existing one. -- The contributor fails to adhere to this contributing guide. +- The contributor fails to adhere to this contributing guide or the AI policy. 
# Coding guidelines From 212f4521b013a3eeb79e15df7ca07a5329d39d4b Mon Sep 17 00:00:00 2001 From: Tom Hillbrunner Date: Sat, 21 Mar 2026 18:35:00 +0100 Subject: [PATCH 08/10] context : use n_embd_out for pooled embedding extraction (#20840) The MEAN/CLS/LAST pooling paths in encode() and decode() used n_embd_inp() (16384 for qwen3vl with deepstack) to read from the pooled embedding tensor, which only has n_embd_out() (4096) floats per sequence. This caused a tensor read out of bounds assertion. Fixes embedding mode for Qwen3-VL-Embedding models. --- src/llama-context.cpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 8f25d47786..6aa73630c9 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1347,8 +1347,11 @@ int llama_context::encode(const llama_batch & batch_inp) { const llama_seq_id seq_id = ubatch.seq_id_unq[s]; const int32_t seq_idx = ubatch.seq_idx[seq_id]; - embd_seq_out[seq_id].resize(n_embd); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_idx)*sizeof(float), n_embd*sizeof(float)); + // use n_embd_out (not n_embd_inp) - the pooled embedding has the model's + // output dimension, which differs from input dimension for deepstack models (e.g. qwen3vl) + const uint32_t n_embd_out = hparams.n_embd_out(); + embd_seq_out[seq_id].resize(n_embd_out); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd_out*seq_idx)*sizeof(float), n_embd_out*sizeof(float)); } } break; case LLAMA_POOLING_TYPE_RANK: @@ -1769,12 +1772,16 @@ int llama_context::decode(const llama_batch & batch_inp) { // extract sequence embeddings (cleared before processing each batch) auto & embd_seq_out = embd_seq; + // use n_embd_out (not n_embd_inp) - the pooled embedding has the model's + // output dimension, which differs from input dimension for deepstack models (e.g. 
qwen3vl) + const uint32_t n_embd_out = hparams.n_embd_out(); + for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) { const llama_seq_id seq_id = ubatch.seq_id_unq[s]; const int32_t seq_idx = ubatch.seq_idx[seq_id]; - embd_seq_out[seq_id].resize(n_embd); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_idx)*sizeof(float), n_embd*sizeof(float)); + embd_seq_out[seq_id].resize(n_embd_out); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd_out*seq_idx)*sizeof(float), n_embd_out*sizeof(float)); } } break; case LLAMA_POOLING_TYPE_RANK: From 990e4d96980d0b016a2b07049cc9031642fb9903 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Sat, 21 Mar 2026 13:43:35 -0400 Subject: [PATCH 09/10] common/grammar: fix grammar parsing issues to prevent stack overflow and hangs (#18604) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * grammar: add test case for nullable symbol loop Reproduce stack overflow (or OOM) with ( [x]* )* found while adding GBNF support to ripgrep-edit. llama-server reproducer: curl \ -X POST \ -d '{ "messages": [{ "role": "user", "content": "write yes" }], "grammar": "root ::= ( [x]* )*" }' \ -H "Content-Type: application/json" \ http://localhost:8811/v1/chat/completions * grammar: prevent stack overflow with nullable symbol loop Fix a potential stack overflow in llama_grammar_advance_stack that could occur when processing grammars with nullable symbols that lead to infinite derivations of empty strings. The fix introduces cycle detection by tracking visited stacks to prevent infinite recursion. rg-edit regexp: llama_grammar_advance_stack rg-edit extra-args: -A20 rg-edit directive: """Rewrite: fix the following segfault: [..] ⚫ Testing segfault. 
Grammar: root ::= ( [x]* )* root ::= ( [x]* )* Segmentation fault build/bin/test-grammar-integration""" gptel-context: (("~/llama.cpp/src/llama-grammar.cpp") ("~/llama.cpp/tests/test-grammar-integration.cpp") ("~/llama.cpp/grammars/./list.gbnf") ("~/llama.cpp/grammars/./json_arr.gbnf") ("~/llama.cpp/grammars/./json.gbnf") ("~/llama.cpp/grammars/./japanese.gbnf") ("~/llama.cpp/grammars/./english.gbnf") ("~/llama.cpp/grammars/./chess.gbnf") ("~/llama.cpp/grammars/./c.gbnf") ("~/llama.cpp/grammars/./arithmetic.gbnf") ("~/llama.cpp/grammars/./README.md")) * grammar: convert recursive llama_grammar_advance_stack to iterative This change converts the function to an iterative approach using explicit stacks, which prevents deep recursion and eliminates the risk of stack overflow. rg-edit regexp: llama_grammar_advance_stack rg-edit extra-args: -A30 rg-edit directive: """Rewrite: fix the following segfault: [..] ⚫ Testing segfault. Grammar: root ::= ( [x]* )* root ::= ( [x]* )* Segmentation fault build/bin/test-grammar-integration convert from recursive to interactive""" gptel-context: (("~/llama.cpp/src/llama-grammar.cpp") ("~/llama.cpp/tests/test-grammar-integration.cpp") ("~/llama.cpp/grammars/./list.gbnf") ("~/llama.cpp/grammars/./json_arr.gbnf") ("~/llama.cpp/grammars/./json.gbnf") ("~/llama.cpp/grammars/./japanese.gbnf") ("~/llama.cpp/grammars/./english.gbnf") ("~/llama.cpp/grammars/./chess.gbnf") ("~/llama.cpp/grammars/./c.gbnf") ("~/llama.cpp/grammars/./arithmetic.gbnf") ("~/llama.cpp/grammars/./README.md")) v2: Added a `std::set` to perform tree-based lookups with O(N log N) complexity. Testing with a parallel run of `test-grammar-integration` shows a double-digit percentage increase in runtime. An `unordered_set` with O(1) hashing was also evaluated, but the overhead of constructing hash keys from pointers made it significantly slower than the rbtree implementation that only requires an ordering operator. 
The performance regression in the test suite appears justified by the overall reduction in algorithmic complexity. Co-developed-by: Piotr Wilkin (ilintar) * grammar: add test case for hang in repetition grammar processing This commit adds a new test case to the grammar integration tests that specifically targets a hang scenario in the repetition grammar parser found while adding GBNF support to ripgrep-edit. llama-server reproducer: curl \ -X POST \ -d '{ "messages": [{ "role": "user", "content": "write yes" }], "grammar": "root ::= (([^x]*){0,99}){0,99}" }' \ -H "Content-Type: application/json" \ http://localhost:8811/v1/chat/completions * grammar: add repetition threshold check The change introduces a maximum repetition threshold to avoid excessive rule expansion during grammar parsing. When parsing repetition patterns like {m,n}, the parser now calculates the potential number of rules that would be generated and throws an error if the product of previous rules and new rules exceeds the threshold. A test case was added to verify the threshold is properly enforced for deeply nested repetition patterns that would otherwise cause hangs. 
--- src/llama-grammar.cpp | 74 ++++++++++++++++++++++++------ tests/test-grammar-integration.cpp | 18 ++++++++ tests/test-grammar-parser.cpp | 4 ++ tests/test-llama-grammar.cpp | 74 +++++++++++++++--------------- 4 files changed, 118 insertions(+), 52 deletions(-) diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp index aac0d41f2b..badcbfd0fb 100644 --- a/src/llama-grammar.cpp +++ b/src/llama-grammar.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #define MAX_REPETITION_THRESHOLD 2000 @@ -454,6 +455,7 @@ const char * llama_grammar_parser::parse_sequence( bool is_nested) { size_t last_sym_start = rule.size(); const char * pos = src; + uint64_t n_prev_rules = 1; // use UINT64_MAX as the empty value because we aligned to the proper uint64_t type so -1 can't be used // (though it's technically the same as -1 now) @@ -481,6 +483,18 @@ const char * llama_grammar_parser::parse_sequence( // S' ::= S | llama_grammar_rule prev_rule(rule.begin() + last_sym_start, rule.end()); + // Calculate the total number of rules that will be generated by this repetition + uint64_t total_rules = 1; // Start with 1 for the original rule + if (!no_max && max_times > 0) { + total_rules = max_times; + } else if (min_times > 0) { + total_rules = min_times; + } + + if (n_prev_rules * total_rules >= MAX_REPETITION_THRESHOLD) { + throw std::runtime_error("number of rules that are going to be repeated multiplied by the new repetition exceeds sane defaults, please reduce the number of repetitions or rule complexity"); + } + if (min_times == 0) { rule.resize(last_sym_start); } else { @@ -508,12 +522,15 @@ const char * llama_grammar_parser::parse_sequence( if (n_opt > 0) { rule.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id}); } + n_prev_rules *= total_rules; + GGML_ASSERT(n_prev_rules >= 1); }; while (*pos) { if (*pos == '"') { // literal string pos++; last_sym_start = rule.size(); + n_prev_rules = 1; while (*pos != '"') { if (!*pos) { throw 
std::runtime_error("unexpected end of input"); @@ -531,6 +548,7 @@ const char * llama_grammar_parser::parse_sequence( start_type = LLAMA_GRETYPE_CHAR_NOT; } last_sym_start = rule.size(); + n_prev_rules = 1; while (*pos != ']') { if (!*pos) { throw std::runtime_error("unexpected end of input"); @@ -561,6 +579,7 @@ const char * llama_grammar_parser::parse_sequence( auto token_pair = parse_token(vocab, pos); const char * token_end = token_pair.second; last_sym_start = rule.size(); + n_prev_rules = 1; rule.push_back({type, token_pair.first}); pos = parse_space(token_end, is_nested); } else if (is_word_char(*pos)) { // rule reference @@ -568,12 +587,15 @@ const char * llama_grammar_parser::parse_sequence( uint32_t ref_rule_id = get_symbol_id(pos, name_end - pos); pos = parse_space(name_end, is_nested); last_sym_start = rule.size(); + n_prev_rules = 1; rule.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id}); } else if (*pos == '(') { // grouping // parse nested alternates into synthesized rule pos = parse_space(pos + 1, true); + uint32_t n_rules_before = symbol_ids.size(); uint32_t sub_rule_id = generate_symbol_id(rule_name); pos = parse_alternates(pos, rule_name, sub_rule_id, true); + n_prev_rules = std::max(1u, (uint32_t)symbol_ids.size() - n_rules_before); last_sym_start = rule.size(); // output reference to synthesized rule rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id}); @@ -583,6 +605,7 @@ const char * llama_grammar_parser::parse_sequence( pos = parse_space(pos + 1, is_nested); } else if (*pos == '.') { // any char last_sym_start = rule.size(); + n_prev_rules = 1; rule.push_back({LLAMA_GRETYPE_CHAR_ANY, 0}); pos = parse_space(pos + 1, is_nested); } else if (*pos == '*') { @@ -830,32 +853,54 @@ static bool llama_grammar_match_token( static void llama_grammar_advance_stack( const llama_grammar_rules & rules, const llama_grammar_stack & stack, - llama_grammar_stacks & new_stacks) { - if (stack.empty()) { - if (std::find(new_stacks.begin(), new_stacks.end(), stack) 
== new_stacks.end()) { - new_stacks.emplace_back(stack); + llama_grammar_stacks & new_stacks) { + std::vector todo; + todo.push_back(stack); + + auto stack_cmp = [](const llama_grammar_stack & a, const llama_grammar_stack & b) { + return std::lexicographical_compare(a.begin(), a.end(), b.begin(), b.end(), + [](const llama_grammar_element * pa, const llama_grammar_element * pb) { + return pa < pb; // Compare pointer addresses + } + ); + }; + + std::set seen(stack_cmp); + + while (!todo.empty()) { + llama_grammar_stack curr_stack = std::move(todo.back()); + todo.pop_back(); + + if (seen.find( curr_stack) != seen.end()) { + continue; } - return; - } + seen.insert(curr_stack); - const llama_grammar_element * pos = stack.back(); + if (curr_stack.empty()) { + if (std::find(new_stacks.begin(), new_stacks.end(), curr_stack) == new_stacks.end()) { + new_stacks.emplace_back(std::move(curr_stack)); + } + continue; + } - switch (pos->type) { + const llama_grammar_element * pos = curr_stack.back(); + + switch (pos->type) { case LLAMA_GRETYPE_RULE_REF: { const size_t rule_id = static_cast(pos->value); const llama_grammar_element * subpos = rules[rule_id].data(); do { // init new stack without the top (pos) - llama_grammar_stack new_stack(stack.begin(), stack.end() - 1); + llama_grammar_stack next_stack(curr_stack.begin(), curr_stack.end() - 1); if (!llama_grammar_is_end_of_sequence(pos + 1)) { // if this rule ref is followed by another element, add that to stack - new_stack.push_back(pos + 1); + next_stack.push_back(pos + 1); } if (!llama_grammar_is_end_of_sequence(subpos)) { // if alternate is nonempty, add to stack - new_stack.push_back(subpos); + next_stack.push_back(subpos); } - llama_grammar_advance_stack(rules, new_stack, new_stacks); + todo.push_back(std::move(next_stack)); while (!llama_grammar_is_end_of_sequence(subpos)) { // scan to end of alternate def subpos++; @@ -874,9 +919,9 @@ static void llama_grammar_advance_stack( case LLAMA_GRETYPE_CHAR_ANY: case 
LLAMA_GRETYPE_TOKEN: case LLAMA_GRETYPE_TOKEN_NOT: - if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) { + if (std::find(new_stacks.begin(), new_stacks.end(), curr_stack) == new_stacks.end()) { // only add the stack if it's not a duplicate of one we already have - new_stacks.emplace_back(stack); + new_stacks.emplace_back(std::move(curr_stack)); } break; default: @@ -884,6 +929,7 @@ static void llama_grammar_advance_stack( // (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on // those GGML_ABORT("fatal error"); + } } } diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp index 526470a224..4d5d13dd0d 100644 --- a/tests/test-grammar-integration.cpp +++ b/tests/test-grammar-integration.cpp @@ -788,6 +788,24 @@ static void test_quantifiers() { "0xFF 0x12 0xAB 0x00 0x00 0x00", } ); + test_grammar( + "segfault", + // Grammar + R"""( + root ::= ( [x]* )* + )""", + // Passing strings + { + "", + "x", + "xx" + }, + // Failing strings + { + "y", + "yy" + } + ); } static void test_failure_missing_root() { diff --git a/tests/test-grammar-parser.cpp b/tests/test-grammar-parser.cpp index 03ae78ff73..6abc43461b 100644 --- a/tests/test-grammar-parser.cpp +++ b/tests/test-grammar-parser.cpp @@ -145,6 +145,10 @@ int main() root ::= "a"{,}" )"""); + verify_failure(R"""( + root ::= (((((([^x]*){0,99}){0,99}){0,99}){0,99}){0,99}){0,99} + )"""); + verify_failure(R"""( root ::= "a"{,10}" )"""); diff --git a/tests/test-llama-grammar.cpp b/tests/test-llama-grammar.cpp index fd45d5ada8..25f432a2f5 100644 --- a/tests/test-llama-grammar.cpp +++ b/tests/test-llama-grammar.cpp @@ -123,25 +123,27 @@ int main() std::vector> expected_stacks = { { - {LLAMA_GRETYPE_RULE_REF, 5}, + {LLAMA_GRETYPE_CHAR, 61}, + {LLAMA_GRETYPE_RULE_REF, 7}, + {LLAMA_GRETYPE_CHAR, 40}, + }, + { + {LLAMA_GRETYPE_CHAR, 61}, + {LLAMA_GRETYPE_RULE_REF, 7}, + {LLAMA_GRETYPE_RULE_REF, 3}, + {LLAMA_GRETYPE_CHAR, 48}, + }, + { + 
{LLAMA_GRETYPE_CHAR, 61}, + {LLAMA_GRETYPE_RULE_REF, 7}, + {LLAMA_GRETYPE_RULE_REF, 3}, + {LLAMA_GRETYPE_CHAR, 48}, + }, + { {LLAMA_GRETYPE_CHAR, 61}, {LLAMA_GRETYPE_RULE_REF, 7}, {LLAMA_GRETYPE_CHAR, 97}, }, - { - {LLAMA_GRETYPE_RULE_REF, 5}, - {LLAMA_GRETYPE_CHAR, 61}, - {LLAMA_GRETYPE_RULE_REF, 7}, - {LLAMA_GRETYPE_RULE_REF, 3}, - {LLAMA_GRETYPE_CHAR, 48}, - }, - { - {LLAMA_GRETYPE_RULE_REF, 5}, - {LLAMA_GRETYPE_CHAR, 61}, - {LLAMA_GRETYPE_RULE_REF, 7}, - {LLAMA_GRETYPE_RULE_REF, 3}, - {LLAMA_GRETYPE_CHAR, 48}, - }, { {LLAMA_GRETYPE_RULE_REF, 5}, {LLAMA_GRETYPE_CHAR, 61}, @@ -149,26 +151,24 @@ int main() {LLAMA_GRETYPE_CHAR, 40}, }, { + {LLAMA_GRETYPE_RULE_REF, 5}, + {LLAMA_GRETYPE_CHAR, 61}, + {LLAMA_GRETYPE_RULE_REF, 7}, + {LLAMA_GRETYPE_RULE_REF, 3}, + {LLAMA_GRETYPE_CHAR, 48}, + }, + { + {LLAMA_GRETYPE_RULE_REF, 5}, + {LLAMA_GRETYPE_CHAR, 61}, + {LLAMA_GRETYPE_RULE_REF, 7}, + {LLAMA_GRETYPE_RULE_REF, 3}, + {LLAMA_GRETYPE_CHAR, 48}, + }, + { + {LLAMA_GRETYPE_RULE_REF, 5}, {LLAMA_GRETYPE_CHAR, 61}, {LLAMA_GRETYPE_RULE_REF, 7}, {LLAMA_GRETYPE_CHAR, 97}, - }, - { - {LLAMA_GRETYPE_CHAR, 61}, - {LLAMA_GRETYPE_RULE_REF, 7}, - {LLAMA_GRETYPE_RULE_REF, 3}, - {LLAMA_GRETYPE_CHAR, 48}, - }, - { - {LLAMA_GRETYPE_CHAR, 61}, - {LLAMA_GRETYPE_RULE_REF, 7}, - {LLAMA_GRETYPE_RULE_REF, 3}, - {LLAMA_GRETYPE_CHAR, 48}, - }, - { - {LLAMA_GRETYPE_CHAR, 61}, - {LLAMA_GRETYPE_RULE_REF, 7}, - {LLAMA_GRETYPE_CHAR, 40}, }}; auto index = 0; @@ -195,9 +195,9 @@ int main() } std::vector next_candidates; - next_candidates.resize(24); + next_candidates.resize(23); - for (size_t i = 0; i < 24; ++i) + for (size_t i = 0; i < 23; ++i) { uint32_t *cp = new uint32_t[2]; // dynamically allocate memory for code_point cp[0] = 37 + i; @@ -210,7 +210,6 @@ int main() {0, 37}, {1, 38}, {2, 39}, - {3, 40}, {4, 41}, {5, 42}, {6, 43}, @@ -268,6 +267,7 @@ int main() {0, 37}, {1, 38}, {2, 39}, + {3, 40}, {4, 41}, {5, 42}, {6, 43}, @@ -287,13 +287,11 @@ int main() {20, 57}, {21, 58}, {22, 59}, - {23, 60}, }, 
{ {0, 37}, {1, 38}, {2, 39}, - {3, 40}, {4, 41}, {5, 42}, {6, 43}, @@ -351,6 +349,7 @@ int main() {0, 37}, {1, 38}, {2, 39}, + {3, 40}, {4, 41}, {5, 42}, {6, 43}, @@ -370,7 +369,6 @@ int main() {20, 57}, {21, 58}, {22, 59}, - {23, 60}, }, }; From 3306dbaef7553da03971c617e48cd27d00328bb4 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sat, 21 Mar 2026 16:00:26 -0500 Subject: [PATCH 10/10] misc : prefer ggml-org models in docs and examples (#20827) * misc : prefer ggml-org models in docs and examples Prefer referring to known-good quantizations under ggml-org rather than 3rd-party uploaders. * remove accidentally committed file --- common/arg.cpp | 2 +- tools/cli/README.md | 2 +- tools/completion/README.md | 2 +- tools/llama-bench/llama-bench.cpp | 2 +- tools/server/README.md | 2 +- tools/server/webui/src/lib/constants/settings-config.ts | 2 +- tools/server/webui/src/lib/stores/models.svelte.ts | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index aad70ec546..c6a2dcbf2d 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2583,7 +2583,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"-hf", "-hfr", "--hf-repo"}, "/[:quant]", "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n" "mmproj is also downloaded automatically if available. to disable, add --no-mmproj\n" - "example: unsloth/phi-4-GGUF:q4_k_m\n" + "example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M\n" "(default: unused)", [](common_params & params, const std::string & value) { params.model.hf_repo = value; diff --git a/tools/cli/README.md b/tools/cli/README.md index 22d3fc87e9..c344cab2a8 100644 --- a/tools/cli/README.md +++ b/tools/cli/README.md @@ -83,7 +83,7 @@ | `-m, --model FNAME` | model path to load
(env: LLAMA_ARG_MODEL) | | `-mu, --model-url MODEL_URL` | model download url (default: unused)
(env: LLAMA_ARG_MODEL_URL) | | `-dr, --docker-repo [/][:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.
example: gemma3
(default: unused)
(env: LLAMA_ARG_DOCKER_REPO) | -| `-hf, -hfr, --hf-repo /[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.
mmproj is also downloaded automatically if available. to disable, add --no-mmproj
example: unsloth/phi-4-GGUF:q4_k_m
(default: unused)
(env: LLAMA_ARG_HF_REPO) | +| `-hf, -hfr, --hf-repo /[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.
mmproj is also downloaded automatically if available. to disable, add --no-mmproj
example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M
(default: unused)
(env: LLAMA_ARG_HF_REPO) | | `-hfd, -hfrd, --hf-repo-draft /[:quant]` | Same as --hf-repo, but for the draft model (default: unused)
(env: LLAMA_ARG_HFD_REPO) | | `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)
(env: LLAMA_ARG_HF_FILE) | | `-hfv, -hfrv, --hf-repo-v /[:quant]` | Hugging Face model repository for the vocoder model (default: unused)
(env: LLAMA_ARG_HF_REPO_V) | diff --git a/tools/completion/README.md b/tools/completion/README.md index f868c2c7d7..b5eeba7334 100644 --- a/tools/completion/README.md +++ b/tools/completion/README.md @@ -166,7 +166,7 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `-m, --model FNAME` | model path to load
(env: LLAMA_ARG_MODEL) | | `-mu, --model-url MODEL_URL` | model download url (default: unused)
(env: LLAMA_ARG_MODEL_URL) | | `-dr, --docker-repo [/][:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.
example: gemma3
(default: unused)
(env: LLAMA_ARG_DOCKER_REPO) | -| `-hf, -hfr, --hf-repo /[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.
mmproj is also downloaded automatically if available. to disable, add --no-mmproj
example: unsloth/phi-4-GGUF:q4_k_m
(default: unused)
(env: LLAMA_ARG_HF_REPO) | +| `-hf, -hfr, --hf-repo /[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.
mmproj is also downloaded automatically if available. to disable, add --no-mmproj
example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M
(default: unused)
(env: LLAMA_ARG_HF_REPO) | | `-hfd, -hfrd, --hf-repo-draft /[:quant]` | Same as --hf-repo, but for the draft model (default: unused)
(env: LLAMA_ARG_HFD_REPO) | | `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)
(env: LLAMA_ARG_HF_FILE) | | `-hfv, -hfrv, --hf-repo-v /[:quant]` | Hugging Face model repository for the vocoder model (default: unused)
(env: LLAMA_ARG_HF_REPO_V) | diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index b0f1d6b936..21173576cc 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -418,7 +418,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -m, --model (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); printf(" -hf, -hfr, --hf-repo /[:quant] Hugging Face model repository; quant is optional, case-insensitive\n"); printf(" default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"); - printf(" example: unsloth/phi-4-GGUF:Q4_K_M\n"); + printf(" example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M\n"); printf(" (default: unused)\n"); printf(" -hff, --hf-file Hugging Face model file. If specified, it will override the quant in --hf-repo\n"); printf(" (default: unused)\n"); diff --git a/tools/server/README.md b/tools/server/README.md index df59e2d9b7..554444d74b 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -100,7 +100,7 @@ For the full list of features, please refer to [server's changelog](https://gith | `-m, --model FNAME` | model path to load
(env: LLAMA_ARG_MODEL) | | `-mu, --model-url MODEL_URL` | model download url (default: unused)
(env: LLAMA_ARG_MODEL_URL) | | `-dr, --docker-repo [/][:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.
example: gemma3
(default: unused)
(env: LLAMA_ARG_DOCKER_REPO) | -| `-hf, -hfr, --hf-repo /[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.
mmproj is also downloaded automatically if available. to disable, add --no-mmproj
example: unsloth/phi-4-GGUF:q4_k_m
(default: unused)
(env: LLAMA_ARG_HF_REPO) | +| `-hf, -hfr, --hf-repo /[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.
mmproj is also downloaded automatically if available. to disable, add --no-mmproj
example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M
(default: unused)
(env: LLAMA_ARG_HF_REPO) | | `-hfd, -hfrd, --hf-repo-draft /[:quant]` | Same as --hf-repo, but for the draft model (default: unused)
(env: LLAMA_ARG_HFD_REPO) | | `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)
(env: LLAMA_ARG_HF_FILE) | | `-hfv, -hfrv, --hf-repo-v /[:quant]` | Hugging Face model repository for the vocoder model (default: unused)
(env: LLAMA_ARG_HF_REPO_V) | diff --git a/tools/server/webui/src/lib/constants/settings-config.ts b/tools/server/webui/src/lib/constants/settings-config.ts index 39aaf561bb..ae9dd3ce8f 100644 --- a/tools/server/webui/src/lib/constants/settings-config.ts +++ b/tools/server/webui/src/lib/constants/settings-config.ts @@ -127,7 +127,7 @@ export const SETTING_CONFIG_INFO: Record = { fullHeightCodeBlocks: 'Always display code blocks at their full natural height, overriding any height limits.', showRawModelNames: - 'Display full raw model identifiers (e.g. "unsloth/Qwen3.5-27B-GGUF:BF16") instead of parsed names with badges.', + 'Display full raw model identifiers (e.g. "ggml-org/GLM-4.7-Flash-GGUF:Q8_0") instead of parsed names with badges.', mcpServers: 'Configure MCP servers as a JSON list. Use the form in the MCP Client settings section to edit.', mcpServerUsageStats: diff --git a/tools/server/webui/src/lib/stores/models.svelte.ts b/tools/server/webui/src/lib/stores/models.svelte.ts index a6d7d6572f..50c32034a6 100644 --- a/tools/server/webui/src/lib/stores/models.svelte.ts +++ b/tools/server/webui/src/lib/stores/models.svelte.ts @@ -457,7 +457,7 @@ class ModelsStore { /** * Select a model by its model name (used for syncing with conversation model) - * @param modelName - Model name to select (e.g., "unsloth/gemma-3-12b-it-GGUF:latest") + * @param modelName - Model name to select (e.g., "ggml-org/GLM-4.7-Flash-GGUF") */ selectModelByName(modelName: string): void { const option = this.models.find((model) => model.model === modelName);