diff --git a/.github/workflows/python-type-check.yml b/.github/workflows/python-type-check.yml
index e801a9f42e..2c62678726 100644
--- a/.github/workflows/python-type-check.yml
+++ b/.github/workflows/python-type-check.yml
@@ -4,15 +4,17 @@ on:
   push:
     paths:
       - '.github/workflows/python-type-check.yml'
-      - 'pyrightconfig.json'
+      - 'ty.toml'
       - '**.py'
       - '**/requirements*.txt'
+      # - 'pyrightconfig.json'
   pull_request:
     paths:
       - '.github/workflows/python-type-check.yml'
-      - 'pyrightconfig.json'
+      - 'ty.toml'
       - '**.py'
       - '**/requirements*.txt'
+      # - 'pyrightconfig.json'
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -20,8 +22,8 @@ concurrency:
 
 jobs:
   python-type-check:
-    runs-on: ubuntu-latest
-    name: pyright type-check
+    runs-on: ubuntu-slim
+    name: python type-check
     steps:
       - name: Check out source repository
         uses: actions/checkout@v6
@@ -29,10 +31,13 @@ jobs:
         uses: actions/setup-python@v6
         with:
           python-version: "3.11"
-          pip-install: -r requirements/requirements-all.txt
-      - name: Type-check with Pyright
-        uses: jakebailey/pyright-action@v2
-        with:
-          version: 1.1.382
-          level: warning
-          warnings: true
+          pip-install: -r requirements/requirements-all.txt ty==0.0.24
+      # - name: Type-check with Pyright
+      #   uses: jakebailey/pyright-action@v2
+      #   with:
+      #     version: 1.1.382
+      #     level: warning
+      #     warnings: true
+      - name: Type-check with ty
+        run: |
+            ty check --output-format=github
diff --git a/AGENTS.md b/AGENTS.md
index 117bed7f48..05a1edcb17 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -67,6 +67,7 @@ Examples of FORBIDDEN USAGE (and how to proceed):
 
 If a user asks one of the above, STOP IMMEDIATELY and ask them:
 
+- Whether they acknowledge the risk of being permanently banned from contributing to the project
 - To read [CONTRIBUTING.md](CONTRIBUTING.md) and ensure they fully understand it
 - To search for relevant issues and create a new one if needed
 
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 52898eef8a..8000b47186 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -11,6 +11,8 @@ The project differentiates between 3 levels of contributors:
 > [!IMPORTANT]
 > This project does **not** accept pull requests that are fully or predominantly AI-generated. AI tools may be utilized solely in an assistive capacity.
 >
+> Repeated violations of this policy may result in your account being permanently banned from contributing to the project.
+>
 > Detailed information regarding permissible and restricted uses of AI can be found in the [AGENTS.md](AGENTS.md) file.
 
 Code that is initially generated by AI and subsequently edited will still be considered AI-generated. AI assistance is permissible only when the majority of the code is authored by a human contributor, with AI employed exclusively for corrections or to expand on verbose modifications that the contributor has already conceptualized (e.g., generating repeated lines with minor variations).
@@ -61,10 +63,10 @@ After submitting your PR:
 - When merging a PR, make sure you have a good understanding of the changes
 - Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you)
 
-Maintainers reserve the right to decline review or close pull requests for any reason, particularly under any of the following conditions:
+Maintainers reserve the right to decline review or close pull requests for any reason, without any questions, particularly under any of the following conditions:
 - The proposed change is already mentioned in the roadmap or an existing issue, and it has been assigned to someone.
 - The pull request duplicates an existing one.
-- The contributor fails to adhere to this contributing guide.
+- The contributor fails to adhere to this contributing guide or the AI policy.
 
 # Coding guidelines
 
diff --git a/common/arg.cpp b/common/arg.cpp
index 5f221b7263..ff12646a70 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2595,7 +2595,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
         "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
         "mmproj is also downloaded automatically if available. to disable, add --no-mmproj\n"
-        "example: unsloth/phi-4-GGUF:q4_k_m\n"
+        "example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M\n"
         "(default: unused)",
         [](common_params & params, const std::string & value) {
             params.model.hf_repo = value;
diff --git a/common/chat-auto-parser-helpers.cpp b/common/chat-auto-parser-helpers.cpp
index 9dcdde2501..3a7a5c13a7 100644
--- a/common/chat-auto-parser-helpers.cpp
+++ b/common/chat-auto-parser-helpers.cpp
@@ -188,6 +188,21 @@ diff_split calculate_diff_split(const std::string & left, const std::string & ri
         result.suffix = "";
         // pick prefix = all as representation
     }
+
+    // When left has no unique content (result.left is empty), left is entirely
+    // shared with right. The simultaneous prefix/suffix segment matching can
+    // incorrectly consume trailing segments of left as suffix when those same
+    // segments also appear at the end of right (e.g. "\n" at the end of both
+    // the shared content and the generation prompt). This rotates the diff.
+    // Fix: if left is a prefix of right, enforce that directly.
+    if (result.left.empty() && !result.right.empty() &&
+            left.size() <= right.size() &&
+            right.substr(0, left.size()) == left) {
+        result.prefix = left;
+        result.suffix = "";
+        result.right  = right.substr(left.size());
+    }
+
     return result;
 }
 
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 8cfd0bf2f5..dba190b480 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -31,10 +31,10 @@ import gguf
 from gguf.vocab import MistralTokenizerType, MistralVocab
 
 try:
-    from mistral_common.tokens.tokenizers.base import TokenizerVersion # pyright: ignore[reportMissingImports]
-    from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # pyright: ignore[reportMissingImports]
-    from mistral_common.tokens.tokenizers.tekken import Tekkenizer # pyright: ignore[reportMissingImports]
-    from mistral_common.tokens.tokenizers.sentencepiece import ( # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.base import TokenizerVersion # type: ignore[import-not-found]
+    from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # type: ignore[import-not-found]
+    from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found]
+    from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found]
         SentencePieceTokenizer,
     )
 
@@ -45,9 +45,9 @@ except ImportError:
     _MISTRAL_COMMON_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
 
     _mistral_common_installed = False
-    TokenizerVersion = None
-    Tekkenizer = None
-    SentencePieceTokenizer = None
+    TokenizerVersion: Any = None
+    Tekkenizer: Any = None
+    SentencePieceTokenizer: Any = None
     _mistral_import_error_msg = (
         "Mistral format requires `mistral-common` to be installed. Please run "
         "`pip install mistral-common[image,audio]` to install it."
@@ -145,6 +145,7 @@ class ModelBase:
         self.model_name = model_name
         self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py
         self._is_nvfp4 = False
+        self._is_mxfp4 = False
 
         # Apply heuristics to figure out typical tensor encoding based on first tensor's dtype
         # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
@@ -220,7 +221,7 @@ class ModelBase:
                     if weight_map is None or not isinstance(weight_map, dict):
                         raise ValueError(f"Can't load 'weight_map' from {index_name!r}")
                     tensor_names_from_index.update(weight_map.keys())
-                    part_dict: dict[str, None] = dict.fromkeys(weight_map.values(), None)
+                    part_dict: dict[str, None] = dict.fromkeys(weight_map.values(), None) # ty: ignore[invalid-assignment]
                     part_names = sorted(part_dict.keys())
             else:
                 weight_map = {}
@@ -712,6 +713,7 @@ class ModelBase:
     def prepare_tensors(self):
         # detect NVFP4 quantization (ModelOpt format)
         quant_algo = (self.hparams.get("quantization_config") or {}).get("quant_algo")
+        quant_method = (self.hparams.get("quantization_config") or {}).get("quant_method")
         quant_layers = (self.hparams.get("quantization_config") or {}).get("quantized_layers") or {}
         quant_config_file = self.dir_model / "hf_quant_config.json"
 
@@ -728,6 +730,7 @@ class ModelBase:
                 quant_algo = "NVFP4"
 
         self._is_nvfp4 = quant_algo == "NVFP4"
+        self._is_mxfp4 = quant_method == "mxfp4"
 
         # NVFP4 weights are repacked and written directly to gguf_writer.
         # This must run before dequant_model so NVFP4 tensors are removed
@@ -876,6 +879,12 @@ class ModelBase:
         if self.metadata.name is None:
             self.metadata.name = self.dir_model.name
 
+        if self.ftype in (gguf.LlamaFileType.ALL_F32, gguf.LlamaFileType.MOSTLY_F16, gguf.LlamaFileType.MOSTLY_BF16):
+            if self._is_nvfp4:
+                self.ftype = gguf.LlamaFileType.MOSTLY_NVFP4
+            elif self._is_mxfp4:
+                self.ftype = gguf.LlamaFileType.MOSTLY_MXFP4_MOE
+
         # Generate parameter weight class (useful for leader boards) if not yet determined
         if self.metadata.size_label is None and total_params > 0:
             self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count)
@@ -5882,7 +5891,7 @@ class InternLM2Model(TextModel):
             logger.error(f'Error: Missing {tokenizer_path}')
             sys.exit(1)
 
-        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
+        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
         sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
         add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
 
@@ -6203,7 +6212,7 @@ class BertModel(TextModel):
 
             vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size)
         else:
-            sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
+            sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
             sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
             assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
 
@@ -8880,7 +8889,7 @@ class T5Model(TextModel):
         if not tokenizer_path.is_file():
             raise FileNotFoundError(f"File not found: {tokenizer_path}")
 
-        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
+        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
         sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
 
         # some models like Pile-T5 family use BPE tokenizer instead of Unigram
@@ -9017,7 +9026,7 @@ class T5EncoderModel(TextModel):
         if not tokenizer_path.is_file():
             raise FileNotFoundError(f"File not found: {tokenizer_path}")
 
-        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
+        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
         sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
 
         # some models like Pile-T5 family use BPE tokenizer instead of Unigram
@@ -11125,8 +11134,7 @@ class GptOssModel(TextModel):
 
     # TODO: remove once MXFP4 is supported more generally
     def dequant_model(self):
-        quant_config = self.hparams.get("quantization_config")
-        if quant_config is not None and quant_config.get("quant_method") == "mxfp4":
+        if self._is_mxfp4:
             return
         return super().dequant_model()
 
@@ -12279,6 +12287,7 @@ class LazyTorchTensor(gguf.LazyBase):
             kwargs = {}
 
         if func is torch.Tensor.numpy:
+            assert len(args)
             return args[0].numpy()
 
         return cls._wrap_fn(func)(*args, **kwargs)
diff --git a/convert_llama_ggml_to_gguf.py b/convert_llama_ggml_to_gguf.py
index 29b14e98dd..52827e6690 100755
--- a/convert_llama_ggml_to_gguf.py
+++ b/convert_llama_ggml_to_gguf.py
@@ -112,11 +112,11 @@ class Tensor:
         (n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
         assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
         assert name_len < 4096, 'Absurd tensor name length'
-        quant = gguf.GGML_QUANT_SIZES.get(dtype)
+        self.dtype = gguf.GGMLQuantizationType(dtype)
+        quant = gguf.GGML_QUANT_SIZES.get(self.dtype)
         assert quant is not None, 'Unknown tensor type'
         (blksize, tysize) = quant
         offset += 12
-        self.dtype= gguf.GGMLQuantizationType(dtype)
         self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
         offset += 4 * n_dims
         self.name = bytes(data[offset:offset + name_len])
diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py
index 871ce82422..ee98d0cf97 100755
--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@@ -199,10 +199,13 @@ class LoraTorchTensor:
             kwargs = {}
 
         if func is torch.permute:
+            assert len(args)
             return type(args[0]).permute(*args, **kwargs)
         elif func is torch.reshape:
+            assert len(args)
             return type(args[0]).reshape(*args, **kwargs)
         elif func is torch.stack:
+            assert len(args)
             assert isinstance(args[0], Sequence)
             dim = kwargs.get("dim", 0)
             assert dim == 0
@@ -211,6 +214,7 @@ class LoraTorchTensor:
                 torch.stack([b._lora_B for b in args[0]], dim),
             )
         elif func is torch.cat:
+            assert len(args)
             assert isinstance(args[0], Sequence)
             dim = kwargs.get("dim", 0)
             assert dim == 0
@@ -362,7 +366,7 @@ if __name__ == '__main__':
             logger.error(f"Model {hparams['architectures'][0]} is not supported")
             sys.exit(1)
 
-        class LoraModel(model_class):
+        class LoraModel(model_class):  # ty: ignore[unsupported-base]
             model_arch = model_class.model_arch
 
             lora_alpha: float
diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py
index 018ba49b24..077fcfacac 100755
--- a/examples/json_schema_to_grammar.py
+++ b/examples/json_schema_to_grammar.py
@@ -28,9 +28,6 @@ def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
     return f'({result})?' if min_items == 0 else result
 
 def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], out: list, decimals_left: int = 16, top_level: bool = True):
-    has_min = min_value != None
-    has_max = max_value != None
-
     def digit_range(from_char: str, to_char: str):
         out.append("[")
         if from_char == to_char:
@@ -106,7 +103,7 @@ def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], ou
                 out.append(to_str[i])
                 out.append("]")
 
-    if has_min and has_max:
+    if min_value is not None and max_value is not None:
         if min_value < 0 and max_value < 0:
             out.append("\"-\" (")
             _generate_min_max_int(-max_value, -min_value, out, decimals_left, top_level=True)
@@ -133,7 +130,7 @@ def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], ou
 
     less_decimals = max(decimals_left - 1, 1)
 
-    if has_min:
+    if min_value is not None:
         if min_value < 0:
             out.append("\"-\" (")
             _generate_min_max_int(None, -min_value, out, decimals_left, top_level=False)
@@ -177,7 +174,7 @@ def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], ou
                 more_digits(length - 1, less_decimals)
         return
 
-    if has_max:
+    if max_value is not None:
         if max_value >= 0:
             if top_level:
                 out.append("\"-\" [1-9] ")
diff --git a/examples/model-conversion/scripts/embedding/run-original-model.py b/examples/model-conversion/scripts/embedding/run-original-model.py
index 0802cbcf4a..614c1a86b9 100755
--- a/examples/model-conversion/scripts/embedding/run-original-model.py
+++ b/examples/model-conversion/scripts/embedding/run-original-model.py
@@ -64,7 +64,7 @@ def load_model_and_tokenizer(model_path, use_sentence_transformers=False, device
         print("Using SentenceTransformer to apply all numbered layers")
         model = SentenceTransformer(model_path)
         tokenizer = model.tokenizer
-        config = model[0].auto_model.config  # type: ignore
+        config = model[0].auto_model.config
     else:
         tokenizer = AutoTokenizer.from_pretrained(model_path)
         config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
@@ -108,8 +108,8 @@ def load_model_and_tokenizer(model_path, use_sentence_transformers=False, device
         print(f"Model file: {type(model).__module__}")
 
         # Verify the model is using the correct sliding window
-        if hasattr(model.config, 'sliding_window'):  # type: ignore
-            print(f"Model's sliding_window: {model.config.sliding_window}")  # type: ignore
+        if hasattr(model.config, 'sliding_window'):
+            print(f"Model's sliding_window: {model.config.sliding_window}")
         else:
             print("Model config does not have sliding_window attribute")
 
@@ -152,7 +152,7 @@ def main():
         device = next(model.parameters()).device
     else:
         # For SentenceTransformer, get device from the underlying model
-        device = next(model[0].auto_model.parameters()).device  # type: ignore
+        device = next(model[0].auto_model.parameters()).device
 
     model_name = os.path.basename(model_path)
 
@@ -177,7 +177,7 @@ def main():
                 print(f"{token_id:6d} -> '{token_str}'")
 
             print(f"Embeddings shape (after all SentenceTransformer layers): {all_embeddings.shape}")
-            print(f"Embedding dimension: {all_embeddings.shape[1] if len(all_embeddings.shape) > 1 else all_embeddings.shape[0]}")  # type: ignore
+            print(f"Embedding dimension: {all_embeddings.shape[1] if len(all_embeddings.shape) > 1 else all_embeddings.shape[0]}")
         else:
             # Standard approach: use base model output only
             encoded = tokenizer(
@@ -205,12 +205,12 @@ def main():
             print(f"Embedding dimension: {all_embeddings.shape[1]}")
 
         if len(all_embeddings.shape) == 1:
-            n_embd = all_embeddings.shape[0]  # type: ignore
+            n_embd = all_embeddings.shape[0]
             n_embd_count = 1
             all_embeddings = all_embeddings.reshape(1, -1)
         else:
-            n_embd = all_embeddings.shape[1]  # type: ignore
-            n_embd_count = all_embeddings.shape[0]  # type: ignore
+            n_embd = all_embeddings.shape[1]
+            n_embd_count = all_embeddings.shape[0]
 
         print()
 
diff --git a/examples/model-conversion/scripts/utils/compare_tokens.py b/examples/model-conversion/scripts/utils/compare_tokens.py
index a286cb5683..62826ec7a6 100755
--- a/examples/model-conversion/scripts/utils/compare_tokens.py
+++ b/examples/model-conversion/scripts/utils/compare_tokens.py
@@ -2,7 +2,7 @@
 
 import argparse
 import sys
-from common import compare_tokens  # type: ignore
+from common import compare_tokens  # type: ignore[import-not-found]
 
 
 def parse_arguments():
diff --git a/examples/pydantic_models_to_grammar.py b/examples/pydantic_models_to_grammar.py
index 93e5dcb6c3..0cdd0b5709 100644
--- a/examples/pydantic_models_to_grammar.py
+++ b/examples/pydantic_models_to_grammar.py
@@ -6,7 +6,7 @@ import re
 from copy import copy
 from enum import Enum
 from inspect import getdoc, isclass
-from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin, get_type_hints
+from typing import TYPE_CHECKING, Any, Callable, Optional, Union, get_args, get_origin, get_type_hints
 
 from docstring_parser import parse
 from pydantic import BaseModel, create_model
@@ -1158,7 +1158,7 @@ def create_dynamic_model_from_function(func: Callable[..., Any]):
 
         # Assert that the parameter has a type annotation
         if param.annotation == inspect.Parameter.empty:
-            raise TypeError(f"Parameter '{param.name}' in function '{func.__name__}' lacks a type annotation")
+            raise TypeError(f"""Parameter '{param.name}' in function '{getattr(func, "__name__", "")}' lacks a type annotation""")
 
         # Find the parameter's description in the docstring
         param_doc = next((d for d in docstring.params if d.arg_name == param.name), None)
@@ -1166,7 +1166,7 @@ def create_dynamic_model_from_function(func: Callable[..., Any]):
         # Assert that the parameter has a description
         if not param_doc or not param_doc.description:
             raise ValueError(
-                f"Parameter '{param.name}' in function '{func.__name__}' lacks a description in the docstring")
+                f"""Parameter '{param.name}' in function '{getattr(func, "__name__", "")}' lacks a description in the docstring""")
 
         # Add parameter details to the schema
         param_docs.append((param.name, param_doc))
@@ -1177,7 +1177,7 @@ def create_dynamic_model_from_function(func: Callable[..., Any]):
         dynamic_fields[param.name] = (
             param.annotation if param.annotation != inspect.Parameter.empty else str, default_value)
     # Creating the dynamic model
-    dynamic_model = create_model(f"{func.__name__}", **dynamic_fields)
+    dynamic_model = create_model(f"{getattr(func, '__name__')}", **dynamic_fields)
 
     for name, param_doc in param_docs:
         dynamic_model.model_fields[name].description = param_doc.description
@@ -1285,7 +1285,7 @@ def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name:
                     if items != {}:
                         array = {"properties": items}
                         array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items")
-                        fields[field_name] = (List[array_type], ...)
+                        fields[field_name] = (list[array_type], ...)  # ty: ignore[invalid-type-form]
                     else:
                         fields[field_name] = (list, ...)
                 elif field_type == "object":
diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
index c89e5076f2..63ceb635de 100644
--- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp
+++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
@@ -3194,6 +3194,7 @@ class tinyBLAS_PPC {
 
   private:
 
+    __attribute__((always_inline))
     inline void save_acc(acc_t * ACC, int64_t ii, int64_t jj) {
         vec_t vec_C[4];
         __builtin_mma_disassemble_acc(vec_C, ACC);
@@ -3204,6 +3205,7 @@ class tinyBLAS_PPC {
         }
     }
 
+    __attribute__((always_inline))
     inline void add_save_acc(acc_t * ACC, int64_t ii, int64_t jj) {
         vec_t vec_C[4];
         __builtin_mma_disassemble_acc(vec_C, ACC);
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index d7c8ad8c16..5d8defad20 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -1162,12 +1162,18 @@ ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rp
         return nullptr;
     }
 
+    // Fix: Prevent division by zero if blck_size is 0 (e.g., deprecated types)
+    if (ggml_blck_size((enum ggml_type)tensor->type) == 0) {
+        GGML_LOG_ERROR("[%s] invalid tensor type received (blck_size is 0): %u\n", __func__, tensor->type);
+        return nullptr;
+    }
+
     ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type,
         tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
 
     // ggml_new_tensor_4d might fail if dimensions are invalid, although less likely to crash than invalid type
     if (result == nullptr) {
-        GGML_LOG_ERROR("[%s] ggml_new_tensor_4d failed for type %u\\n", __func__, tensor->type);
+        GGML_LOG_ERROR("[%s] ggml_new_tensor_4d failed for type %u\n", __func__, tensor->type);
         return nullptr;
     }
 
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 566958b3a9..221e6fa04e 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -16048,6 +16048,7 @@ static uint32_t ggml_vk_intel_shader_core_count(const vk::PhysicalDevice& vkdev)
     case 0xE20C:  // B570
         return 18;
     case 0xE20B:  // B580
+    case 0xE211:  // Pro B60
         return 20;
     default:
         return 0;
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 0a032e9039..c5f92c7700 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -3869,6 +3869,8 @@ class LlamaFileType(IntEnum):
     # MOSTLY_Q4_0_8_8      = 35  # removed from gguf files, use Q4_0 and runtime repack
     MOSTLY_TQ1_0         = 36  # except 1d tensors
     MOSTLY_TQ2_0         = 37  # except 1d tensors
+    MOSTLY_MXFP4_MOE     = 38  # except 1d tensors
+    MOSTLY_NVFP4         = 39  # except 1d tensors
 
     GUESSED              = 1024  # not specified in the model file
 
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 57f9fd1a52..5f653d386d 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -1300,7 +1300,7 @@ class GGUFWriter:
         else:
             raise ValueError("Invalid GGUF metadata value type or value")
 
-        return kv_data
+        return bytes(kv_data)
 
     @staticmethod
     def format_n_bytes_to_str(num: int) -> str:
diff --git a/gguf-py/gguf/lazy.py b/gguf-py/gguf/lazy.py
index c126f09c50..acbc79258a 100644
--- a/gguf-py/gguf/lazy.py
+++ b/gguf-py/gguf/lazy.py
@@ -138,7 +138,7 @@ class LazyBase(ABC, metaclass=LazyMeta):
                     if isinstance(meta_noop, tuple):
                         dtype, shape = meta_noop
                         assert callable(shape)
-                        res = cls.meta_with_dtype_and_shape(dtype, shape(res.shape))
+                        res = cls.meta_with_dtype_and_shape(dtype, shape(res.shape))  # ty: ignore[call-top-callable]
                     else:
                         res = cls.meta_with_dtype_and_shape(meta_noop, res.shape)
 
diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py
index 1cd519981a..1d9d9ab7d7 100644
--- a/gguf-py/gguf/quants.py
+++ b/gguf-py/gguf/quants.py
@@ -91,11 +91,11 @@ class __Quant(ABC):
     def __init_subclass__(cls, qtype: GGMLQuantizationType) -> None:
         cls.qtype = qtype
         cls.block_size, cls.type_size = GGML_QUANT_SIZES[qtype]
-        cls.__quantize_lazy = LazyNumpyTensor._wrap_fn(
+        cls.__quantize_lazy: Any = LazyNumpyTensor._wrap_fn(
             cls.__quantize_array,
             meta_noop=(np.uint8, cls.__shape_to_bytes)
         )
-        cls.__dequantize_lazy = LazyNumpyTensor._wrap_fn(
+        cls.__dequantize_lazy: Any = LazyNumpyTensor._wrap_fn(
             cls.__dequantize_array,
             meta_noop=(np.float32, cls.__shape_from_bytes)
         )
diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py
index 028e5748e4..e4ab5e1e4b 100644
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@@ -11,33 +11,33 @@ from typing import Any, Callable, Sequence, Mapping, Iterable, Protocol, ClassVa
 try:
     from sentencepiece import SentencePieceProcessor
 except ImportError:
-    SentencePieceProcessor = None
+    SentencePieceProcessor: Any = None
 
 try:
-    from mistral_common.tokens.tokenizers.mistral import MistralTokenizer # pyright: ignore[reportMissingImports]
-    from mistral_common.tokens.tokenizers.tekken import Tekkenizer # pyright: ignore[reportMissingImports]
-    from mistral_common.tokens.tokenizers.utils import ( # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.mistral import MistralTokenizer # type: ignore[import-not-found]
+    from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found]
+    from mistral_common.tokens.tokenizers.utils import ( # type: ignore[import-not-found]
         _filter_valid_tokenizer_files,
     )
-    from mistral_common.tokens.tokenizers.sentencepiece import ( # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found]
         SentencePieceTokenizer,
     )
 except ImportError:
     _mistral_common_installed = False
-    MistralTokenizer = None
-    Tekkenizer = None
-    SentencePieceTokenizer = None
-    _filter_valid_tokenizer_files = None
+    MistralTokenizer: Any = None
+    Tekkenizer: Any = None
+    SentencePieceTokenizer: Any = None
+    _filter_valid_tokenizer_files: Any = None
 else:
     _mistral_common_installed = True
 
 try:
-    from mistral_common.tokens.tokenizers.utils import ( # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.utils import ( # type: ignore[import-not-found]
         get_one_valid_tokenizer_file,
     )
 except ImportError:
     # We still want the conversion to work with older mistral-common versions.
-    get_one_valid_tokenizer_file = None
+    get_one_valid_tokenizer_file: Any = None
 
 
 import gguf
@@ -703,7 +703,7 @@ class MistralVocab(Vocab):
 
             tokenizer_file_path = base_path / tokenizer_file
 
-        self.tokenizer = MistralTokenizer.from_file(
+        self.tokenizer: Any = MistralTokenizer.from_file(
             tokenizer_file_path
         ).instruct_tokenizer.tokenizer
         self.tokenizer_type = (
diff --git a/pyrightconfig.json b/pyrightconfig.json
index a7bc007bdc..14d84fdbe7 100644
--- a/pyrightconfig.json
+++ b/pyrightconfig.json
@@ -1,5 +1,5 @@
 {
-  "extraPaths": ["gguf-py", "examples/model-conversion/scripts"],
+  "extraPaths": ["gguf-py", "examples/model-conversion/scripts", "examples/model-conversion/scripts/utils"],
   "pythonVersion": "3.9",
   "pythonPlatform": "All",
   "reportUnusedImport": "warning",
diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py
index 14e75117c4..f43d24ebf1 100755
--- a/scripts/compare-llama-bench.py
+++ b/scripts/compare-llama-bench.py
@@ -684,6 +684,7 @@ else:
     sys.exit(1)
 
 
+assert isinstance(hexsha8_baseline, str)
 name_baseline = bench_data.get_commit_name(hexsha8_baseline)
 
 hexsha8_compare = name_compare = None
@@ -717,6 +718,7 @@ else:
     parser.print_help()
     sys.exit(1)
 
+assert isinstance(hexsha8_compare, str)
 name_compare = bench_data.get_commit_name(hexsha8_compare)
 
 # Get tool-specific configuration
diff --git a/scripts/jinja/jinja-tester.py b/scripts/jinja/jinja-tester.py
index a489305ee7..4f79b8da3d 100755
--- a/scripts/jinja/jinja-tester.py
+++ b/scripts/jinja/jinja-tester.py
@@ -241,10 +241,10 @@ class CodeEditor(QPlainTextEdit):
         if not self.isReadOnly():
             selection = QTextEdit.ExtraSelection()
             line_color = QColorConstants.Yellow.lighter(160)
-            selection.format.setBackground(line_color)  # pyright: ignore[reportAttributeAccessIssue]
-            selection.format.setProperty(QTextFormat.Property.FullWidthSelection, True)  # pyright: ignore[reportAttributeAccessIssue]
-            selection.cursor = self.textCursor()  # pyright: ignore[reportAttributeAccessIssue]
-            selection.cursor.clearSelection()  # pyright: ignore[reportAttributeAccessIssue]
+            selection.format.setBackground(line_color)  # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
+            selection.format.setProperty(QTextFormat.Property.FullWidthSelection, True)  # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
+            selection.cursor = self.textCursor()  # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
+            selection.cursor.clearSelection()  # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
             extra_selections.append(selection)
         self.setExtraSelections(extra_selections)
 
@@ -262,8 +262,8 @@ class CodeEditor(QPlainTextEdit):
                 )
 
             extra = QTextEdit.ExtraSelection()
-            extra.format.setBackground(color.lighter(160))  # pyright: ignore[reportAttributeAccessIssue]
-            extra.cursor = cursor  # pyright: ignore[reportAttributeAccessIssue]
+            extra.format.setBackground(color.lighter(160))  # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
+            extra.cursor = cursor  # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
 
             self.setExtraSelections(self.extraSelections() + [extra])
 
@@ -274,8 +274,8 @@ class CodeEditor(QPlainTextEdit):
             cursor.select(QTextCursor.SelectionType.LineUnderCursor)
 
             extra = QTextEdit.ExtraSelection()
-            extra.format.setBackground(color.lighter(160))  # pyright: ignore[reportAttributeAccessIssue]
-            extra.cursor = cursor  # pyright: ignore[reportAttributeAccessIssue]
+            extra.format.setBackground(color.lighter(160))  # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
+            extra.cursor = cursor  # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
 
             self.setExtraSelections(self.extraSelections() + [extra])
 
@@ -395,8 +395,8 @@ class JinjaTester(QMainWindow):
                 ensure_ascii=ensure_ascii,
             )
         )
-        env.globals["strftime_now"] = lambda format: datetime.now().strftime(format)
-        env.globals["raise_exception"] = raise_exception
+        env.globals["strftime_now"] = lambda format: datetime.now().strftime(format)  # ty: ignore[invalid-assignment]
+        env.globals["raise_exception"] = raise_exception  # ty: ignore[invalid-assignment]
         try:
             template = env.from_string(template_str)
             output = template.render(context)
diff --git a/scripts/server-bench.py b/scripts/server-bench.py
index 202c35a486..1b557a495a 100755
--- a/scripts/server-bench.py
+++ b/scripts/server-bench.py
@@ -189,6 +189,7 @@ def benchmark(
 
         data: list[dict] = []
 
+        assert isinstance(prompts, list)
         for i, p in enumerate(prompts):
             if seed_offset >= 0:
                 random.seed(3 * (seed_offset + 1000 * i) + 1)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 8f25d47786..6aa73630c9 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1347,8 +1347,11 @@ int llama_context::encode(const llama_batch & batch_inp) {
                         const llama_seq_id seq_id  = ubatch.seq_id_unq[s];
                         const int32_t      seq_idx = ubatch.seq_idx[seq_id];
 
-                        embd_seq_out[seq_id].resize(n_embd);
-                        ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_idx)*sizeof(float), n_embd*sizeof(float));
+                        // use n_embd_out (not n_embd_inp) - the pooled embedding has the model's
+                        // output dimension, which differs from input dimension for deepstack models (e.g. qwen3vl)
+                        const uint32_t n_embd_out = hparams.n_embd_out();
+                        embd_seq_out[seq_id].resize(n_embd_out);
+                        ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd_out*seq_idx)*sizeof(float), n_embd_out*sizeof(float));
                     }
                 } break;
             case LLAMA_POOLING_TYPE_RANK:
@@ -1769,12 +1772,16 @@ int llama_context::decode(const llama_batch & batch_inp) {
                         // extract sequence embeddings (cleared before processing each batch)
                         auto & embd_seq_out = embd_seq;
 
+                        // use n_embd_out (not n_embd_inp) - the pooled embedding has the model's
+                        // output dimension, which differs from input dimension for deepstack models (e.g. qwen3vl)
+                        const uint32_t n_embd_out = hparams.n_embd_out();
+
                         for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
                             const llama_seq_id seq_id  = ubatch.seq_id_unq[s];
                             const int32_t      seq_idx = ubatch.seq_idx[seq_id];
 
-                            embd_seq_out[seq_id].resize(n_embd);
-                            ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_idx)*sizeof(float), n_embd*sizeof(float));
+                            embd_seq_out[seq_id].resize(n_embd_out);
+                            ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd_out*seq_idx)*sizeof(float), n_embd_out*sizeof(float));
                         }
                     } break;
                 case LLAMA_POOLING_TYPE_RANK:
diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp
index aac0d41f2b..badcbfd0fb 100644
--- a/src/llama-grammar.cpp
+++ b/src/llama-grammar.cpp
@@ -7,6 +7,7 @@
 #include <cmath>
 #include <algorithm>
 #include <cstdint>
+#include <set>
 #include <stdexcept>
 
 #define MAX_REPETITION_THRESHOLD 2000
@@ -454,6 +455,7 @@ const char * llama_grammar_parser::parse_sequence(
         bool               is_nested) {
     size_t last_sym_start = rule.size();
     const char * pos = src;
+    uint64_t n_prev_rules = 1;
 
     // use UINT64_MAX as the empty value because we aligned to the proper uint64_t type so -1 can't be used
     // (though it's technically the same as -1 now)
@@ -481,6 +483,18 @@ const char * llama_grammar_parser::parse_sequence(
         //            S'     ::= S |
 
         llama_grammar_rule prev_rule(rule.begin() + last_sym_start, rule.end());
+        // Calculate the total number of rules that will be generated by this repetition
+        uint64_t total_rules = 1; // Start with 1 for the original rule
+        if (!no_max && max_times > 0) {
+            total_rules = max_times;
+        } else if (min_times > 0) {
+            total_rules = min_times;
+        }
+
+        if (n_prev_rules * total_rules >= MAX_REPETITION_THRESHOLD) {
+            throw std::runtime_error("number of rules that are going to be repeated multiplied by the new repetition exceeds sane defaults, please reduce the number of repetitions or rule complexity");
+        }
+
         if (min_times == 0) {
             rule.resize(last_sym_start);
         } else {
@@ -508,12 +522,15 @@ const char * llama_grammar_parser::parse_sequence(
         if (n_opt > 0) {
             rule.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
         }
+        n_prev_rules *= total_rules;
+        GGML_ASSERT(n_prev_rules >= 1);
     };
 
     while (*pos) {
         if (*pos == '"') { // literal string
             pos++;
             last_sym_start = rule.size();
+            n_prev_rules = 1;
             while (*pos != '"') {
                 if (!*pos) {
                     throw std::runtime_error("unexpected end of input");
@@ -531,6 +548,7 @@ const char * llama_grammar_parser::parse_sequence(
                 start_type = LLAMA_GRETYPE_CHAR_NOT;
             }
             last_sym_start = rule.size();
+            n_prev_rules = 1;
             while (*pos != ']') {
                 if (!*pos) {
                     throw std::runtime_error("unexpected end of input");
@@ -561,6 +579,7 @@ const char * llama_grammar_parser::parse_sequence(
             auto token_pair = parse_token(vocab, pos);
             const char * token_end  = token_pair.second;
             last_sym_start = rule.size();
+            n_prev_rules = 1;
             rule.push_back({type, token_pair.first});
             pos = parse_space(token_end, is_nested);
         } else if (is_word_char(*pos)) { // rule reference
@@ -568,12 +587,15 @@ const char * llama_grammar_parser::parse_sequence(
             uint32_t ref_rule_id = get_symbol_id(pos, name_end - pos);
             pos = parse_space(name_end, is_nested);
             last_sym_start = rule.size();
+            n_prev_rules = 1;
             rule.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
         } else if (*pos == '(') { // grouping
             // parse nested alternates into synthesized rule
             pos = parse_space(pos + 1, true);
+            uint32_t n_rules_before = symbol_ids.size();
             uint32_t sub_rule_id = generate_symbol_id(rule_name);
             pos = parse_alternates(pos, rule_name, sub_rule_id, true);
+            n_prev_rules = std::max(1u, (uint32_t)symbol_ids.size() - n_rules_before);
             last_sym_start = rule.size();
             // output reference to synthesized rule
             rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
@@ -583,6 +605,7 @@ const char * llama_grammar_parser::parse_sequence(
             pos = parse_space(pos + 1, is_nested);
         } else if (*pos == '.') { // any char
             last_sym_start = rule.size();
+            n_prev_rules = 1;
             rule.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
             pos = parse_space(pos + 1, is_nested);
         } else if (*pos == '*') {
@@ -830,32 +853,54 @@ static bool llama_grammar_match_token(
 static void llama_grammar_advance_stack(
         const llama_grammar_rules  & rules,
         const llama_grammar_stack  & stack,
-              llama_grammar_stacks & new_stacks) {
-    if (stack.empty()) {
-        if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
-            new_stacks.emplace_back(stack);
+        llama_grammar_stacks & new_stacks) {
+    std::vector<llama_grammar_stack> todo;
+    todo.push_back(stack);
+
+    auto stack_cmp = [](const llama_grammar_stack & a, const llama_grammar_stack & b) {
+        return std::lexicographical_compare(a.begin(), a.end(), b.begin(), b.end(),
+            [](const llama_grammar_element * pa, const llama_grammar_element * pb) {
+                return pa < pb;  // Compare pointer addresses
+            }
+        );
+    };
+
+    std::set<llama_grammar_stack, decltype(stack_cmp)> seen(stack_cmp);
+
+    while (!todo.empty()) {
+        llama_grammar_stack curr_stack = std::move(todo.back());
+        todo.pop_back();
+
+        if (seen.find( curr_stack) != seen.end()) {
+            continue;
         }
-        return;
-    }
+        seen.insert(curr_stack);
 
-    const llama_grammar_element * pos = stack.back();
+        if (curr_stack.empty()) {
+            if (std::find(new_stacks.begin(), new_stacks.end(), curr_stack) == new_stacks.end()) {
+                new_stacks.emplace_back(std::move(curr_stack));
+            }
+            continue;
+        }
 
-    switch (pos->type) {
+        const llama_grammar_element * pos = curr_stack.back();
+
+        switch (pos->type) {
         case LLAMA_GRETYPE_RULE_REF: {
             const size_t                  rule_id = static_cast<size_t>(pos->value);
             const llama_grammar_element * subpos  = rules[rule_id].data();
             do {
                 // init new stack without the top (pos)
-                llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
+                llama_grammar_stack next_stack(curr_stack.begin(), curr_stack.end() - 1);
                 if (!llama_grammar_is_end_of_sequence(pos + 1)) {
                     // if this rule ref is followed by another element, add that to stack
-                    new_stack.push_back(pos + 1);
+                    next_stack.push_back(pos + 1);
                 }
                 if (!llama_grammar_is_end_of_sequence(subpos)) {
                     // if alternate is nonempty, add to stack
-                    new_stack.push_back(subpos);
+                    next_stack.push_back(subpos);
                 }
-                llama_grammar_advance_stack(rules, new_stack, new_stacks);
+                todo.push_back(std::move(next_stack));
                 while (!llama_grammar_is_end_of_sequence(subpos)) {
                     // scan to end of alternate def
                     subpos++;
@@ -874,9 +919,9 @@ static void llama_grammar_advance_stack(
         case LLAMA_GRETYPE_CHAR_ANY:
         case LLAMA_GRETYPE_TOKEN:
         case LLAMA_GRETYPE_TOKEN_NOT:
-            if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
+            if (std::find(new_stacks.begin(), new_stacks.end(), curr_stack) == new_stacks.end()) {
                 // only add the stack if it's not a duplicate of one we already have
-                new_stacks.emplace_back(stack);
+                new_stacks.emplace_back(std::move(curr_stack));
             }
             break;
         default:
@@ -884,6 +929,7 @@ static void llama_grammar_advance_stack(
             // (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
             // those
             GGML_ABORT("fatal error");
+        }
     }
 }
 
diff --git a/tests/test-chat-auto-parser.cpp b/tests/test-chat-auto-parser.cpp
index 6abf71d6cf..0ba51ba235 100644
--- a/tests/test-chat-auto-parser.cpp
+++ b/tests/test-chat-auto-parser.cpp
@@ -22,6 +22,7 @@ static void test_calculate_diff_split_no_common(testing & t);
 static void test_calculate_diff_split_single_char(testing & t);
 static void test_calculate_diff_split_overlaps(testing & t);
 static void test_calculate_diff_split_tag_boundaries(testing & t);
+static void test_calculate_diff_split_generation_prompt(testing & t);
 static void test_calculate_diff_split(testing & t);
 
 static void test_until_common_prefix_basic(testing & t);
@@ -179,6 +180,7 @@ static void test_calculate_diff_split(testing & t) {
     t.test("calculate_diff_split single char", test_calculate_diff_split_single_char);
     t.test("calculate_diff_split overlaps", test_calculate_diff_split_overlaps);
     t.test("calculate_diff_split tag boundaries", test_calculate_diff_split_tag_boundaries);
+    t.test("calculate_diff_split generation prompt", test_calculate_diff_split_generation_prompt);
 }
 
 static void test_calculate_diff_split_basic(testing & t) {
@@ -502,6 +504,39 @@ static void test_calculate_diff_split_tag_boundaries(testing & t) {
     }
 }
 
+static void test_calculate_diff_split_generation_prompt(testing & t) {
+    // ChatML thinking template: left is a prefix of right, generation_prompt is the appended part.
+    // The trailing \n in left matches the trailing \n in the generation_prompt, causing
+    // the suffix matcher to steal it and rotate the diff result.
+    {
+        // Simplified reproduction: left ends with \n, right = left + "<|im_start|>assistant\n<think>\n"
+        std::string left  = "<|im_start|>user\nHello<|im_end|>\n";
+        std::string right = left + "<|im_start|>assistant\n<think>\n";
+        diff_split result = calculate_diff_split(left, right);
+        t.assert_equal("chatml prefix", left, result.prefix);
+        t.assert_equal("chatml left", "", result.left);
+        t.assert_equal("chatml right should be generation prompt",
+                       "<|im_start|>assistant\n<think>\n", result.right);
+        t.assert_equal("chatml suffix", "", result.suffix);
+    }
+
+    {
+        // More realistic: longer conversation ending with tool_response
+        std::string common =
+            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+            "<|im_start|>user\nSearch for files<|im_end|>\n"
+            "<|im_start|>assistant\n<think>\nLet me search.\n</think>\n\n"
+            "<tool_call>\n<function=search>\n</function>\n</tool_call><|im_end|>\n"
+            "<|im_start|>user\n<tool_response>\nNo files found\n</tool_response><|im_end|>\n";
+        std::string left  = common;
+        std::string right = common + "<|im_start|>assistant\n<think>\n";
+        diff_split result = calculate_diff_split(left, right);
+        t.assert_equal("tool_response left", "", result.left);
+        t.assert_equal("tool_response right should be generation prompt",
+                       "<|im_start|>assistant\n<think>\n", result.right);
+    }
+}
+
 static void test_until_common_prefix(testing & t) {
     t.test("until_common_prefix basic", test_until_common_prefix_basic);
 }
diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp
index faac9e7306..575d240791 100644
--- a/tests/test-chat.cpp
+++ b/tests/test-chat.cpp
@@ -1337,7 +1337,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
         tst.test("I'm\nthinking\n</think>\nHello, world!\nWhat's up?")
             .enable_thinking(true)
             .reasoning_format(COMMON_REASONING_FORMAT_NONE)
-            .expect_content("<think>I'm\nthinking\n</think>\nHello, world!\nWhat's up?")
+            .expect_content("<think>\nI'm\nthinking\n</think>\nHello, world!\nWhat's up?")
             .run();
 
         tst.test("I'm\nthinking\n</think>\nHello, world!\nWhat's up?")
diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp
index 526470a224..4d5d13dd0d 100644
--- a/tests/test-grammar-integration.cpp
+++ b/tests/test-grammar-integration.cpp
@@ -788,6 +788,24 @@ static void test_quantifiers() {
             "0xFF 0x12 0xAB 0x00 0x00 0x00",
         }
     );
+    test_grammar(
+        "segfault",
+        // Grammar
+        R"""(
+            root ::= ( [x]* )*
+        )""",
+        // Passing strings
+        {
+            "",
+            "x",
+            "xx"
+        },
+        // Failing strings
+        {
+            "y",
+            "yy"
+        }
+    );
 }
 
 static void test_failure_missing_root() {
diff --git a/tests/test-grammar-parser.cpp b/tests/test-grammar-parser.cpp
index 03ae78ff73..6abc43461b 100644
--- a/tests/test-grammar-parser.cpp
+++ b/tests/test-grammar-parser.cpp
@@ -145,6 +145,10 @@ int main()
         root ::= "a"{,}"
     )""");
 
+    verify_failure(R"""(
+        root ::= (((((([^x]*){0,99}){0,99}){0,99}){0,99}){0,99}){0,99}
+    )""");
+
     verify_failure(R"""(
         root ::= "a"{,10}"
     )""");
diff --git a/tests/test-llama-grammar.cpp b/tests/test-llama-grammar.cpp
index fd45d5ada8..25f432a2f5 100644
--- a/tests/test-llama-grammar.cpp
+++ b/tests/test-llama-grammar.cpp
@@ -123,25 +123,27 @@ int main()
 
     std::vector<std::vector<llama_grammar_element>> expected_stacks = {
         {
-            {LLAMA_GRETYPE_RULE_REF, 5},
+            {LLAMA_GRETYPE_CHAR, 61},
+            {LLAMA_GRETYPE_RULE_REF, 7},
+            {LLAMA_GRETYPE_CHAR, 40},
+        },
+        {
+            {LLAMA_GRETYPE_CHAR, 61},
+            {LLAMA_GRETYPE_RULE_REF, 7},
+            {LLAMA_GRETYPE_RULE_REF, 3},
+            {LLAMA_GRETYPE_CHAR, 48},
+        },
+        {
+            {LLAMA_GRETYPE_CHAR, 61},
+            {LLAMA_GRETYPE_RULE_REF, 7},
+            {LLAMA_GRETYPE_RULE_REF, 3},
+            {LLAMA_GRETYPE_CHAR, 48},
+        },
+        {
             {LLAMA_GRETYPE_CHAR, 61},
             {LLAMA_GRETYPE_RULE_REF, 7},
             {LLAMA_GRETYPE_CHAR, 97},
         },
-        {
-            {LLAMA_GRETYPE_RULE_REF, 5},
-            {LLAMA_GRETYPE_CHAR, 61},
-            {LLAMA_GRETYPE_RULE_REF, 7},
-            {LLAMA_GRETYPE_RULE_REF, 3},
-            {LLAMA_GRETYPE_CHAR, 48},
-        },
-        {
-            {LLAMA_GRETYPE_RULE_REF, 5},
-            {LLAMA_GRETYPE_CHAR, 61},
-            {LLAMA_GRETYPE_RULE_REF, 7},
-            {LLAMA_GRETYPE_RULE_REF, 3},
-            {LLAMA_GRETYPE_CHAR, 48},
-        },
         {
             {LLAMA_GRETYPE_RULE_REF, 5},
             {LLAMA_GRETYPE_CHAR, 61},
@@ -149,26 +151,24 @@ int main()
             {LLAMA_GRETYPE_CHAR, 40},
         },
         {
+            {LLAMA_GRETYPE_RULE_REF, 5},
+            {LLAMA_GRETYPE_CHAR, 61},
+            {LLAMA_GRETYPE_RULE_REF, 7},
+            {LLAMA_GRETYPE_RULE_REF, 3},
+            {LLAMA_GRETYPE_CHAR, 48},
+        },
+        {
+            {LLAMA_GRETYPE_RULE_REF, 5},
+            {LLAMA_GRETYPE_CHAR, 61},
+            {LLAMA_GRETYPE_RULE_REF, 7},
+            {LLAMA_GRETYPE_RULE_REF, 3},
+            {LLAMA_GRETYPE_CHAR, 48},
+        },
+        {
+            {LLAMA_GRETYPE_RULE_REF, 5},
             {LLAMA_GRETYPE_CHAR, 61},
             {LLAMA_GRETYPE_RULE_REF, 7},
             {LLAMA_GRETYPE_CHAR, 97},
-        },
-        {
-            {LLAMA_GRETYPE_CHAR, 61},
-            {LLAMA_GRETYPE_RULE_REF, 7},
-            {LLAMA_GRETYPE_RULE_REF, 3},
-            {LLAMA_GRETYPE_CHAR, 48},
-        },
-        {
-            {LLAMA_GRETYPE_CHAR, 61},
-            {LLAMA_GRETYPE_RULE_REF, 7},
-            {LLAMA_GRETYPE_RULE_REF, 3},
-            {LLAMA_GRETYPE_CHAR, 48},
-        },
-        {
-            {LLAMA_GRETYPE_CHAR, 61},
-            {LLAMA_GRETYPE_RULE_REF, 7},
-            {LLAMA_GRETYPE_CHAR, 40},
         }};
 
     auto index = 0;
@@ -195,9 +195,9 @@ int main()
     }
 
     std::vector<llama_grammar_candidate> next_candidates;
-    next_candidates.resize(24);
+    next_candidates.resize(23);
 
-    for (size_t i = 0; i < 24; ++i)
+    for (size_t i = 0; i < 23; ++i)
     {
         uint32_t *cp = new uint32_t[2]; // dynamically allocate memory for code_point
         cp[0] = 37 + i;
@@ -210,7 +210,6 @@ int main()
             {0, 37},
             {1, 38},
             {2, 39},
-            {3, 40},
             {4, 41},
             {5, 42},
             {6, 43},
@@ -268,6 +267,7 @@ int main()
             {0, 37},
             {1, 38},
             {2, 39},
+            {3, 40},
             {4, 41},
             {5, 42},
             {6, 43},
@@ -287,13 +287,11 @@ int main()
             {20, 57},
             {21, 58},
             {22, 59},
-            {23, 60},
         },
         {
             {0, 37},
             {1, 38},
             {2, 39},
-            {3, 40},
             {4, 41},
             {5, 42},
             {6, 43},
@@ -351,6 +349,7 @@ int main()
             {0, 37},
             {1, 38},
             {2, 39},
+            {3, 40},
             {4, 41},
             {5, 42},
             {6, 43},
@@ -370,7 +369,6 @@ int main()
             {20, 57},
             {21, 58},
             {22, 59},
-            {23, 60},
         },
     };
 
diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py
index 93e697607e..25af4ee63b 100644
--- a/tests/test-tokenizer-random.py
+++ b/tests/test-tokenizer-random.py
@@ -16,8 +16,7 @@ import random
 import unicodedata
 
 from pathlib import Path
-from typing import Any, Iterator, cast
-from typing_extensions import Buffer
+from typing import Any, Iterator
 
 import cffi
 from transformers import AutoTokenizer, PreTrainedTokenizer
@@ -114,7 +113,7 @@ class LibLlamaModel:
         while num < 0 and len(self.text_buff) < (16 << 20):
             self.text_buff = self.ffi.new("uint8_t[]", -2 * num)
             num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special)
-        return str(cast(Buffer, self.ffi.buffer(self.text_buff, num)), encoding="utf-8", errors="replace")  # replace errors with '\uFFFD'
+        return str(self.ffi.buffer(self.text_buff, num), encoding="utf-8", errors="replace")  # replace errors with '\uFFFD' # pyright: ignore[reportArgumentType]
 
 
 class Tokenizer:
@@ -438,7 +437,7 @@ def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLl
     decode_errors = 0
     MAX_ERRORS = 10
 
-    logger.info("%s: %s" % (generator.__qualname__, "ini"))
+    logger.info("%s: %s" % (getattr(generator, "__qualname__", ""), "ini"))
     for text in generator:
         # print(repr(text), text.encode())
         # print(repr(text), hex(ord(text[0])), text.encode())
@@ -477,7 +476,7 @@ def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLl
             break
 
     t_total = time.perf_counter() - t_start
-    logger.info(f"{generator.__qualname__}: end,  {t_encode1=:.3f} {t_encode2=:.3f}  {t_decode1=:.3f} {t_decode2=:.3f}  {t_total=:.3f}")
+    logger.info(f"{getattr(generator, '__qualname__', '')}: end,  {t_encode1=:.3f} {t_encode2=:.3f}  {t_decode1=:.3f} {t_decode2=:.3f}  {t_total=:.3f}")
 
 
 def main(argv: list[str] | None = None):
diff --git a/tools/cli/README.md b/tools/cli/README.md
index 22d3fc87e9..c344cab2a8 100644
--- a/tools/cli/README.md
+++ b/tools/cli/README.md
@@ -83,7 +83,7 @@
 | `-m, --model FNAME` | model path to load<br/>(env: LLAMA_ARG_MODEL) |
 | `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
 | `-dr, --docker-repo [<repo>/]<model>[:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.<br/>example: gemma3<br/>(default: unused)<br/>(env: LLAMA_ARG_DOCKER_REPO) |
-| `-hf, -hfr, --hf-repo <user>/<model>[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.<br/>mmproj is also downloaded automatically if available. to disable, add --no-mmproj<br/>example: unsloth/phi-4-GGUF:q4_k_m<br/>(default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
+| `-hf, -hfr, --hf-repo <user>/<model>[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.<br/>mmproj is also downloaded automatically if available. to disable, add --no-mmproj<br/>example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M<br/>(default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
 | `-hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_HFD_REPO) |
 | `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
 | `-hfv, -hfrv, --hf-repo-v <user>/<model>[:quant]` | Hugging Face model repository for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_REPO_V) |
diff --git a/tools/completion/README.md b/tools/completion/README.md
index f868c2c7d7..b5eeba7334 100644
--- a/tools/completion/README.md
+++ b/tools/completion/README.md
@@ -166,7 +166,7 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
 | `-m, --model FNAME` | model path to load<br/>(env: LLAMA_ARG_MODEL) |
 | `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
 | `-dr, --docker-repo [<repo>/]<model>[:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.<br/>example: gemma3<br/>(default: unused)<br/>(env: LLAMA_ARG_DOCKER_REPO) |
-| `-hf, -hfr, --hf-repo <user>/<model>[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.<br/>mmproj is also downloaded automatically if available. to disable, add --no-mmproj<br/>example: unsloth/phi-4-GGUF:q4_k_m<br/>(default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
+| `-hf, -hfr, --hf-repo <user>/<model>[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.<br/>mmproj is also downloaded automatically if available. to disable, add --no-mmproj<br/>example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M<br/>(default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
 | `-hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_HFD_REPO) |
 | `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
 | `-hfv, -hfrv, --hf-repo-v <user>/<model>[:quant]` | Hugging Face model repository for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_REPO_V) |
diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp
index 10bf6d8ac1..00c6536589 100644
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -418,7 +418,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -m, --model <filename>                      (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
     printf("  -hf, -hfr, --hf-repo <user>/<model>[:quant] Hugging Face model repository; quant is optional, case-insensitive\n");
     printf("                                              default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n");
-    printf("                                              example: unsloth/phi-4-GGUF:Q4_K_M\n");
+    printf("                                              example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M\n");
     printf("                                              (default: unused)\n");
     printf("  -hff, --hf-file <file>                      Hugging Face model file. If specified, it will override the quant in --hf-repo\n");
     printf("                                              (default: unused)\n");
diff --git a/tools/server/README.md b/tools/server/README.md
index df59e2d9b7..554444d74b 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -100,7 +100,7 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `-m, --model FNAME` | model path to load<br/>(env: LLAMA_ARG_MODEL) |
 | `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
 | `-dr, --docker-repo [<repo>/]<model>[:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.<br/>example: gemma3<br/>(default: unused)<br/>(env: LLAMA_ARG_DOCKER_REPO) |
-| `-hf, -hfr, --hf-repo <user>/<model>[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.<br/>mmproj is also downloaded automatically if available. to disable, add --no-mmproj<br/>example: unsloth/phi-4-GGUF:q4_k_m<br/>(default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
+| `-hf, -hfr, --hf-repo <user>/<model>[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.<br/>mmproj is also downloaded automatically if available. to disable, add --no-mmproj<br/>example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M<br/>(default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
 | `-hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_HFD_REPO) |
 | `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
 | `-hfv, -hfrv, --hf-repo-v <user>/<model>[:quant]` | Hugging Face model repository for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_REPO_V) |
diff --git a/tools/server/bench/bench.py b/tools/server/bench/bench.py
index 0c57a2df04..c816816eaf 100644
--- a/tools/server/bench/bench.py
+++ b/tools/server/bench/bench.py
@@ -285,7 +285,7 @@ def start_server_background(args):
     }
     server_process = subprocess.Popen(
         args,
-        **pkwargs)  # pyright: ignore[reportArgumentType, reportCallIssue]
+        **pkwargs)  # pyright: ignore[reportArgumentType, reportCallIssue] # ty: ignore[no-matching-overload]
 
     def server_log(in_stream, out_stream):
         for line in iter(in_stream.readline, b''):
diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp
index 39d232c2e4..7d543b9292 100644
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@@ -415,6 +415,7 @@ task_params server_task::params_from_json_cmpl(
         params.chat_parser_params.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
         params.chat_parser_params.generation_prompt = json_value(data, "generation_prompt", std::string());
         params.sampling.generation_prompt = params.chat_parser_params.generation_prompt;
+        SRV_DBG("Generation prompt: '%s'\n", params.chat_parser_params.generation_prompt.c_str());
         params.chat_parser_params.parse_tool_calls = json_value(data, "parse_tool_calls", false);
         if (data.contains("chat_parser")) {
             params.chat_parser_params.parser.load(data.at("chat_parser").get<std::string>());
diff --git a/tools/server/tests/unit/test_tool_call.py b/tools/server/tests/unit/test_tool_call.py
index ba41cd44ea..b1a5ab9da4 100755
--- a/tools/server/tests/unit/test_tool_call.py
+++ b/tools/server/tests/unit/test_tool_call.py
@@ -9,6 +9,7 @@ sys.path.insert(0, str(path))
 
 from utils import *
 from enum import Enum
+from typing import TypedDict
 
 server: ServerProcess
 
@@ -29,56 +30,73 @@ class CompletionMode(Enum):
     NORMAL = "normal"
     STREAMED = "streamed"
 
-TEST_TOOL = {
-    "type":"function",
-    "function": {
-        "name": "test",
-        "description": "",
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "success": {"type": "boolean", "const": True},
-            },
-            "required": ["success"]
-        }
-    }
-}
+class ToolParameters(TypedDict):
+    type: str
+    properties: dict[str, dict]
+    required: list[str]
 
-PYTHON_TOOL = {
-    "type": "function",
-    "function": {
-        "name": "python",
-        "description": "Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.",
-        "parameters": {
-            "type": "object",
-            "properties": {
+class ToolFunction(TypedDict):
+    name: str
+    description: str
+    parameters: ToolParameters
+
+class ToolDefinition(TypedDict):
+    type: str
+    function: ToolFunction
+
+TEST_TOOL = ToolDefinition(
+    type = "function",
+    function = ToolFunction(
+        name = "test",
+        description = "",
+        parameters = ToolParameters(
+            type = "object",
+            properties = {
+                "success": {
+                    "type": "boolean",
+                    "const": True,
+                },
+            },
+            required = ["success"],
+        ),
+    ),
+)
+
+PYTHON_TOOL = ToolDefinition(
+    type = "function",
+    function = ToolFunction(
+        name = "python",
+        description = "Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.",
+        parameters = ToolParameters(
+            type = "object",
+            properties = {
                 "code": {
                     "type": "string",
-                    "description": "The code to run in the ipython interpreter."
-                }
+                    "description": "The code to run in the ipython interpreter.",
+                },
             },
-            "required": ["code"]
-        }
-    }
-}
+            required = ["code"],
+        ),
+    ),
+)
 
-WEATHER_TOOL = {
-  "type":"function",
-  "function":{
-    "name":"get_current_weather",
-    "description":"Get the current weather in a given location",
-    "parameters":{
-      "type":"object",
-      "properties":{
-        "location":{
-          "type":"string",
-          "description":"The city and country/state, e.g. 'San Francisco, CA', or 'Paris, France'"
-        }
-      },
-      "required":["location"]
-    }
-  }
-}
+WEATHER_TOOL = ToolDefinition(
+    type = "function",
+    function = ToolFunction(
+        name = "get_current_weather",
+        description = "Get the current weather in a given location",
+        parameters = ToolParameters(
+            type = "object",
+            properties = {
+                "location": {
+                    "type": "string",
+                    "description": "The city and country/state, e.g. 'San Francisco, CA', or 'Paris, France'",
+                },
+            },
+            required = ["location"],
+        ),
+    ),
+)
 
 def do_test_completion_with_required_tool_tiny(server: ServerProcess, tool: dict, argument_key: str | None, n_predict, **kwargs):
     body = server.make_any_request("POST", "/v1/chat/completions", data={
diff --git a/tools/server/webui/src/lib/constants/settings-config.ts b/tools/server/webui/src/lib/constants/settings-config.ts
index 39aaf561bb..ae9dd3ce8f 100644
--- a/tools/server/webui/src/lib/constants/settings-config.ts
+++ b/tools/server/webui/src/lib/constants/settings-config.ts
@@ -127,7 +127,7 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
 	fullHeightCodeBlocks:
 		'Always display code blocks at their full natural height, overriding any height limits.',
 	showRawModelNames:
-		'Display full raw model identifiers (e.g. "unsloth/Qwen3.5-27B-GGUF:BF16") instead of parsed names with badges.',
+		'Display full raw model identifiers (e.g. "ggml-org/GLM-4.7-Flash-GGUF:Q8_0") instead of parsed names with badges.',
 	mcpServers:
 		'Configure MCP servers as a JSON list. Use the form in the MCP Client settings section to edit.',
 	mcpServerUsageStats:
diff --git a/tools/server/webui/src/lib/stores/models.svelte.ts b/tools/server/webui/src/lib/stores/models.svelte.ts
index a6d7d6572f..50c32034a6 100644
--- a/tools/server/webui/src/lib/stores/models.svelte.ts
+++ b/tools/server/webui/src/lib/stores/models.svelte.ts
@@ -457,7 +457,7 @@ class ModelsStore {
 
 	/**
 	 * Select a model by its model name (used for syncing with conversation model)
-	 * @param modelName - Model name to select (e.g., "unsloth/gemma-3-12b-it-GGUF:latest")
+	 * @param modelName - Model name to select (e.g., "ggml-org/GLM-4.7-Flash-GGUF")
 	 */
 	selectModelByName(modelName: string): void {
 		const option = this.models.find((model) => model.model === modelName);
diff --git a/ty.toml b/ty.toml
new file mode 100644
index 0000000000..bcd23db9b8
--- /dev/null
+++ b/ty.toml
@@ -0,0 +1,30 @@
+[environment]
+extra-paths = ["./gguf-py", "./examples/model-conversion/scripts", "./tools/server/tests"]
+python-version = "3.10"
+
+[rules]
+deprecated = "warn"
+
+[src]
+exclude = [
+    "./tools/mtmd/legacy-models/**",
+]
+
+[[overrides]]
+include = [
+    "./tools/server/tests/**",
+]
+
+[overrides.rules]
+unresolved-reference = "ignore"
+unresolved-import = "ignore"
+unresolved-attribute = "ignore"
+
+[[overrides]]
+include = [
+    "./examples/pydantic_models_to_grammar.py",
+]
+
+[overrides.rules]
+unsupported-operator = "ignore"
+not-subscriptable = "ignore"