diff --git a/.github/workflows/ai-issues.yml b/.github/workflows/ai-issues.yml index caa74ca223..c762901b5a 100644 --- a/.github/workflows/ai-issues.yml +++ b/.github/workflows/ai-issues.yml @@ -26,7 +26,9 @@ jobs: { "bash": { "*": "deny", - "gh issue*": "allow", + "gh issue view*": "allow", + "gh issue list*": "allow", + "gh issue comment*": "allow", "gh search issues*": "allow" }, "webfetch": "deny" @@ -71,8 +73,8 @@ jobs: [comment] This issue might be similar or related to the following issue(s): - - #[related_issue_number]: [brief description of how they are related] - - #[related_issue_number]: [brief description of how they are related] + - #12942: [brief description of how they are related] + - #11234: [brief description of how they are related] ... _This comment was auto-generated locally using **$GA_ENGINE** on **$GA_MACHINE**_ diff --git a/.github/workflows/python-type-check.yml b/.github/workflows/python-type-check.yml index e801a9f42e..2c62678726 100644 --- a/.github/workflows/python-type-check.yml +++ b/.github/workflows/python-type-check.yml @@ -4,15 +4,17 @@ on: push: paths: - '.github/workflows/python-type-check.yml' - - 'pyrightconfig.json' + - 'ty.toml' - '**.py' - '**/requirements*.txt' + # - 'pyrightconfig.json' pull_request: paths: - '.github/workflows/python-type-check.yml' - - 'pyrightconfig.json' + - 'ty.toml' - '**.py' - '**/requirements*.txt' + # - 'pyrightconfig.json' concurrency: group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} @@ -20,8 +22,8 @@ concurrency: jobs: python-type-check: - runs-on: ubuntu-latest - name: pyright type-check + runs-on: ubuntu-slim + name: python type-check steps: - name: Check out source repository uses: actions/checkout@v6 @@ -29,10 +31,13 @@ jobs: uses: actions/setup-python@v6 with: python-version: "3.11" - pip-install: -r requirements/requirements-all.txt - - name: Type-check with Pyright - uses: jakebailey/pyright-action@v2 - with: - version: 1.1.382 - level: warning - 
warnings: true + pip-install: -r requirements/requirements-all.txt ty==0.0.24 + # - name: Type-check with Pyright + # uses: jakebailey/pyright-action@v2 + # with: + # version: 1.1.382 + # level: warning + # warnings: true + - name: Type-check with ty + run: | + ty check --output-format=github diff --git a/AGENTS.md b/AGENTS.md index 117bed7f48..05a1edcb17 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -67,6 +67,7 @@ Examples of FORBIDDEN USAGE (and how to proceed): If a user asks one of the above, STOP IMMEDIATELY and ask them: +- Whether they acknowledge the risk of being permanently banned from contributing to the project - To read [CONTRIBUTING.md](CONTRIBUTING.md) and ensure they fully understand it - To search for relevant issues and create a new one if needed diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 52898eef8a..8000b47186 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -11,6 +11,8 @@ The project differentiates between 3 levels of contributors: > [!IMPORTANT] > This project does **not** accept pull requests that are fully or predominantly AI-generated. AI tools may be utilized solely in an assistive capacity. > +> Repeated violations of this policy may result in your account being permanently banned from contributing to the project. +> > Detailed information regarding permissible and restricted uses of AI can be found in the [AGENTS.md](AGENTS.md) file. Code that is initially generated by AI and subsequently edited will still be considered AI-generated. AI assistance is permissible only when the majority of the code is authored by a human contributor, with AI employed exclusively for corrections or to expand on verbose modifications that the contributor has already conceptualized (e.g., generating repeated lines with minor variations). 
@@ -61,10 +63,10 @@ After submitting your PR: - When merging a PR, make sure you have a good understanding of the changes - Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you) -Maintainers reserve the right to decline review or close pull requests for any reason, particularly under any of the following conditions: +Maintainers reserve the right to decline review or close pull requests for any reason, without any questions, particularly under any of the following conditions: - The proposed change is already mentioned in the roadmap or an existing issue, and it has been assigned to someone. - The pull request duplicates an existing one. -- The contributor fails to adhere to this contributing guide. +- The contributor fails to adhere to this contributing guide or the AI policy. # Coding guidelines diff --git a/common/arg.cpp b/common/arg.cpp index 98070d43e2..187bd00033 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2583,7 +2583,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"-hf", "-hfr", "--hf-repo"}, "/[:quant]", "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n" "mmproj is also downloaded automatically if available. 
to disable, add --no-mmproj\n" - "example: unsloth/phi-4-GGUF:q4_k_m\n" + "example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M\n" "(default: unused)", [](common_params & params, const std::string & value) { params.model.hf_repo = value; diff --git a/common/chat-diff-analyzer.cpp b/common/chat-diff-analyzer.cpp index 4b827c9ae5..e35596b93f 100644 --- a/common/chat-diff-analyzer.cpp +++ b/common/chat-diff-analyzer.cpp @@ -348,6 +348,34 @@ void analyze_reasoning::compare_thinking_enabled() { mode = reasoning_mode::TAG_BASED; } } + } else if (!left_trimmed.empty() && !right_trimmed.empty()) { + // Full-output diff is noisy (e.g., SmolLM3 changes the system message when enable_thinking flips). + // Try to find reasoning markers by tail-anchoring: + // one output's generation prompt tail may appear in the other with extra reasoning markers appended. + const auto & output_A = comparison->output_A; + const auto & output_B = comparison->output_B; + const size_t anchor_len = 64; + + for (int dir = 0; dir < 2; dir++) { + const auto & base = dir == 0 ? output_B : output_A; + const auto & extended = dir == 0 ? 
output_A : output_B; + + size_t len = std::min(base.size(), anchor_len); + std::string anchor = base.substr(base.size() - len); + auto pos = extended.rfind(anchor); + if (pos == std::string::npos || pos + len >= extended.size()) continue; + + std::string extra = trim_whitespace(extended.substr(pos + len)); + if (extra.empty()) continue; + + auto seg = prune_whitespace_segments(segmentize_markers(extra)); + if (seg.size() == 2 && seg[0].type == segment_type::MARKER && seg[1].type == segment_type::MARKER) { + if (start.empty()) start = seg[0].value; + if (end.empty()) end = seg[1].value; + mode = reasoning_mode::TAG_BASED; + break; + } + } } if (mode == reasoning_mode::NONE && start.empty() && !end.empty()) { diff --git a/common/jinja/parser.cpp b/common/jinja/parser.cpp index 7970336ac0..4ae4477445 100644 --- a/common/jinja/parser.cpp +++ b/common/jinja/parser.cpp @@ -53,6 +53,13 @@ private: return tokens[current + offset]; } + const token & next() { + if (current >= tokens.size()) { + throw parser_exception("Parser Error: Unexpected EOF", source, tokens.empty() ? 
0 : tokens.back().pos); + } + return tokens[current++]; + } + token expect(token::type type, const std::string& error) { const auto & t = peek(); if (t.t != type) { @@ -90,9 +97,9 @@ private: size_t start_pos = current; switch (peek().t) { case token::comment: - return mk_stmt(start_pos, tokens[current++].value); + return mk_stmt(start_pos, next().value); case token::text: - return mk_stmt(start_pos, tokens[current++].value); + return mk_stmt(start_pos, next().value); case token::open_statement: return parse_jinja_statement(); case token::open_expression: @@ -119,8 +126,7 @@ private: } size_t start_pos = current; - std::string name = peek().value; - current++; // consume identifier + std::string name = next().value; statement_ptr result; if (name == "set") { @@ -202,7 +208,7 @@ private: // Ignore generation blocks (transformers-specific) // See https://github.com/huggingface/transformers/pull/30650 for more information. result = mk_stmt(start_pos); - current++; + ++current; } else { throw std::runtime_error("Unknown statement: " + name); @@ -217,7 +223,7 @@ private: statements body; if (is(token::equals)) { - current++; + ++current; value = parse_expression_sequence(); } else { // parsing multiline set here @@ -280,7 +286,7 @@ private: exprs.push_back(primary ? parse_primary_expression() : parse_expression()); bool is_tuple = is(token::comma); while (is(token::comma)) { - current++; // consume comma + ++current; // consume comma exprs.push_back(primary ? parse_primary_expression() : parse_expression()); } return is_tuple ? 
mk_stmt(start_pos, std::move(exprs)) : std::move(exprs[0]); @@ -290,7 +296,7 @@ private: // e.g., `message` in `for message in messages` auto loop_var = parse_expression_sequence(true); // should be an identifier/tuple if (!is_identifier("in")) throw std::runtime_error("Expected 'in'"); - current++; + ++current; // consume 'in' // `messages` in `for message in messages` auto iterable = parse_expression(); @@ -305,7 +311,8 @@ private: } if (is_statement({"else"})) { - current += 2; + ++current; // consume {% + ++current; // consume 'else' expect(token::close_statement, "Expected %}"); while (!is_statement({"endfor"})) { alternate.push_back(parse_any()); @@ -347,7 +354,7 @@ private: auto left = parse_logical_and_expression(); while (is_identifier("or")) { size_t start_pos = current; - token op = tokens[current++]; + token op = next(); left = mk_stmt(start_pos, op, std::move(left), parse_logical_and_expression()); } return left; @@ -357,7 +364,7 @@ private: auto left = parse_logical_negation_expression(); while (is_identifier("and")) { size_t start_pos = current; - auto op = tokens[current++]; + auto op = next(); left = mk_stmt(start_pos, op, std::move(left), parse_logical_negation_expression()); } return left; @@ -367,7 +374,7 @@ private: // Try parse unary operators if (is_identifier("not")) { size_t start_pos = current; - auto op = tokens[current++]; + auto op = next(); return mk_stmt(start_pos, op, parse_logical_negation_expression()); } return parse_comparison_expression(); @@ -382,11 +389,12 @@ private: size_t start_pos = current; if (is_identifier("not") && peek(1).t == token::identifier && peek(1).value == "in") { op = {token::identifier, "not in", tokens[current].pos}; - current += 2; + ++current; // consume 'not' + ++current; // consume 'in' } else if (is_identifier("in")) { - op = tokens[current++]; + op = next(); } else if (is(token::comparison_binary_operator)) { - op = tokens[current++]; + op = next(); } else break; left = mk_stmt(start_pos, op, 
std::move(left), parse_additive_expression()); } @@ -397,7 +405,7 @@ private: auto left = parse_multiplicative_expression(); while (is(token::additive_binary_operator)) { size_t start_pos = current; - auto op = tokens[current++]; + auto op = next(); left = mk_stmt(start_pos, op, std::move(left), parse_multiplicative_expression()); } return left; @@ -407,7 +415,7 @@ private: auto left = parse_test_expression(); while (is(token::multiplicative_binary_operator)) { size_t start_pos = current; - auto op = tokens[current++]; + auto op = next(); left = mk_stmt(start_pos, op, std::move(left), parse_test_expression()); } return left; @@ -417,9 +425,9 @@ private: auto operand = parse_filter_expression(); while (is_identifier("is")) { size_t start_pos = current; - current++; + ++current; // consume 'is' bool negate = false; - if (is_identifier("not")) { current++; negate = true; } + if (is_identifier("not")) { ++current; negate = true; } auto test_id = parse_primary_expression(); // FIXME: tests can also be expressed like this: if x is eq 3 if (is(token::open_paren)) test_id = parse_call_expression(std::move(test_id)); @@ -432,7 +440,7 @@ private: auto operand = parse_call_member_expression(); while (is(token::pipe)) { size_t start_pos = current; - current++; + ++current; // consume pipe auto filter = parse_primary_expression(); if (is(token::open_paren)) filter = parse_call_expression(std::move(filter)); operand = mk_stmt(start_pos, std::move(operand), std::move(filter)); @@ -490,7 +498,7 @@ private: statement_ptr parse_member_expression(statement_ptr object) { size_t start_pos = current; while (is(token::dot) || is(token::open_square_bracket)) { - auto op = tokens[current++]; + auto op = next(); bool computed = op.t == token::open_square_bracket; statement_ptr prop; if (computed) { @@ -536,7 +544,7 @@ private: statement_ptr parse_primary_expression() { size_t start_pos = current; - auto t = tokens[current++]; + auto t = next(); switch (t.t) { case token::numeric_literal: if 
(t.value.find('.') != std::string::npos) { @@ -547,7 +555,7 @@ private: case token::string_literal: { std::string val = t.value; while (is(token::string_literal)) { - val += tokens[current++].value; + val += next().value; } return mk_stmt(start_pos, val); } @@ -562,9 +570,9 @@ private: statements vals; while (!is(token::close_square_bracket)) { vals.push_back(parse_expression()); - if (is(token::comma)) current++; + if (is(token::comma)) ++current; } - current++; + ++current; return mk_stmt(start_pos, std::move(vals)); } case token::open_curly_bracket: { @@ -573,9 +581,9 @@ private: auto key = parse_expression(); expect(token::colon, "Expected :"); pairs.push_back({std::move(key), parse_expression()}); - if (is(token::comma)) current++; + if (is(token::comma)) ++current; } - current++; + ++current; return mk_stmt(start_pos, std::move(pairs)); } default: diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 8cfd0bf2f5..0cd47645d3 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -31,10 +31,10 @@ import gguf from gguf.vocab import MistralTokenizerType, MistralVocab try: - from mistral_common.tokens.tokenizers.base import TokenizerVersion # pyright: ignore[reportMissingImports] - from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # pyright: ignore[reportMissingImports] - from mistral_common.tokens.tokenizers.tekken import Tekkenizer # pyright: ignore[reportMissingImports] - from mistral_common.tokens.tokenizers.sentencepiece import ( # pyright: ignore[reportMissingImports] + from mistral_common.tokens.tokenizers.base import TokenizerVersion # type: ignore[import-not-found] + from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # type: ignore[import-not-found] + from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found] + 
from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found] SentencePieceTokenizer, ) @@ -45,9 +45,9 @@ except ImportError: _MISTRAL_COMMON_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) _mistral_common_installed = False - TokenizerVersion = None - Tekkenizer = None - SentencePieceTokenizer = None + TokenizerVersion: Any = None + Tekkenizer: Any = None + SentencePieceTokenizer: Any = None _mistral_import_error_msg = ( "Mistral format requires `mistral-common` to be installed. Please run " "`pip install mistral-common[image,audio]` to install it." @@ -145,6 +145,7 @@ class ModelBase: self.model_name = model_name self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py self._is_nvfp4 = False + self._is_mxfp4 = False # Apply heuristics to figure out typical tensor encoding based on first tensor's dtype # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie. @@ -220,7 +221,7 @@ class ModelBase: if weight_map is None or not isinstance(weight_map, dict): raise ValueError(f"Can't load 'weight_map' from {index_name!r}") tensor_names_from_index.update(weight_map.keys()) - part_dict: dict[str, None] = dict.fromkeys(weight_map.values(), None) + part_dict: dict[str, None] = dict.fromkeys(weight_map.values(), None) # ty: ignore[invalid-assignment] part_names = sorted(part_dict.keys()) else: weight_map = {} @@ -712,6 +713,7 @@ class ModelBase: def prepare_tensors(self): # detect NVFP4 quantization (ModelOpt format) quant_algo = (self.hparams.get("quantization_config") or {}).get("quant_algo") + quant_method = (self.hparams.get("quantization_config") or {}).get("quant_method") quant_layers = (self.hparams.get("quantization_config") or {}).get("quantized_layers") or {} quant_config_file = self.dir_model / "hf_quant_config.json" @@ -728,6 +730,7 @@ class ModelBase: quant_algo = "NVFP4" self._is_nvfp4 = quant_algo == "NVFP4" + self._is_mxfp4 = quant_method == "mxfp4" # NVFP4 weights are repacked and 
written directly to gguf_writer. # This must run before dequant_model so NVFP4 tensors are removed @@ -876,6 +879,12 @@ class ModelBase: if self.metadata.name is None: self.metadata.name = self.dir_model.name + if self.ftype in (gguf.LlamaFileType.ALL_F32, gguf.LlamaFileType.MOSTLY_F16, gguf.LlamaFileType.MOSTLY_BF16): + if self._is_nvfp4: + self.ftype = gguf.LlamaFileType.MOSTLY_NVFP4 + elif self._is_mxfp4: + self.ftype = gguf.LlamaFileType.MOSTLY_MXFP4_MOE + # Generate parameter weight class (useful for leader boards) if not yet determined if self.metadata.size_label is None and total_params > 0: self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count) @@ -4264,6 +4273,16 @@ class Qwen25OmniModel(Qwen2VLVisionModel): @ModelBase.register("InternVisionModel") class InternVisionModel(MmprojModel): + + min_dynamic_tiles: int = 0 + max_dynamic_tiles: int = 0 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + self.min_dynamic_tiles = self.global_config.get("min_dynamic_patch", 0) + self.max_dynamic_tiles = self.global_config.get("max_dynamic_patch", 0) + def set_gguf_parameters(self): assert self.hparams_vision is not None if isinstance(self.hparams_vision['image_size'], list): @@ -4286,6 +4305,11 @@ class InternVisionModel(MmprojModel): downsample_ratio = self.global_config.get("downsample_ratio") assert downsample_ratio is not None self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio)) + # older models may not have min/max_dynamic_patch in config + if self.min_dynamic_tiles > 0: + self.gguf_writer.add_vision_preproc_min_tiles(self.min_dynamic_tiles) + if self.max_dynamic_tiles > 0: + self.gguf_writer.add_vision_preproc_max_tiles(self.max_dynamic_tiles) def tensor_force_quant(self, name, new_name, bid, n_dims): if ".position_embd." 
in new_name: @@ -5882,7 +5906,7 @@ class InternLM2Model(TextModel): logger.error(f'Error: Missing {tokenizer_path}') sys.exit(1) - sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] + sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix @@ -6203,7 +6227,7 @@ class BertModel(TextModel): vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size) else: - sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] + sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM @@ -8880,7 +8904,7 @@ class T5Model(TextModel): if not tokenizer_path.is_file(): raise FileNotFoundError(f"File not found: {tokenizer_path}") - sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] + sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) # some models like Pile-T5 family use BPE tokenizer instead of Unigram @@ -9017,7 +9041,7 @@ class T5EncoderModel(TextModel): if not tokenizer_path.is_file(): raise FileNotFoundError(f"File not found: {tokenizer_path}") - sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] + sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) # some models like Pile-T5 family use BPE tokenizer instead of Unigram @@ -11125,8 +11149,7 @@ class 
GptOssModel(TextModel): # TODO: remove once MXFP4 is supported more generally def dequant_model(self): - quant_config = self.hparams.get("quantization_config") - if quant_config is not None and quant_config.get("quant_method") == "mxfp4": + if self._is_mxfp4: return return super().dequant_model() @@ -12279,6 +12302,7 @@ class LazyTorchTensor(gguf.LazyBase): kwargs = {} if func is torch.Tensor.numpy: + assert len(args) return args[0].numpy() return cls._wrap_fn(func)(*args, **kwargs) diff --git a/convert_llama_ggml_to_gguf.py b/convert_llama_ggml_to_gguf.py index 29b14e98dd..52827e6690 100755 --- a/convert_llama_ggml_to_gguf.py +++ b/convert_llama_ggml_to_gguf.py @@ -112,11 +112,11 @@ class Tensor: (n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12]) assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}' assert name_len < 4096, 'Absurd tensor name length' - quant = gguf.GGML_QUANT_SIZES.get(dtype) + self.dtype = gguf.GGMLQuantizationType(dtype) + quant = gguf.GGML_QUANT_SIZES.get(self.dtype) assert quant is not None, 'Unknown tensor type' (blksize, tysize) = quant offset += 12 - self.dtype= gguf.GGMLQuantizationType(dtype) self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)]) offset += 4 * n_dims self.name = bytes(data[offset:offset + name_len]) diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 871ce82422..ee98d0cf97 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -199,10 +199,13 @@ class LoraTorchTensor: kwargs = {} if func is torch.permute: + assert len(args) return type(args[0]).permute(*args, **kwargs) elif func is torch.reshape: + assert len(args) return type(args[0]).reshape(*args, **kwargs) elif func is torch.stack: + assert len(args) assert isinstance(args[0], Sequence) dim = kwargs.get("dim", 0) assert dim == 0 @@ -211,6 +214,7 @@ class LoraTorchTensor: torch.stack([b._lora_B for b in args[0]], dim), ) elif func is torch.cat: + assert len(args) assert 
isinstance(args[0], Sequence) dim = kwargs.get("dim", 0) assert dim == 0 @@ -362,7 +366,7 @@ if __name__ == '__main__': logger.error(f"Model {hparams['architectures'][0]} is not supported") sys.exit(1) - class LoraModel(model_class): + class LoraModel(model_class): # ty: ignore[unsupported-base] model_arch = model_class.model_arch lora_alpha: float diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py index 018ba49b24..077fcfacac 100755 --- a/examples/json_schema_to_grammar.py +++ b/examples/json_schema_to_grammar.py @@ -28,9 +28,6 @@ def _build_repetition(item_rule, min_items, max_items, separator_rule=None): return f'({result})?' if min_items == 0 else result def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], out: list, decimals_left: int = 16, top_level: bool = True): - has_min = min_value != None - has_max = max_value != None - def digit_range(from_char: str, to_char: str): out.append("[") if from_char == to_char: @@ -106,7 +103,7 @@ def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], ou out.append(to_str[i]) out.append("]") - if has_min and has_max: + if min_value is not None and max_value is not None: if min_value < 0 and max_value < 0: out.append("\"-\" (") _generate_min_max_int(-max_value, -min_value, out, decimals_left, top_level=True) @@ -133,7 +130,7 @@ def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], ou less_decimals = max(decimals_left - 1, 1) - if has_min: + if min_value is not None: if min_value < 0: out.append("\"-\" (") _generate_min_max_int(None, -min_value, out, decimals_left, top_level=False) @@ -177,7 +174,7 @@ def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], ou more_digits(length - 1, less_decimals) return - if has_max: + if max_value is not None: if max_value >= 0: if top_level: out.append("\"-\" [1-9] ") diff --git a/examples/model-conversion/scripts/embedding/run-original-model.py 
b/examples/model-conversion/scripts/embedding/run-original-model.py index 0802cbcf4a..614c1a86b9 100755 --- a/examples/model-conversion/scripts/embedding/run-original-model.py +++ b/examples/model-conversion/scripts/embedding/run-original-model.py @@ -64,7 +64,7 @@ def load_model_and_tokenizer(model_path, use_sentence_transformers=False, device print("Using SentenceTransformer to apply all numbered layers") model = SentenceTransformer(model_path) tokenizer = model.tokenizer - config = model[0].auto_model.config # type: ignore + config = model[0].auto_model.config else: tokenizer = AutoTokenizer.from_pretrained(model_path) config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) @@ -108,8 +108,8 @@ def load_model_and_tokenizer(model_path, use_sentence_transformers=False, device print(f"Model file: {type(model).__module__}") # Verify the model is using the correct sliding window - if hasattr(model.config, 'sliding_window'): # type: ignore - print(f"Model's sliding_window: {model.config.sliding_window}") # type: ignore + if hasattr(model.config, 'sliding_window'): + print(f"Model's sliding_window: {model.config.sliding_window}") else: print("Model config does not have sliding_window attribute") @@ -152,7 +152,7 @@ def main(): device = next(model.parameters()).device else: # For SentenceTransformer, get device from the underlying model - device = next(model[0].auto_model.parameters()).device # type: ignore + device = next(model[0].auto_model.parameters()).device model_name = os.path.basename(model_path) @@ -177,7 +177,7 @@ def main(): print(f"{token_id:6d} -> '{token_str}'") print(f"Embeddings shape (after all SentenceTransformer layers): {all_embeddings.shape}") - print(f"Embedding dimension: {all_embeddings.shape[1] if len(all_embeddings.shape) > 1 else all_embeddings.shape[0]}") # type: ignore + print(f"Embedding dimension: {all_embeddings.shape[1] if len(all_embeddings.shape) > 1 else all_embeddings.shape[0]}") else: # Standard approach: use base 
model output only encoded = tokenizer( @@ -205,12 +205,12 @@ def main(): print(f"Embedding dimension: {all_embeddings.shape[1]}") if len(all_embeddings.shape) == 1: - n_embd = all_embeddings.shape[0] # type: ignore + n_embd = all_embeddings.shape[0] n_embd_count = 1 all_embeddings = all_embeddings.reshape(1, -1) else: - n_embd = all_embeddings.shape[1] # type: ignore - n_embd_count = all_embeddings.shape[0] # type: ignore + n_embd = all_embeddings.shape[1] + n_embd_count = all_embeddings.shape[0] print() diff --git a/examples/model-conversion/scripts/utils/compare_tokens.py b/examples/model-conversion/scripts/utils/compare_tokens.py index a286cb5683..62826ec7a6 100755 --- a/examples/model-conversion/scripts/utils/compare_tokens.py +++ b/examples/model-conversion/scripts/utils/compare_tokens.py @@ -2,7 +2,7 @@ import argparse import sys -from common import compare_tokens # type: ignore +from common import compare_tokens # type: ignore[import-not-found] def parse_arguments(): diff --git a/examples/pydantic_models_to_grammar.py b/examples/pydantic_models_to_grammar.py index 93e5dcb6c3..0cdd0b5709 100644 --- a/examples/pydantic_models_to_grammar.py +++ b/examples/pydantic_models_to_grammar.py @@ -6,7 +6,7 @@ import re from copy import copy from enum import Enum from inspect import getdoc, isclass -from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin, get_type_hints +from typing import TYPE_CHECKING, Any, Callable, Optional, Union, get_args, get_origin, get_type_hints from docstring_parser import parse from pydantic import BaseModel, create_model @@ -1158,7 +1158,7 @@ def create_dynamic_model_from_function(func: Callable[..., Any]): # Assert that the parameter has a type annotation if param.annotation == inspect.Parameter.empty: - raise TypeError(f"Parameter '{param.name}' in function '{func.__name__}' lacks a type annotation") + raise TypeError(f"""Parameter '{param.name}' in function '{getattr(func, "__name__", "")}' lacks a 
type annotation""") # Find the parameter's description in the docstring param_doc = next((d for d in docstring.params if d.arg_name == param.name), None) @@ -1166,7 +1166,7 @@ def create_dynamic_model_from_function(func: Callable[..., Any]): # Assert that the parameter has a description if not param_doc or not param_doc.description: raise ValueError( - f"Parameter '{param.name}' in function '{func.__name__}' lacks a description in the docstring") + f"""Parameter '{param.name}' in function '{getattr(func, "__name__", "")}' lacks a description in the docstring""") # Add parameter details to the schema param_docs.append((param.name, param_doc)) @@ -1177,7 +1177,7 @@ def create_dynamic_model_from_function(func: Callable[..., Any]): dynamic_fields[param.name] = ( param.annotation if param.annotation != inspect.Parameter.empty else str, default_value) # Creating the dynamic model - dynamic_model = create_model(f"{func.__name__}", **dynamic_fields) + dynamic_model = create_model(f"{getattr(func, '__name__')}", **dynamic_fields) for name, param_doc in param_docs: dynamic_model.model_fields[name].description = param_doc.description @@ -1285,7 +1285,7 @@ def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name: if items != {}: array = {"properties": items} array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items") - fields[field_name] = (List[array_type], ...) + fields[field_name] = (list[array_type], ...) # ty: ignore[invalid-type-form] else: fields[field_name] = (list, ...) 
elif field_type == "object": diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index b45774dde3..adb4d68e86 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -3011,6 +3011,58 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) { } } +void ggml_cann_rope_cache_preload(ggml_backend_cann_context & ctx, ggml_tensor * dst) { + ggml_tensor * src0 = dst->src[0]; + + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + int sections[4]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; + + GGML_TENSOR_UNARY_OP_LOCALS + + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + memcpy(&sections, (int32_t *) dst->op_params + 11, sizeof(int) * 4); + + const float theta_scale = powf(freq_base, -2.0f / n_dims); + + float corr_dims[2]; + ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); + + bool is_neox = mode & GGML_ROPE_TYPE_NEOX; + const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE; + const bool mrope_used = mode & GGML_ROPE_TYPE_MROPE; + const bool is_vision = mode == GGML_ROPE_TYPE_VISION; + + if (is_imrope || mrope_used) { + is_neox = true; + } + + int64_t rope_dims = n_dims; + if (is_vision) { + rope_dims = src0->ne[0]; + } + + // Run the full cache init on the non-captured stream. This performs all + // host-to-device memcpy, aclrtMalloc/Free, and on-device computations + // so that the memory pool is warmed up and cache metadata is populated.
+ aclnn_rope_cache_init(ctx, dst, corr_dims, ext_factor, theta_scale, freq_scale, attn_factor, is_neox, sections, + mrope_used, is_imrope, is_vision, rope_dims); + + // Reset `cached` so that during graph capture the on-device computations + // (sin/cos, position multiply, repeat, etc.) still execute and get recorded + // into the captured graph. The cache metadata (theta_scale_length, + // theta_scale, sections, position_length, etc.) remains set, which causes + // all host-to-device copy and malloc/free branches to be skipped. + ctx.rope_cache.cached = false; +} + void ggml_cann_argmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) { ggml_tensor * src0 = dst->src[0]; diff --git a/ggml/src/ggml-cann/aclnn_ops.h b/ggml/src/ggml-cann/aclnn_ops.h index 3effa1c289..7f5ba4d330 100644 --- a/ggml/src/ggml-cann/aclnn_ops.h +++ b/ggml/src/ggml-cann/aclnn_ops.h @@ -543,6 +543,21 @@ void ggml_cann_mul_mat(ggml_backend_cann_context & ctx, ggml_tensor * dst); */ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst); +/** + * @brief Pre-load the RoPE cache before ACL graph capture. + * + * This function must be called outside of graph capture to perform + * host-to-device memory copies and device memory allocations that are + * not allowed on a captured stream. After pre-loading, the rope cache + * metadata is updated so that the subsequent call to + * aclnn_rope_cache_init (inside graph capture) skips these operations + * and only records the on-device computations into the captured graph. + * + * @param ctx CANN backend context. + * @param dst A ROPE destination tensor from the computation graph. + */ +void ggml_cann_rope_cache_preload(ggml_backend_cann_context & ctx, ggml_tensor * dst); + /** * @brief Computes the index of the maximum value along the specified dimension * of a ggml tensor using the CANN backend. 
diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h index 0120f0dfd1..5f960548cd 100644 --- a/ggml/src/ggml-cann/common.h +++ b/ggml/src/ggml-cann/common.h @@ -277,7 +277,7 @@ struct ggml_graph_node_properties { } } - if (node->op == GGML_OP_SCALE || node->op == GGML_OP_UNARY || node->op == GGML_OP_GLU) { + if (node->op == GGML_OP_SCALE || node->op == GGML_OP_UNARY || node->op == GGML_OP_GLU || node->op == GGML_OP_ROPE){ return memcmp(this->op_params, node->op_params, GGML_MAX_OP_PARAMS) == 0; } return true; diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 2f9c350789..6f26e91e04 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -2225,6 +2225,19 @@ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend, // If no matching graph is found, add a new ACL graph. ggml_cann_graph * new_graph = ggml_cann_graph::create_from_cgraph(cgraph); cann_ctx->graph_lru_cache.push(new_graph); + + // Pre-load rope cache before graph capture. During capture the + // stream cannot perform host-to-device memcpy or device memory + // malloc/free. Running the full cache init now populates the + // cache metadata so these branches are skipped during capture, + // while also warming up the memory pool. 
+ for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor * node = cgraph->nodes[i]; + if (node->op == GGML_OP_ROPE) { + ggml_cann_rope_cache_preload(*cann_ctx, node); + break; + } + } } } #else diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt index 262f88204e..419862101d 100644 --- a/ggml/src/ggml-cuda/CMakeLists.txt +++ b/ggml/src/ggml-cuda/CMakeLists.txt @@ -116,12 +116,11 @@ if (CUDAToolkit_FOUND) list(APPEND GGML_SOURCES_CUDA ${SRCS}) add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS) else() - file(GLOB SRCS "template-instances/fattn-vec*q4_0-q4_0.cu") - list(APPEND GGML_SOURCES_CUDA ${SRCS}) - file(GLOB SRCS "template-instances/fattn-vec*q8_0-q8_0.cu") - list(APPEND GGML_SOURCES_CUDA ${SRCS}) - file(GLOB SRCS "template-instances/fattn-vec*f16-f16.cu") - list(APPEND GGML_SOURCES_CUDA ${SRCS}) + list(APPEND GGML_SOURCES_CUDA + template-instances/fattn-vec-instance-f16-f16.cu + template-instances/fattn-vec-instance-q4_0-q4_0.cu + template-instances/fattn-vec-instance-q8_0-q8_0.cu + template-instances/fattn-vec-instance-bf16-bf16.cu) endif() ggml_add_backend_library(ggml-cuda diff --git a/ggml/src/ggml-cuda/convert.cuh b/ggml/src/ggml-cuda/convert.cuh index 09f9a33f90..f5d37c7b99 100644 --- a/ggml/src/ggml-cuda/convert.cuh +++ b/ggml/src/ggml-cuda/convert.cuh @@ -41,6 +41,16 @@ template return __bfloat162float(x); } else if constexpr(std::is_same_v && std::is_same_v) { return __float22half2_rn(x); + } else if constexpr(std::is_same_v && std::is_same_v) { +#ifdef GGML_USE_HIP + return make_float2(__bfloat162float(__low2bfloat16(x)), __bfloat162float(__high2bfloat16(x))); +#else +#if __CUDA_ARCH__ >= 800 + return __bfloat1622float2(x); +#else + return make_float2(__bfloat162float(x.x), __bfloat162float(x.y)); +#endif // __CUDA_ARCH__ >= 800 +#endif // GGML_USE_HIP } else if constexpr(std::is_same_v && std::is_same_v) { // bypass compile error on cuda 12.0.1 #ifdef GGML_USE_HIP diff --git a/ggml/src/ggml-cuda/fattn-common.cuh 
b/ggml/src/ggml-cuda/fattn-common.cuh index e9abdf288c..c59a4db399 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -74,6 +74,37 @@ static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_f16( return sum; } +template +static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_bf16( + const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds_v) { + + const nv_bfloat162 * K_bf16 = (const nv_bfloat162 *) K_c; + GGML_UNUSED(Q_q8); + GGML_UNUSED(Q_ds_v); + + constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes(); + constexpr int cpy_ne = cpy_nb / 4; + + float sum = 0.0f; + +#pragma unroll + for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += nthreads*cpy_ne) { + __align__(16) nv_bfloat162 tmp[cpy_ne]; + ggml_cuda_memcpy_1(tmp, K_bf16 + k_KQ_0 + (threadIdx.x % nthreads)*cpy_ne); +#pragma unroll + for (int k_KQ_1 = 0; k_KQ_1 < cpy_ne; ++k_KQ_1) { +#ifdef V_DOT2_F32_F16_AVAILABLE + // FIXME replace macros in vector FA kernel with templating and use FP32 for BF16 + ggml_cuda_mad(sum, ggml_cuda_cast(tmp[k_KQ_1]), __half22float2(((const half2 *) Q_v)[k_KQ_0/nthreads + k_KQ_1])); +#else + ggml_cuda_mad(sum, ggml_cuda_cast(tmp[k_KQ_1]), ((const float2 *) Q_v)[k_KQ_0/nthreads + k_KQ_1]); +#endif // V_DOT2_F32_F16_AVAILABLE + } + } + + return sum; +} + template static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_q4_0( const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) { @@ -321,6 +352,19 @@ static __device__ __forceinline__ void dequantize_V_f16(const void * __restrict_ } } +template +static __device__ __forceinline__ void dequantize_V_bf16(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) { + static_assert(std::is_same_v, "BF16 V dequantization only supports float output"); + static_assert(ne % 2 == 0, "bad ne"); + __align__(16) nv_bfloat162 tmp[ne/2]; + 
ggml_cuda_memcpy_1(tmp, (const nv_bfloat16 *) vx + i0); + float2 * dst_f2 = (float2 *) dst; +#pragma unroll + for (int l = 0; l < ne/2; ++l) { + dst_f2[l] = ggml_cuda_cast(tmp[l]); + } +} + template static __device__ __forceinline__ void dequantize_V_q4_0(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) { const block_q4_0 * x = (const block_q4_0 *) vx; @@ -547,6 +591,8 @@ constexpr __device__ vec_dot_KQ_t get_vec_dot_KQ() { return vec_dot_fattn_vec_KQ_q5_1; } else if constexpr (type_K == GGML_TYPE_Q8_0) { return vec_dot_fattn_vec_KQ_q8_0; + } else if constexpr (type_K == GGML_TYPE_BF16) { + return vec_dot_fattn_vec_KQ_bf16; } else { static_assert(type_K == -1, "bad type"); return nullptr; @@ -567,6 +613,8 @@ constexpr __device__ dequantize_V_t get_dequantize_V() { return dequantize_V_q5_1; } else if constexpr (type_V == GGML_TYPE_Q8_0) { return dequantize_V_q8_0; + } else if constexpr (type_V == GGML_TYPE_BF16) { + return dequantize_V_bf16; } else { static_assert(type_V == -1, "bad type"); return nullptr; diff --git a/ggml/src/ggml-cuda/fattn-vec.cuh b/ggml/src/ggml-cuda/fattn-vec.cuh index 7cbe32633e..f0bd42a576 100644 --- a/ggml/src/ggml-cuda/fattn-vec.cuh +++ b/ggml/src/ggml-cuda/fattn-vec.cuh @@ -75,17 +75,17 @@ static __global__ void flash_attn_ext_vec( #endif // GGML_USE_HIP constexpr int nthreads = ggml_cuda_fattn_vec_get_nthreads_device(); - constexpr int nthreads_KQ = type_K == GGML_TYPE_F16 ? 128 / cpy_nb : nthreads_KQ_q; - constexpr int nthreads_V = type_V == GGML_TYPE_F16 ? 128 / cpy_nb : nthreads_V_q; + constexpr int nthreads_KQ = (type_K == GGML_TYPE_F16 || type_K == GGML_TYPE_BF16) ? 128 / cpy_nb : nthreads_KQ_q; + constexpr int nthreads_V = (type_V == GGML_TYPE_F16 || type_V == GGML_TYPE_BF16) ? 128 / cpy_nb : nthreads_V_q; static_assert(WARP_SIZE % nthreads_KQ == 0, "bad nthreads_K"); static_assert(WARP_SIZE % nthreads_V == 0, "bad nthreads_V"); - constexpr int V_rows_per_thread = type_V == GGML_TYPE_F16 ? 
2*cpy_ne : 4; + constexpr int V_rows_per_thread = (type_V == GGML_TYPE_F16 || type_V == GGML_TYPE_BF16) ? 2*cpy_ne : 4; constexpr int V_cols_per_iter = WARP_SIZE / nthreads_V; constexpr vec_dot_KQ_t vec_dot_KQ = get_vec_dot_KQ(); - constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16; + constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16 && type_K != GGML_TYPE_BF16; #ifdef V_DOT2_F32_F16_AVAILABLE constexpr dequantize_V_t dequantize_V = get_dequantize_V(); #else @@ -323,8 +323,18 @@ static __global__ void flash_attn_ext_vec( #pragma unroll for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) { half2 tmp[V_rows_per_thread/2]; - dequantize_V(V + k*nb21, tmp, - 2*i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*V_rows_per_thread); + if constexpr (type_V == GGML_TYPE_BF16) { + float2 tmp_f[V_rows_per_thread/2]; + dequantize_V(V + k*nb21, tmp_f, + 2*i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*V_rows_per_thread); +#pragma unroll + for (int i_VKQ_1 = 0; i_VKQ_1 < V_rows_per_thread/2; ++i_VKQ_1) { + tmp[i_VKQ_1] = __float22half2_rn(tmp_f[i_VKQ_1]); + } + } else { + dequantize_V(V + k*nb21, tmp, + 2*i_VKQ_0 + (nthreads_V == WARP_SIZE ? 
threadIdx.x : threadIdx.x % nthreads_V)*V_rows_per_thread); + } #pragma unroll for (int i_VKQ_1 = 0; i_VKQ_1 < V_rows_per_thread/2; ++i_VKQ_1) { #pragma unroll @@ -563,6 +573,7 @@ void ggml_cuda_flash_attn_ext_vec_case(ggml_backend_cuda_context & ctx, ggml_ten extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_Q5_0); \ extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_Q5_1); \ extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_Q8_0); \ + extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_BF16); \ EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_F16) EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q4_0) @@ -570,6 +581,7 @@ EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q4_1) EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q5_0) EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q5_1) EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q8_0) +EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_BF16) EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_F16) EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q4_0) @@ -577,6 +589,7 @@ EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q4_1) EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q5_0) EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q5_1) EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q8_0) +EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_BF16) EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_F16) EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q4_0) @@ -584,3 +597,4 @@ EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q4_1) EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q5_0) EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q5_1) EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q8_0) +EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_BF16) diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index 85c177f496..a25a890db6 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -224,6 +224,7 @@ static void ggml_cuda_flash_attn_ext_vec(ggml_backend_cuda_context & ctx, ggml_t FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_F16) FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_F16) FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, 
GGML_TYPE_F16) + FATTN_VEC_CASES_ALL_D(GGML_TYPE_BF16, GGML_TYPE_F16) FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_Q4_0) FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q4_0) @@ -231,6 +232,7 @@ static void ggml_cuda_flash_attn_ext_vec(ggml_backend_cuda_context & ctx, ggml_t FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_Q4_0) FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_Q4_0) FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q4_0) + FATTN_VEC_CASES_ALL_D(GGML_TYPE_BF16, GGML_TYPE_Q4_0) FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_Q4_1) FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q4_1) @@ -238,6 +240,7 @@ static void ggml_cuda_flash_attn_ext_vec(ggml_backend_cuda_context & ctx, ggml_t FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_Q4_1) FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_Q4_1) FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q4_1) + FATTN_VEC_CASES_ALL_D(GGML_TYPE_BF16, GGML_TYPE_Q4_1) FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_Q5_0) FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q5_0) @@ -245,6 +248,7 @@ static void ggml_cuda_flash_attn_ext_vec(ggml_backend_cuda_context & ctx, ggml_t FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_Q5_0) FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_Q5_0) FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q5_0) + FATTN_VEC_CASES_ALL_D(GGML_TYPE_BF16, GGML_TYPE_Q5_0) FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_Q5_1) FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q5_1) @@ -252,6 +256,7 @@ static void ggml_cuda_flash_attn_ext_vec(ggml_backend_cuda_context & ctx, ggml_t FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_Q5_1) FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_Q5_1) FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q5_1) + FATTN_VEC_CASES_ALL_D(GGML_TYPE_BF16, GGML_TYPE_Q5_1) FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_Q8_0) FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q8_0) @@ -259,10 +264,20 @@ static void ggml_cuda_flash_attn_ext_vec(ggml_backend_cuda_context & ctx, ggml_t 
FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_Q8_0) FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_Q8_0) FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q8_0) + FATTN_VEC_CASES_ALL_D(GGML_TYPE_BF16, GGML_TYPE_Q8_0) + + FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_BF16) + FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_BF16) + FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_1, GGML_TYPE_BF16) + FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_BF16) + FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_BF16) + FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_BF16) + FATTN_VEC_CASES_ALL_D(GGML_TYPE_BF16, GGML_TYPE_BF16) #else FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_F16) FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q4_0) FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q8_0) + FATTN_VEC_CASES_ALL_D(GGML_TYPE_BF16, GGML_TYPE_BF16) #endif // GGML_CUDA_FA_ALL_QUANTS GGML_ABORT("fatal error"); @@ -355,6 +370,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const #endif // GGML_CUDA_FA_ALL_QUANTS case GGML_TYPE_Q4_0: case GGML_TYPE_Q8_0: + case GGML_TYPE_BF16: break; default: return BEST_FATTN_KERNEL_NONE; diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index 632246e43f..024b3d8cf2 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -33,7 +33,7 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) } } -static constexpr __device__ int get_vdr_mmvq(ggml_type type) { +static constexpr __host__ __device__ int get_vdr_mmvq(ggml_type type) { switch (type) { case GGML_TYPE_Q4_0: return VDR_Q4_0_Q8_1_MMVQ; case GGML_TYPE_Q4_1: return VDR_Q4_1_Q8_1_MMVQ; @@ -173,11 +173,11 @@ static constexpr __host__ __device__ int calc_nwarps(ggml_type type, int ncols_d return 1; } -static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int table_id) { +static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int table_id, bool small_k = false, int nwarps = 1) 
{ if (table_id == MMVQ_PARAMETERS_GENERIC || table_id == MMVQ_PARAMETERS_GCN) { switch (ncols_dst) { case 1: - return 1; + return small_k ? nwarps : 1; case 2: case 3: case 4: @@ -193,7 +193,7 @@ static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int return 1; } -template +template __launch_bounds__(calc_nwarps(type, ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1) static __global__ void mul_mat_vec_q( const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, const ggml_cuda_mm_fusion_args_device fusion, float * __restrict__ dst, @@ -208,7 +208,7 @@ static __global__ void mul_mat_vec_q( constexpr int vdr = get_vdr_mmvq(type); constexpr mmvq_parameter_table_id table_id = get_device_table_id(); constexpr int nwarps = calc_nwarps(type, ncols_dst, table_id); - constexpr int rows_per_cuda_block = calc_rows_per_block(ncols_dst, table_id); + constexpr int rows_per_cuda_block = calc_rows_per_block(ncols_dst, table_id, small_k, nwarps); constexpr int warp_size = ggml_cuda_get_physical_warp_size(); constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type); @@ -414,14 +414,16 @@ static __global__ void mul_mat_vec_q( template static std::pair calc_launch_params( const int ncols_dst, const int nrows_x, const int nchannels_dst, const int nsamples_or_ntokens, - const int warp_size, const mmvq_parameter_table_id table_id) { - const int64_t nblocks = (nrows_x + calc_rows_per_block(ncols_dst, table_id) - 1) / calc_rows_per_block(ncols_dst, table_id); + const int warp_size, const mmvq_parameter_table_id table_id, const bool small_k = false) { + const int nwarps = calc_nwarps(type, ncols_dst, table_id); + const int rpb = calc_rows_per_block(ncols_dst, table_id, small_k, nwarps); + const int64_t nblocks = (nrows_x + rpb - 1) / rpb; const dim3 block_nums(nblocks, nchannels_dst, nsamples_or_ntokens); - const dim3 block_dims(warp_size, calc_nwarps(type, ncols_dst, table_id), 1); + const dim3 
block_dims(warp_size, nwarps, 1); return {block_nums, block_dims}; } -template +template static void mul_mat_vec_q_switch_fusion( const void * vx, const void * vy, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst, const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t stride_row_x, const uint32_t stride_col_y, @@ -434,7 +436,7 @@ static void mul_mat_vec_q_switch_fusion( const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr; if constexpr (c_ncols_dst == 1) { if (has_fusion) { - mul_mat_vec_q<<>> + mul_mat_vec_q<<>> (vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride); @@ -444,7 +446,7 @@ static void mul_mat_vec_q_switch_fusion( GGML_ASSERT(!has_fusion && "fusion only supported for ncols_dst=1"); - mul_mat_vec_q<<>> + mul_mat_vec_q<<>> (vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride); @@ -488,11 +490,33 @@ static void mul_mat_vec_q_switch_ncols_dst( switch (ncols_dst) { case 1: { constexpr int c_ncols_dst = 1; - std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); - mul_mat_vec_q_switch_fusion(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, - channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, - dims.first, dims.second, 0, ids_stride, stream); + + // When K is small, increase rows_per_block to match nwarps so each warp has more work to do + // Trigger when the full thread block covers all K blocks in a 
single loop iteration and few threads remain idle. + constexpr int qk = ggml_cuda_type_traits::qk; + constexpr int qi = ggml_cuda_type_traits::qi; + constexpr int vdr = get_vdr_mmvq(type); + const int blocks_per_row_x = ncols_x / qk; + const int blocks_per_iter_1warp = vdr * warp_size / qi; + const int nwarps = calc_nwarps(type, c_ncols_dst, table_id); + const bool use_small_k = nwarps > 1 && blocks_per_row_x < nwarps * blocks_per_iter_1warp; + if (use_small_k) { + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, + warp_size, table_id, true); + mul_mat_vec_q_switch_fusion( + vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, + dims.first, dims.second, 0, ids_stride, stream); + } else { + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, + warp_size, table_id); + mul_mat_vec_q_switch_fusion( + vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, + dims.first, dims.second, 0, ids_stride, stream); + } } break; case 2: { constexpr int c_ncols_dst = 2; diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu new file mode 100644 index 0000000000..3a2fa99b05 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu @@ -0,0 +1,7 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
+ +#include "../fattn-vec.cuh" + +DECL_FATTN_VEC_CASE( 64, GGML_TYPE_BF16, GGML_TYPE_BF16); +DECL_FATTN_VEC_CASE(128, GGML_TYPE_BF16, GGML_TYPE_BF16); +DECL_FATTN_VEC_CASE(256, GGML_TYPE_BF16, GGML_TYPE_BF16); diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu new file mode 100644 index 0000000000..60f0f6f795 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-f16.cu @@ -0,0 +1,7 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec.cuh" + +DECL_FATTN_VEC_CASE( 64, GGML_TYPE_BF16, GGML_TYPE_F16); +DECL_FATTN_VEC_CASE(128, GGML_TYPE_BF16, GGML_TYPE_F16); +DECL_FATTN_VEC_CASE(256, GGML_TYPE_BF16, GGML_TYPE_F16); diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu new file mode 100644 index 0000000000..489e05f08c --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_0.cu @@ -0,0 +1,7 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec.cuh" + +DECL_FATTN_VEC_CASE( 64, GGML_TYPE_BF16, GGML_TYPE_Q4_0); +DECL_FATTN_VEC_CASE(128, GGML_TYPE_BF16, GGML_TYPE_Q4_0); +DECL_FATTN_VEC_CASE(256, GGML_TYPE_BF16, GGML_TYPE_Q4_0); diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu new file mode 100644 index 0000000000..6fa3c26d30 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q4_1.cu @@ -0,0 +1,7 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
+ +#include "../fattn-vec.cuh" + +DECL_FATTN_VEC_CASE( 64, GGML_TYPE_BF16, GGML_TYPE_Q4_1); +DECL_FATTN_VEC_CASE(128, GGML_TYPE_BF16, GGML_TYPE_Q4_1); +DECL_FATTN_VEC_CASE(256, GGML_TYPE_BF16, GGML_TYPE_Q4_1); diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu new file mode 100644 index 0000000000..421027fb29 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_0.cu @@ -0,0 +1,7 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec.cuh" + +DECL_FATTN_VEC_CASE( 64, GGML_TYPE_BF16, GGML_TYPE_Q5_0); +DECL_FATTN_VEC_CASE(128, GGML_TYPE_BF16, GGML_TYPE_Q5_0); +DECL_FATTN_VEC_CASE(256, GGML_TYPE_BF16, GGML_TYPE_Q5_0); diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu new file mode 100644 index 0000000000..abbc943480 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q5_1.cu @@ -0,0 +1,7 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec.cuh" + +DECL_FATTN_VEC_CASE( 64, GGML_TYPE_BF16, GGML_TYPE_Q5_1); +DECL_FATTN_VEC_CASE(128, GGML_TYPE_BF16, GGML_TYPE_Q5_1); +DECL_FATTN_VEC_CASE(256, GGML_TYPE_BF16, GGML_TYPE_Q5_1); diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu new file mode 100644 index 0000000000..d641f859d8 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-bf16-q8_0.cu @@ -0,0 +1,7 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
+ +#include "../fattn-vec.cuh" + +DECL_FATTN_VEC_CASE( 64, GGML_TYPE_BF16, GGML_TYPE_Q8_0); +DECL_FATTN_VEC_CASE(128, GGML_TYPE_BF16, GGML_TYPE_Q8_0); +DECL_FATTN_VEC_CASE(256, GGML_TYPE_BF16, GGML_TYPE_Q8_0); diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu new file mode 100644 index 0000000000..d1071dc243 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-bf16.cu @@ -0,0 +1,7 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec.cuh" + +DECL_FATTN_VEC_CASE( 64, GGML_TYPE_F16, GGML_TYPE_BF16); +DECL_FATTN_VEC_CASE(128, GGML_TYPE_F16, GGML_TYPE_BF16); +DECL_FATTN_VEC_CASE(256, GGML_TYPE_F16, GGML_TYPE_BF16); diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu new file mode 100644 index 0000000000..8afda31423 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-bf16.cu @@ -0,0 +1,7 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec.cuh" + +DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q4_0, GGML_TYPE_BF16); +DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_BF16); +DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q4_0, GGML_TYPE_BF16); diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu new file mode 100644 index 0000000000..506864ac18 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-bf16.cu @@ -0,0 +1,7 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
+ +#include "../fattn-vec.cuh" + +DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q4_1, GGML_TYPE_BF16); +DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_BF16); +DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q4_1, GGML_TYPE_BF16); diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu new file mode 100644 index 0000000000..0bbda8371e --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-bf16.cu @@ -0,0 +1,7 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec.cuh" + +DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q5_0, GGML_TYPE_BF16); +DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_BF16); +DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q5_0, GGML_TYPE_BF16); diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu new file mode 100644 index 0000000000..79be24daf9 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-bf16.cu @@ -0,0 +1,7 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec.cuh" + +DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q5_1, GGML_TYPE_BF16); +DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_BF16); +DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q5_1, GGML_TYPE_BF16); diff --git a/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu new file mode 100644 index 0000000000..45636e5e70 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-bf16.cu @@ -0,0 +1,7 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
+ +#include "../fattn-vec.cuh" + +DECL_FATTN_VEC_CASE( 64, GGML_TYPE_Q8_0, GGML_TYPE_BF16); +DECL_FATTN_VEC_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_BF16); +DECL_FATTN_VEC_CASE(256, GGML_TYPE_Q8_0, GGML_TYPE_BF16); diff --git a/ggml/src/ggml-cuda/template-instances/generate_cu_files.py b/ggml/src/ggml-cuda/template-instances/generate_cu_files.py index e382df1ae2..3b5ab12fc4 100755 --- a/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +++ b/ggml/src/ggml-cuda/template-instances/generate_cu_files.py @@ -5,7 +5,7 @@ import os HEAD_SIZES_KQ = [40, 64, 72, 80, 96, 112, 128, 256, 576] -TYPES_KV = ["GGML_TYPE_F16", "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0"] +TYPES_KV = ["GGML_TYPE_F16", "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0", "GGML_TYPE_BF16"] SOURCE_FATTN_TILE = """// This file has been autogenerated by generate_cu_files.py, do not edit manually. diff --git a/ggml/src/ggml-hip/CMakeLists.txt b/ggml/src/ggml-hip/CMakeLists.txt index f96c6e09a9..291b483745 100644 --- a/ggml/src/ggml-hip/CMakeLists.txt +++ b/ggml/src/ggml-hip/CMakeLists.txt @@ -71,12 +71,11 @@ if (GGML_CUDA_FA_ALL_QUANTS) list(APPEND GGML_SOURCES_ROCM ${SRCS}) add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS) else() - file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu") - list(APPEND GGML_SOURCES_ROCM ${SRCS}) - file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu") - list(APPEND GGML_SOURCES_ROCM ${SRCS}) - file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*f16-f16.cu") - list(APPEND GGML_SOURCES_ROCM ${SRCS}) + list(APPEND GGML_SOURCES_ROCM + ../ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu + ../ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu + ../ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu + ../ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu) endif() ggml_add_backend_library(ggml-hip diff --git 
a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp index 72ad876d5e..9162342ee9 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.cpp +++ b/ggml/src/ggml-metal/ggml-metal-device.cpp @@ -1748,6 +1748,28 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_conv_2d(ggml_met return res; } +ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_conv_3d(ggml_metal_library_t lib, const ggml_tensor * op) { + assert(op->op == GGML_OP_CONV_3D); + + GGML_ASSERT(ggml_is_contiguous(op->src[0])); + GGML_ASSERT(op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32); + GGML_ASSERT(op->type == GGML_TYPE_F32); + + char base[256]; + char name[256]; + + snprintf(base, 256, "kernel_conv_3d_%s_%s", ggml_type_name(op->src[0]->type), ggml_type_name(op->src[1]->type)); + snprintf(name, 256, "%s", base); + + ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); + if (!res.pipeline) { + res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr); + } + + return res; +} + ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_upscale(ggml_metal_library_t lib, const ggml_tensor * op) { assert(op->op == GGML_OP_UPSCALE); diff --git a/ggml/src/ggml-metal/ggml-metal-device.h b/ggml/src/ggml-metal/ggml-metal-device.h index fd2b3ddeb5..de43f81931 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.h +++ b/ggml/src/ggml-metal/ggml-metal-device.h @@ -148,6 +148,7 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_im2col struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_conv_transpose_1d (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_conv_transpose_2d (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_conv_2d (ggml_metal_library_t 
lib, const struct ggml_tensor * op); +struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_conv_3d (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_upscale (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad_reflect_1d (ggml_metal_library_t lib, const struct ggml_tensor * op); diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index 82101f4714..14144aab08 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -1077,6 +1077,11 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te (op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32) && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; + case GGML_OP_CONV_3D: + return ggml_is_contiguous(op->src[0]) && + ggml_is_contiguous(op->src[1]) && + (op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32) && + op->src[1]->type == GGML_TYPE_F32; case GGML_OP_SUM: return has_simdgroup_reduction && ggml_is_contiguous(op->src[0]); case GGML_OP_TRI: diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h index 53437b23cd..ea471090cd 100644 --- a/ggml/src/ggml-metal/ggml-metal-impl.h +++ b/ggml/src/ggml-metal/ggml-metal-impl.h @@ -643,6 +643,42 @@ typedef struct { int32_t KHW; // KH * KW, pre-computed on CPU to save GPU resources } ggml_metal_kargs_im2col; +typedef struct { + int32_t IW; + int32_t IH; + int32_t ID; + int32_t OW; + int32_t OH; + int32_t OD; + int32_t KW; + int32_t KH; + int32_t KD; + int32_t s0; + int32_t s1; + int32_t s2; + int32_t p0; + int32_t p1; + int32_t p2; + int32_t d0; + int32_t d1; + int32_t d2; + 
int32_t IC; + int32_t N; + int32_t OC; + uint64_t nb00; + uint64_t nb01; + uint64_t nb02; + uint64_t nb03; + uint64_t nb10; + uint64_t nb11; + uint64_t nb12; + uint64_t nb13; + uint64_t nb0; + uint64_t nb1; + uint64_t nb2; + uint64_t nb3; +} ggml_metal_kargs_conv_3d; + typedef struct{ int32_t ne00; uint64_t nb01; diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index c0bcad392b..3cda21be43 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -394,6 +394,10 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) { { n_fuse = ggml_metal_op_conv_transpose_2d(ctx, idx); } break; + case GGML_OP_CONV_3D: + { + n_fuse = ggml_metal_op_conv_3d(ctx, idx); + } break; case GGML_OP_UPSCALE: { n_fuse = ggml_metal_op_upscale(ctx, idx); @@ -3697,6 +3701,77 @@ int ggml_metal_op_conv_2d(ggml_metal_op_t ctx, int idx) { return 1; } +int ggml_metal_op_conv_3d(ggml_metal_op_t ctx, int idx) { + ggml_tensor * op = ctx->node(idx); + + ggml_metal_library_t lib = ctx->lib; + ggml_metal_encoder_t enc = ctx->enc; + + // 1. Extract standard dimensions and byte strides + GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb); + GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb); + GGML_TENSOR_LOCALS(uint64_t, nb, op, nb); + + // 2. 
Extract hyperparams from op_params + const int32_t s0 = ((const int32_t *)(op->op_params))[0]; + const int32_t s1 = ((const int32_t *)(op->op_params))[1]; + const int32_t s2 = ((const int32_t *)(op->op_params))[2]; + const int32_t p0 = ((const int32_t *)(op->op_params))[3]; + const int32_t p1 = ((const int32_t *)(op->op_params))[4]; + const int32_t p2 = ((const int32_t *)(op->op_params))[5]; + const int32_t d0 = ((const int32_t *)(op->op_params))[6]; + const int32_t d1 = ((const int32_t *)(op->op_params))[7]; + const int32_t d2 = ((const int32_t *)(op->op_params))[8]; + const int32_t IC = ((const int32_t *)(op->op_params))[9]; + const int32_t N = ((const int32_t *)(op->op_params))[10]; + const int32_t OC = ((const int32_t *)(op->op_params))[11]; + + // 3. Build the parameter struct using the macro-generated variables + ggml_metal_kargs_conv_3d args = { + /*.IW =*/ (int32_t)op->src[1]->ne[0], + /*.IH =*/ (int32_t)op->src[1]->ne[1], + /*.ID =*/ (int32_t)op->src[1]->ne[2], + /*.OW =*/ (int32_t)op->ne[0], + /*.OH =*/ (int32_t)op->ne[1], + /*.OD =*/ (int32_t)op->ne[2], + /*.KW =*/ (int32_t)op->src[0]->ne[0], + /*.KH =*/ (int32_t)op->src[0]->ne[1], + /*.KD =*/ (int32_t)op->src[0]->ne[2], + s0, s1, s2, + p0, p1, p2, + d0, d1, d2, + IC, N, OC, + nb00, nb01, nb02, nb03, // Weight strides + nb10, nb11, nb12, nb13, // Input strides + nb0, nb1, nb2, nb3 // Output strides + }; + + // 4. Fetch the JIT pipeline + auto pipeline = ggml_metal_library_get_pipeline_conv_3d(lib, op); + + // 5. Grid mapping + int nth0 = 32; // Standard SIMD width for Apple Silicon + int nth1 = 1; + int nth2 = 1; + + int64_t spatial_volume = args.OW * args.OH * args.OD; + + int ntg0 = (spatial_volume + nth0 - 1) / nth0; + int ntg1 = args.OC; + int ntg2 = args.N; + + // 6. 
Bind and Dispatch via the ggml C wrapper + ggml_metal_encoder_set_pipeline(enc, pipeline); + ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3); + + ggml_metal_encoder_dispatch_threadgroups(enc, ntg0, ntg1, ntg2, nth0, nth1, nth2); + + return 1; +} + int ggml_metal_op_conv_transpose_1d(ggml_metal_op_t ctx, int idx) { ggml_tensor * op = ctx->node(idx); diff --git a/ggml/src/ggml-metal/ggml-metal-ops.h b/ggml/src/ggml-metal/ggml-metal-ops.h index 019f2fec9e..50e3c5c77a 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.h +++ b/ggml/src/ggml-metal/ggml-metal-ops.h @@ -75,6 +75,7 @@ int ggml_metal_op_norm (ggml_metal_op_t ctx, int idx); int ggml_metal_op_rope (ggml_metal_op_t ctx, int idx); int ggml_metal_op_im2col (ggml_metal_op_t ctx, int idx); int ggml_metal_op_conv_2d (ggml_metal_op_t ctx, int idx); +int ggml_metal_op_conv_3d (ggml_metal_op_t ctx, int idx); int ggml_metal_op_conv_transpose_1d (ggml_metal_op_t ctx, int idx); int ggml_metal_op_conv_transpose_2d (ggml_metal_op_t ctx, int idx); int ggml_metal_op_upscale (ggml_metal_op_t ctx, int idx); diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index b2328605dd..9c6b1c4f62 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -4883,6 +4883,98 @@ kernel void kernel_upscale_bilinear_f32( } } +template +kernel void kernel_conv_3d( + constant ggml_metal_kargs_conv_3d & args, + device const char * src0, // Weights [IC * OC, KD, KH, KW] + device const char * src1, // Inputs [IC * N, ID, IH, IW] + device char * dst, // Outputs [OC * N, OD, OH, OW] + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]]) { + + // 1. 
Un-flatten the spatial dimension from Grid X + int64_t spatial_idx = tgpig.x * 32 + tpitg.x; + + if (spatial_idx >= args.OW * args.OH * args.OD) { + return; // Thread falls outside the spatial volume + } + + int64_t od = spatial_idx / (args.OW * args.OH); + int64_t oh = (spatial_idx / args.OW) % args.OH; + int64_t ow = spatial_idx % args.OW; + + // 2. Map Y to Channels, Z to Batch + int64_t oc = tgpig.y; + int64_t batch_idx = tgpig.z; + + // 3. Calculate anchor coordinates in the Input volume + int64_t i_w_base = ow * args.s0 - args.p0; + int64_t i_h_base = oh * args.s1 - args.p1; + int64_t i_d_base = od * args.s2 - args.p2; + + float sum = 0.0f; + + // 4. Gather Loop (Iterate over Input Channels -> Depth -> Height -> Width) + for (int64_t ic = 0; ic < args.IC; ++ic) { + + // ggml packs batch and channel together in the 4th dimension + int64_t src_cn_idx = batch_idx * args.IC + ic; + int64_t w_cn_idx = oc * args.IC + ic; + + for (int64_t kz = 0; kz < args.KD; ++kz) { + int64_t id = i_d_base + kz * args.d2; + if (id < 0 || id >= args.ID) continue; // Boundary check (Padding) + + for (int64_t ky = 0; ky < args.KH; ++ky) { + int64_t ih = i_h_base + ky * args.d1; + if (ih < 0 || ih >= args.IH) continue; + + for (int64_t kx = 0; kx < args.KW; ++kx) { + int64_t iw = i_w_base + kx * args.d0; + if (iw < 0 || iw >= args.IW) continue; + + // Convert multi-dimensional coordinates to flat byte offsets + int64_t w_idx = kx*args.nb00 + ky*args.nb01 + kz*args.nb02 + w_cn_idx*args.nb03; + int64_t i_idx = iw*args.nb10 + ih*args.nb11 + id*args.nb12 + src_cn_idx*args.nb13; + + // Dereference memory and cast weights to f32 if they were f16 + float w_val = (float)*(device const T*)((device const char*)src0 + w_idx); + float i_val = *(device const float*)((device const char*)src1 + i_idx); + + sum += w_val * i_val; + } + } + } + } + + // 5. 
Write the accumulated value out to RAM + int64_t dst_cn_idx = batch_idx * args.OC + oc; + int64_t d_idx = ow*args.nb0 + oh*args.nb1 + od*args.nb2 + dst_cn_idx*args.nb3; + + *(device float*)(dst + d_idx) = sum; +} + +// Explicit instantiations so the JIT compiler can find them by name +template [[host_name("kernel_conv_3d_f32_f32")]] +kernel void kernel_conv_3d( + constant ggml_metal_kargs_conv_3d & args, + device const char * src0, + device const char * src1, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]]); + +// Explicit instantiation for f16 weights +template [[host_name("kernel_conv_3d_f16_f32")]] +kernel void kernel_conv_3d( + constant ggml_metal_kargs_conv_3d & args, + device const char * src0, + device const char * src1, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]]); + + static inline float bicubic_weight1(float x) { const float a = -0.75f; return ((a + 2) * x - (a + 3)) * x * x + 1; diff --git a/ggml/src/ggml-musa/CMakeLists.txt b/ggml/src/ggml-musa/CMakeLists.txt index d76cb51977..cc53c812ce 100644 --- a/ggml/src/ggml-musa/CMakeLists.txt +++ b/ggml/src/ggml-musa/CMakeLists.txt @@ -48,12 +48,11 @@ if (MUSAToolkit_FOUND) list(APPEND GGML_SOURCES_MUSA ${SRCS}) add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS) else() - file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu") - list(APPEND GGML_SOURCES_MUSA ${SRCS}) - file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu") - list(APPEND GGML_SOURCES_MUSA ${SRCS}) - file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*f16-f16.cu") - list(APPEND GGML_SOURCES_MUSA ${SRCS}) + list(APPEND GGML_SOURCES_MUSA + ../ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu + ../ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu + ../ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu + 
../ggml-cuda/template-instances/fattn-vec-instance-bf16-bf16.cu) endif() set_source_files_properties(${GGML_SOURCES_MUSA} PROPERTIES LANGUAGE CXX) diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt index 1f8250934b..ae667b12d1 100644 --- a/ggml/src/ggml-opencl/CMakeLists.txt +++ b/ggml/src/ggml-opencl/CMakeLists.txt @@ -89,6 +89,7 @@ set(GGML_OPENCL_KERNELS mul_mv_q4_1_f32 mul_mv_q4_1_f32_flat mul_mv_q4_k_f32 + mul_mv_q4_k_f32_flat mul_mv_q6_k_f32 mul_mv_q6_k_f32_flat mul_mv_q8_0_f32 @@ -107,6 +108,7 @@ set(GGML_OPENCL_KERNELS mul_mm_q4_0_f32_l4_lm mul_mm_q4_1_f32_l4_lm mul_mm_q8_0_f32_l4_lm + mul_mm_q4_k_f32_l4_lm mul_mm_q6_k_f32_l4_lm mul_mm_q8_0_f32_8x4 gemv_noshuffle_q4_1_f32 diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index e1dca6b4b4..c984e59b6b 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -534,11 +534,13 @@ struct ggml_backend_opencl_context { cl_kernel kernel_restore_block_q4_0_noshuffle; cl_kernel kernel_convert_block_q4_1_noshuffle; cl_kernel kernel_restore_block_q4_1_noshuffle; + cl_kernel kernel_convert_block_q4_K, kernel_restore_block_q4_K; cl_kernel kernel_convert_block_q6_K, kernel_restore_block_q6_K; cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat; cl_kernel kernel_mul_mv_q4_1_f32; cl_kernel kernel_mul_mv_q4_1_f32_flat; cl_kernel kernel_mul_mv_q4_K_f32; + cl_kernel kernel_mul_mv_q4_K_f32_flat; cl_kernel kernel_mul_mv_q6_K_f32; cl_kernel kernel_mul_mv_q6_K_f32_flat; cl_kernel kernel_mul_mv_mxfp4_f32, kernel_mul_mv_mxfp4_f32_flat; @@ -578,6 +580,7 @@ struct ggml_backend_opencl_context { cl_kernel kernel_mul_mm_q4_0_f32_l4_lm; cl_kernel kernel_mul_mm_q4_1_f32_l4_lm; cl_kernel kernel_mul_mm_q8_0_f32_l4_lm; + cl_kernel kernel_mul_mm_q4_k_f32_l4_lm; cl_kernel kernel_mul_mm_q6_k_f32_l4_lm; std::vector profiling_info; @@ -917,6 +920,8 @@ static void load_cl_kernels(ggml_backend_opencl_context 
*backend_ctx, ggml_cl_ve CL_CHECK((backend_ctx->kernel_convert_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q8_0", &err), err)); CL_CHECK((backend_ctx->kernel_restore_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0", &err), err)); CL_CHECK((backend_ctx->kernel_restore_block_q8_0_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0_trans", &err), err)); + CL_CHECK((backend_ctx->kernel_convert_block_q4_K = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_K", &err), err)); + CL_CHECK((backend_ctx->kernel_restore_block_q4_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_K", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K", &err), err)); CL_CHECK((backend_ctx->kernel_restore_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q6_K", &err), err)); GGML_LOG_CONT("."); @@ -1209,6 +1214,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve GGML_LOG_CONT("."); } + // mul_mv_q4_k_f32_flat + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "mul_mv_q4_k_f32_flat.cl.h" + }; +#else + const std::string kernel_src = read_file("mul_mv_q4_k_f32_flat.cl"); +#endif + cl_program prog = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + + CL_CHECK((backend_ctx->kernel_mul_mv_q4_K_f32_flat = clCreateKernel(prog, "kernel_mul_mv_q4_K_f32_flat", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + // mul_mv_q6_k_f32 { #ifdef GGML_OPENCL_EMBED_KERNELS @@ -1482,6 +1504,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve GGML_LOG_CONT("."); } + // mul_mm_q4_k_f32_l4_lm + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "mul_mm_q4_k_f32_l4_lm.cl.h" + }; 
+#else + const std::string kernel_src = read_file("mul_mm_q4_k_f32_l4_lm.cl"); +#endif + cl_program prog = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + + CL_CHECK((backend_ctx->kernel_mul_mm_q4_k_f32_l4_lm = clCreateKernel(prog, "kernel_mul_mm_q4_k_f32_l4_lm", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + // mul_mm_q6_k_f32_l4_lm { #ifdef GGML_OPENCL_EMBED_KERNELS @@ -3347,6 +3386,40 @@ struct ggml_tensor_extra_cl_q8_0 { } }; +struct ggml_tensor_extra_cl_q4_K { + // Quantized values + cl_mem q = nullptr; + // Scales for each super block. + cl_mem s = nullptr; + // Scales + cl_mem d = nullptr; + // Min + cl_mem dm = nullptr; + + ~ggml_tensor_extra_cl_q4_K() { + reset(); + } + + void reset() { + if (q != nullptr) { + CL_CHECK(clReleaseMemObject(q)); + q = nullptr; + } + if (s != nullptr) { + CL_CHECK(clReleaseMemObject(s)); + s = nullptr; + } + if (d != nullptr) { + CL_CHECK(clReleaseMemObject(d)); + d = nullptr; + } + if (dm != nullptr) { + CL_CHECK(clReleaseMemObject(dm)); + dm = nullptr; + } + } +}; + struct ggml_tensor_extra_cl_q6_K { // Lower 4 bits of quantized weights. 
cl_mem ql = nullptr; @@ -3956,6 +4029,12 @@ struct ggml_backend_opencl_buffer_context { for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) { delete e; } + for (ggml_tensor_extra_cl_q4_K * e : temp_tensor_extras_q4_K) { + delete e; + } + for (ggml_tensor_extra_cl_q4_K * e : temp_tensor_extras_q4_K_in_use) { + delete e; + } for (ggml_tensor_extra_cl_q6_K * e : temp_tensor_extras_q6_K) { delete e; } @@ -4039,6 +4118,21 @@ struct ggml_backend_opencl_buffer_context { return extra; } + ggml_tensor_extra_cl_q4_K * ggml_opencl_alloc_temp_tensor_extra_q4_K() { + ggml_tensor_extra_cl_q4_K * extra; + if (temp_tensor_extras_q4_K.empty()) { + extra = new ggml_tensor_extra_cl_q4_K(); + } else { + extra = temp_tensor_extras_q4_K.back(); + temp_tensor_extras_q4_K.pop_back(); + } + + temp_tensor_extras_q4_K_in_use.push_back(extra); + + extra->reset(); + return extra; + } + ggml_tensor_extra_cl_q6_K * ggml_opencl_alloc_temp_tensor_extra_q6_K() { ggml_tensor_extra_cl_q6_K * extra; if (temp_tensor_extras_q6_K.empty()) { @@ -4080,6 +4174,11 @@ struct ggml_backend_opencl_buffer_context { } temp_tensor_extras_q8_0_in_use.clear(); + for (ggml_tensor_extra_cl_q4_K * e : temp_tensor_extras_q4_K_in_use) { + temp_tensor_extras_q4_K.push_back(e); + } + temp_tensor_extras_q4_K_in_use.clear(); + for (ggml_tensor_extra_cl_q6_K * e : temp_tensor_extras_q6_K_in_use) { temp_tensor_extras_q6_K.push_back(e); } @@ -4101,6 +4200,8 @@ struct ggml_backend_opencl_buffer_context { std::vector temp_tensor_extras_mxfp4_in_use; std::vector temp_tensor_extras_q8_0; std::vector temp_tensor_extras_q8_0_in_use; + std::vector temp_tensor_extras_q4_K; + std::vector temp_tensor_extras_q4_K_in_use; std::vector temp_tensor_extras_q6_K; std::vector temp_tensor_extras_q6_K_in_use; @@ -4835,6 +4936,83 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, return; } + if (tensor->type == GGML_TYPE_Q4_K) { + ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl 
*)tensor->extra; + GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized"); + + // Allocate the new extra and create aliases from the original. + ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; + ggml_tensor_extra_cl_q4_K * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q4_K(); + + size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t); + size_t size_dm = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t); + size_t size_s = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*(3 * ggml_blck_size(tensor->type) / 64); + size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2; + GGML_ASSERT(size_d + size_dm + size_s + size_q == ggml_nbytes(tensor) && "Incorrect tensor size"); + + cl_int err; + cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, + ggml_nbytes(tensor), NULL, &err); + CL_CHECK(err); + CL_CHECK(clEnqueueWriteBuffer( + queue, data_device, CL_TRUE, 0, + ggml_nbytes(tensor), data, 0, NULL, NULL)); + + cl_buffer_region region; + + // Create subbuffer for d. + region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment); + region.size = size_d; + extra->d = clCreateSubBuffer( + extra_orig->data_device, CL_MEM_READ_WRITE, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); + CL_CHECK(err); + auto previous_origin = region.origin; + + // Create subbuffer for mins. + region.origin = align_to(previous_origin + size_d, backend_ctx->alignment); + region.size = size_dm; + extra->dm = clCreateSubBuffer( + extra_orig->data_device, CL_MEM_READ_WRITE, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); + CL_CHECK(err); + previous_origin = region.origin; + + // Create subbuffer for s. 
+ region.origin = align_to(previous_origin + size_dm, backend_ctx->alignment); + region.size = size_s; + extra->s = clCreateSubBuffer( + extra_orig->data_device, CL_MEM_READ_WRITE, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); + CL_CHECK(err); + previous_origin = region.origin; + + // Create subbuffer for quants. + region.origin = align_to(previous_origin + size_s, backend_ctx->alignment); + region.size = size_q; + extra->q = clCreateSubBuffer( + extra_orig->data_device, CL_MEM_READ_WRITE, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); + CL_CHECK(err); + + cl_kernel kernel = backend_ctx->kernel_convert_block_q4_K; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->s)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->dm)); + + size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1}; + size_t local_work_size[] = {64, 1, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clReleaseMemObject(data_device)); + + tensor->extra = extra; + return; + } if (tensor->type == GGML_TYPE_Q6_K) { ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra; GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized"); @@ -5245,6 +5423,34 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, CL_CHECK(clReleaseMemObject(data_device)); return; } + if (tensor->type == GGML_TYPE_Q4_K) { + ggml_tensor_extra_cl_q4_K * extra = (ggml_tensor_extra_cl_q4_K *)tensor->extra; + + cl_int err; + cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, + ggml_nbytes(tensor), NULL, &err); + CL_CHECK(err); + + cl_kernel kernel = 
backend_ctx->kernel_restore_block_q4_K; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->s)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->dm)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &data_device)); + + size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1}; + size_t local_work_size[] = {1, 1, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, + global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clEnqueueReadBuffer( + queue, data_device, CL_TRUE, offset, + size, data, 0, NULL, NULL)); + CL_CHECK(clReleaseMemObject(data_device)); + return; + } if (tensor->type == GGML_TYPE_Q6_K) { ggml_tensor_extra_cl_q6_K * extra = (ggml_tensor_extra_cl_q6_K *)tensor->extra; @@ -9357,6 +9563,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co ggml_tensor_extra_cl_q4_1 * extra0_q4_1 = (ggml_tensor_extra_cl_q4_1 *)src0->extra; ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra; ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra; + ggml_tensor_extra_cl_q4_K * extra0_q4_K = (ggml_tensor_extra_cl_q4_K *)src0->extra; ggml_tensor_extra_cl_q6_K * extra0_q6_K = (ggml_tensor_extra_cl_q6_K *)src0->extra; #endif @@ -10005,6 +10212,50 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); return; } + case GGML_TYPE_Q4_K: { + if (ne11 < 32) { + break; + } + if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) { + break; + } + + kernel = backend_ctx->kernel_mul_mm_q4_k_f32_l4_lm; + nth0 = 128; // calculated as (BM*BN)/(TM*TN) + + int batch_stride_a = ne00*ne01; + int batch_stride_b = 
ne10*ne11; + int batch_stride_d = ne0*ne1; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_K->q)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_K->s)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q4_K->d)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q4_K->dm)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra1->data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10)); // stride_a + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10)); // stride_b + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne01)); // stride_d + CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &batch_stride_a)); + CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &batch_stride_b)); + CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &batch_stride_d)); + CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &r2)); + CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &r3)); + + // 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed. 
+ size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13}; + size_t local_work_size[] = {(size_t)nth0, 1, 1}; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + return; + } case GGML_TYPE_Q6_K: { if (ne11 < 32) { break; @@ -10449,6 +10700,43 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co case GGML_TYPE_Q2_K: case GGML_TYPE_Q3_K: case GGML_TYPE_Q4_K: { +#ifdef GGML_OPENCL_SOA_Q + kernel = backend_ctx->kernel_mul_mv_q4_K_f32_flat; + + if (backend_ctx->gpu_family == INTEL) { + nth0 = 16; + nth1 = 1; + ndst = 4; + } else if (backend_ctx->gpu_family == ADRENO) { + nth0 = 64; + nth1 = 2; + ndst = 16; + } else { + GGML_ASSERT(false && "TODO: Unknown GPU"); + } + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_K->q)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_K->s)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q4_K->d)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q4_K->dm)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra1->data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb03)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne12)); + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11)); + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12)); + CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb13)); + CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne0)); + 
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne1)); + CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &r2)); + CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &r3)); +#else kernel = backend_ctx->kernel_mul_mv_q4_K_f32; if (backend_ctx->gpu_family == INTEL) { @@ -10482,6 +10770,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne1)); CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2)); CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3)); +#endif // GGML_OPENCL_SOA_Q break; } case GGML_TYPE_Q5_K: diff --git a/ggml/src/ggml-opencl/kernels/cvt.cl b/ggml/src/ggml-opencl/kernels/cvt.cl index 78ef9c177f..272d0ea23f 100644 --- a/ggml/src/ggml-opencl/kernels/cvt.cl +++ b/ggml/src/ggml-opencl/kernels/cvt.cl @@ -28,6 +28,7 @@ #define QK8_0 32 #define QR8_0 1 #define QK_K 256 +#define K_SCALE_SIZE (3 * QK_K / 64) #define K_QUANTS_PER_ITERATION 2 typedef char int8_t; @@ -55,6 +56,16 @@ struct block_q4_1 { uchar qs[QK4_1 / 2]; // nibbles / quants }; +//------------------------------------------------------------------------------ +// block_q4_k +//------------------------------------------------------------------------------ +struct block_q4_K { + half d; // delta + half dm; // min + uchar s[K_SCALE_SIZE]; + uchar q[QK_K / 2]; // nibbles / quants +}; + //------------------------------------------------------------------------------ // block_q6_K //------------------------------------------------------------------------------ @@ -408,6 +419,62 @@ kernel void kernel_restore_block_q8_0_trans( } } +//------------------------------------------------------------------------------ +// kernel_convert_block_q4_K +// Convert the block_q4_K format to 4 separate arrays (AOS -> SOA). +// This kernel does not deshuffle the bits. +// Each thread processes a super block. 
+//------------------------------------------------------------------------------ +kernel void kernel_convert_block_q4_K( + global struct block_q4_K * src0, + global uchar * dst_q, + global uchar * dst_s, + global half * dst_d, + global half * dst_dm +) { + global struct block_q4_K * b = (global struct block_q4_K *) src0 + get_global_id(0); + global uchar * q = (global uchar *) dst_q + QK_K/2*get_global_id(0); + global uchar * s = (global uchar *) dst_s + K_SCALE_SIZE*get_global_id(0); + global half * d = (global half *) dst_d + get_global_id(0); + global half * dm = (global half *) dst_dm + get_global_id(0); + + *d = b->d; + *dm = b->dm; + + for (int i = 0; i < QK_K/2; ++i) { + q[i] = b->q[i]; + } + for (int i = 0; i < K_SCALE_SIZE; ++i) { + s[i] = b->s[i]; + } +} + +// Restore block_q4_K from flattened arrays. +// Each thread processes a super block. +kernel void kernel_restore_block_q4_K( + global uchar * src_q, + global uchar * src_s, + global half * src_d, + global half * src_dm, + global struct block_q4_K * dst +) { + global struct block_q4_K * b = (global struct block_q4_K *) dst + get_global_id(0); + global uchar * q = (global uchar *) src_q + QK_K/2*get_global_id(0); + global uchar * s = (global uchar *) src_s + K_SCALE_SIZE*get_global_id(0); + global half * d = (global half *) src_d + get_global_id(0); + global half * dm = (global half *) src_dm + get_global_id(0); + + b->d = *d; + b->dm = *dm; + + for (int i = 0; i < QK_K/2; ++i) { + b->q[i] = q[i]; + } + for (int i = 0; i < K_SCALE_SIZE; ++i) { + b->s[i] = s[i]; + } +} + //------------------------------------------------------------------------------ // kernel_convert_block_q6_K // Convert the block_q6_K format to 3 separate arrays (AOS -> SOA). 
diff --git a/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl b/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl new file mode 100644 index 0000000000..2235b1ae83 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/mul_mm_q4_k_f32_l4_lm.cl @@ -0,0 +1,179 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +#define LOAD_VEC_A 4 +#define LOAD_VEC_B 4 + +#define BM 64 +#define BN 64 +#define BK 32 +#define TM 4 +#define TN 8 + +kernel void kernel_mul_mm_q4_k_f32_l4_lm( + global uchar4 * src0_q, + global uchar * src0_s, + global half * src0_d, + global half * src0_dm, + global float4 * src1, + ulong offset1, + global float * dst, + ulong offsetd, + + int ne00, + int ne01, + int ne02, + int ne11, + int ne12, + + int stride_a, + int stride_b, + int stride_d, + + int batch_stride_a, + int batch_stride_b, + int batch_stride_d, + + int r2, + int r3 +) { + src1 = (global float4*)((global char*)src1 + offset1); + dst = (global float *)((global char*)dst + offsetd); + + local float buf_a[BM * BK]; + local float buf_b[BN * BK]; + + const int batch_idx = get_global_id(2); + + const int i13 = batch_idx / ne12; + const int i12 = batch_idx % ne12; + + const int i03 = i13 / r3; + const int i02 = i12 / r2; + + const int batch_idx_a = i03 * ne02 + i02; + + const int ir = get_group_id(0); + const int ic = get_group_id(1); + + const int tid = get_local_id(0); + const int th_r = tid % (BM / TM); + const int th_c = tid / (BM / TM); + + const int loadr_a = get_local_id(0) % (BK / LOAD_VEC_A); + const int loadc_a = get_local_id(0) / (BK / LOAD_VEC_A); + const int loadr_b = get_local_id(0) % (BK / LOAD_VEC_B); + const int loadc_b = get_local_id(0) / (BK / LOAD_VEC_B); + + const int loadstride_a = get_local_size(0) * LOAD_VEC_A / BK; + const int loadstride_b = get_local_size(0) * LOAD_VEC_B / BK; + + int pos_a = (batch_idx_a * batch_stride_a + ir * BM * stride_a) / LOAD_VEC_A; + int pos_b = (batch_idx * batch_stride_b + ic * BN * stride_b) / LOAD_VEC_B; + + float sums[TM * TN]; + float 
cache_a[TM];
+    float cache_b[TN];
+
+    for (int i = 0; i < TM * TN; i++) {
+        sums[i] = 0.0f;
+    }
+
+    for (int block = 0; block < ne00; block += BK) {
+        for (int l = 0; l < BM; l += loadstride_a) {
+            if (ir*BM + loadc_a + l < ne01) {
+                int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a;
+                int ib = idx / 64;
+                int iqs = (idx % 64) * 2;
+
+                int n = iqs / 32;
+                int b = (iqs % 32) / 16;
+                int is = 2 * n + b;
+                int qsi = n * 32 + (iqs % 16) * 2;
+
+                global uchar * scales = src0_s + ib * 12;
+
+                int scidx0 = (is < 4) ? is : (is + 4);
+                int scidx1 = (is < 4) ? is : (is - 4);
+                int scidxmask1 = (is < 4) ? 0x30 : 0xC0;
+                int scidxshift1 = (is < 4) ? 0 : 2;
+                int mbidx0 = is + 4;
+                int mbidx1 = (is < 4) ? is + 4 : is;
+                int mbidxmask0 = (is < 4) ? 0xF : 0xF0;
+                int mbidxshift0 = (is < 4) ? 0 : 4;
+                int mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
+                int mbidxshift1 = (is < 4) ? 0 : 2;
+
+                uchar sc = (scales[scidx0] & 0xF) | ((scales[scidx1] & scidxmask1) >> scidxshift1);
+                uchar mbyte = ((scales[mbidx0] & mbidxmask0) >> mbidxshift0) | ((scales[mbidx1] & mbidxmask1) >> mbidxshift1);
+
+                float d = (float)src0_d[ib] * (float)sc;
+                float m = -(float)src0_dm[ib] * (float)mbyte;
+
+                global uchar4 * qs = src0_q + ib*32 + (qsi >> 2);
+                uchar4 q = *qs;
+                float4 v1 = (convert_float4((uchar4)((q.s0 >> (b * 4))&0x0F, (q.s1 >> (b * 4))&0x0F, (q.s2 >> (b * 4))&0x0F, (q.s3 >> (b * 4))&0x0F)))*d + m;
+
+                buf_a[(loadr_a * LOAD_VEC_A + 0) * BM + loadc_a + l] = v1.s0;
+                buf_a[(loadr_a * LOAD_VEC_A + 1) * BM + loadc_a + l] = v1.s1;
+                buf_a[(loadr_a * LOAD_VEC_A + 2) * BM + loadc_a + l] = v1.s2;
+                buf_a[(loadr_a * LOAD_VEC_A + 3) * BM + loadc_a + l] = v1.s3;
+            } else {
+                buf_a[(loadr_a * LOAD_VEC_A + 0) * BM + loadc_a + l] = 0.0f;
+                buf_a[(loadr_a * LOAD_VEC_A + 1) * BM + loadc_a + l] = 0.0f;
+                buf_a[(loadr_a * LOAD_VEC_A + 2) * BM + loadc_a + l] = 0.0f;
+                buf_a[(loadr_a * LOAD_VEC_A + 3) * BM + loadc_a + l] = 0.0f;
+            }
+        }
+
+        for (int l = 0; l < BN; l += loadstride_b) {
+            if (ic*BN + loadc_b + l < 
ne11) { + int idx = pos_b + (loadc_b + l) * stride_b / LOAD_VEC_B + loadr_b; + buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = src1[idx].s0; + buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = src1[idx].s1; + buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = src1[idx].s2; + buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = src1[idx].s3; + } else { + buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = 0.0f; + buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = 0.0f; + buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = 0.0f; + buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = 0.0f; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + pos_a += BK / LOAD_VEC_A; + pos_b += BK / LOAD_VEC_B; + + for (int i = 0; i < BK; i++) { + for (int j = 0; j < TM; j++) { + cache_a[j] = buf_a[(i) * BM + th_r * TM + j]; + } + + for (int j = 0; j < TN; j++) { + cache_b[j] = buf_b[(i) * BN + th_c * TN + j]; + } + + for (int cc = 0; cc < TN; cc++) { + for (int cr = 0; cr < TM; cr++) { + const int sums_idx = cc*TM + cr; + sums[sums_idx] = mad(cache_a[cr], cache_b[cc], sums[sums_idx]); + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + const int dr = ir * BM + th_r * TM; + const int dc = ic * BN + th_c * TN; + + const int offsets = batch_idx * batch_stride_d; + + for (int cc = 0; cc < TN; cc++) { + for (int cr = 0; cr < TM; cr++) { + if (dr + cr < ne01 && dc + cc < ne11) { + dst[offsets + (dc + cc) * stride_d + dr + cr] = sums[cc * TM + cr]; + } + } + } +} diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl b/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl new file mode 100644 index 0000000000..d92fb96890 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32_flat.cl @@ -0,0 +1,196 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +#ifdef cl_intel_subgroups +#pragma OPENCL EXTENSION cl_intel_subgroups : enable +#else +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#endif + +#ifdef cl_intel_required_subgroup_size +#pragma 
OPENCL EXTENSION cl_intel_required_subgroup_size : enable +#define INTEL_GPU 1 +#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16))) +#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32))) +#elif defined(cl_qcom_reqd_sub_group_size) +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable +#define ADRENO_GPU 1 +#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half"))) +#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full"))) +#endif + +//------------------------------------------------------------------------------ +// block_q4_K +//------------------------------------------------------------------------------ +#define QK_K 256 +#define BLOCK_Q4K_SIZE 144 +#define K_SCALE_SIZE 12 + +// 8 blocks of 32 elements each +// weight is represented as x = a * q + b +typedef struct { + half d; // super-block scale for quantized scales + half dmin; // super-block scale for quantized mins + + uchar scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uchar qs[QK_K/2]; // 4-bit quants +} block_q4_K; + +#undef N_DST +#undef N_SIMDGROUP +#undef N_SIMDWIDTH + +#ifdef INTEL_GPU +#define N_DST 4 // number of rows each SIMD group works on +#define N_SIMDGROUP 1 // number of SIMD groups in a thread group +#define N_SIMDWIDTH 16 // SIMD group size +#elif defined (ADRENO_GPU) +#define N_DST 16 +#define N_SIMDGROUP 2 +#define N_SIMDWIDTH 64 +#endif + +#undef BLOCK_STRIDE +// number of (super) blocks each subgroup processes +// each thread in a subgroup processes a block (32 weights) +#define BLOCK_STRIDE (N_SIMDWIDTH/8) + +#ifdef INTEL_GPU +REQD_SUBGROUP_SIZE_16 +#elif defined (ADRENO_GPU) +REQD_SUBGROUP_SIZE_64 +#endif +kernel void kernel_mul_mv_q4_K_f32_flat( + global uchar * src0_q, + global uchar * src0_s, + global half * src0_d, + global half * src0_dm, + global char * src1, + int offset1, + global char * dst, + int offsetd, + int ne00, + int ne01, + ulong nb01, + ulong 
nb02, + ulong nb03, + int ne12, + ulong nb11, + ulong nb12, + ulong nb13, + int ne0, + int ne1, + int r2, + int r3 +) { + src1 = src1 + offset1; + dst = dst + offsetd; + + ushort kmask1 = 0x3f3f; + ushort kmask2 = 0x0f0f; + ushort kmask3 = 0xc0c0; + + int ix = get_sub_group_local_id()/8; + int it = get_sub_group_local_id()%8; + int iq = it/4; + int ir = it%4; + + int nb = ne00/QK_K; + + int r0 = get_group_id(0); + int r1 = get_group_id(1); + int im = get_group_id(2); + int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST; + + int i12 = im%ne12; + int i13 = im/ne12; + + int offset_src0 = (first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03)/BLOCK_Q4K_SIZE; + uint blk = nb01 / BLOCK_Q4K_SIZE; + global uchar * blk_q = (global uchar *)src0_q + offset_src0*(QK_K/2); + global uchar * blk_s = (global uchar *)src0_s + offset_src0*K_SCALE_SIZE; + global half * blk_d = (global half *)src0_d + offset_src0; + global half * blk_dm = (global half *)src0_dm + offset_src0; + + int offset_src1 = r1*nb11 + (i12)*nb12 + (i13)*nb13; + global float * y = (global float *)(src1 + offset_src1); + + float yl[16]; + float yh[16]; + float sumf[N_DST] = {0.f}; + float all_sum; + + global float * y4 = y + ix * QK_K + 64 * iq + 8 * ir; + + ushort sc16[4]; + uchar * sc8 = (uchar *)sc16; + + for (int ib = ix; ib < nb; ib += BLOCK_STRIDE) { + float4 sumy = {0.f, 0.f, 0.f, 0.f}; + for (int i = 0; i < 8; ++i) { + yl[i+0] = y4[i+0]; + sumy.s0 += yl[i+0]; + + yl[i+8] = y4[i+32]; + sumy.s1 += yl[i+8]; + + yh[i+0] = y4[i+128]; + sumy.s2 += yh[i+0]; + + yh[i+8] = y4[i+160]; + sumy.s3 += yh[i+8]; + } + + global ushort * q1 = (global ushort *)(blk_q + ib * (QK_K/2)) + (16 * iq + 4 * ir); + global ushort * sc = (global ushort *)(blk_s + ib * K_SCALE_SIZE) + iq; + global half * d = blk_d + ib; + global half * dm = blk_dm + ib; + + for (int row = 0; row < N_DST; row++) { + sc16[0] = sc[0] & kmask1; + sc16[1] = sc[2] & kmask1; + sc16[2] = ((sc[4] >> 0) & kmask2) | ((sc[0] & kmask3) >> 2); + sc16[3] = 
((sc[4] >> 4) & kmask2) | ((sc[2] & kmask3) >> 2); + + global ushort * q2 = q1 + 32; + + float4 acc1 = {0.f, 0.f, 0.f, 0.f}; + float4 acc2 = {0.f, 0.f, 0.f, 0.f}; + for (int i = 0; i < 8; i += 2) { + acc1.s0 += yl[i+0] * (q1[i/2] & 0x000F); + acc1.s1 += yl[i+1] * (q1[i/2] & 0x0F00); + acc1.s2 += yl[i+8] * (q1[i/2] & 0x00F0); + acc1.s3 += yl[i+9] * (q1[i/2] & 0xF000); + acc2.s0 += yh[i+0] * (q2[i/2] & 0x000F); + acc2.s1 += yh[i+1] * (q2[i/2] & 0x0F00); + acc2.s2 += yh[i+8] * (q2[i/2] & 0x00F0); + acc2.s3 += yh[i+9] * (q2[i/2] & 0xF000); + } + + float dall = *d; + float dmin = *dm; + sumf[row] += dall * ((acc1.s0 + 1.f/256.f * acc1.s1) * sc8[0] + + (acc1.s2 + 1.f/256.f * acc1.s3) * sc8[1] * 1.f/16.f + + (acc2.s0 + 1.f/256.f * acc2.s1) * sc8[4] + + (acc2.s2 + 1.f/256.f * acc2.s3) * sc8[5] * 1.f/16.f) - + dmin * (sumy.s0 * sc8[2] + sumy.s1 * sc8[3] + sumy.s2 * sc8[6] + sumy.s3 * sc8[7]); + + q1 += blk*64; + sc += blk*6; + d += blk; + dm += blk; + } + + y4 += BLOCK_STRIDE * QK_K; + } + + global float * dst_f32 = (global float *) dst + im*ne0*ne1 + r1*ne0; + + for (int row = 0; row < N_DST; ++row) { + all_sum = sub_group_reduce_add(sumf[row]); + if (first_row + row < ne01) { + if (get_sub_group_local_id() == 0) { + dst_f32[first_row + row] = all_sum; + } + } + } +} diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 0031cb7369..b3058b4af7 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -97,6 +97,8 @@ struct ggml_backend_openvino_buffer_context { ov_buffer = std::make_shared(std::move(usm_tensor)); } else { data = ggml_aligned_malloc(size); + GGML_ASSERT(data); + memset(data, 0, size); ov_buffer = std::make_shared(ov::element::u8, ov::Shape{size}, data); } diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp index d7c8ad8c16..5d8defad20 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp @@ -1162,12 +1162,18 @@ 
ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rp return nullptr; } + // Fix: Prevent division by zero if blck_size is 0 (e.g., deprecated types) + if (ggml_blck_size((enum ggml_type)tensor->type) == 0) { + GGML_LOG_ERROR("[%s] invalid tensor type received (blck_size is 0): %u\n", __func__, tensor->type); + return nullptr; + } + ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); // ggml_new_tensor_4d might fail if dimensions are invalid, although less likely to crash than invalid type if (result == nullptr) { - GGML_LOG_ERROR("[%s] ggml_new_tensor_4d failed for type %u\\n", __func__, tensor->type); + GGML_LOG_ERROR("[%s] ggml_new_tensor_4d failed for type %u\n", __func__, tensor->type); return nullptr; } diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 2ec1421841..456b1699fa 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4667,22 +4667,8 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g if (a->ne[3] != b->ne[3]) { return false; } - ggml_type a_type = a->type; - if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ4_XS || - a_type == GGML_TYPE_IQ3_XXS || a_type == GGML_TYPE_IQ3_S || - a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ2_S || - a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ1_M - ) { - if (b->ne[1] == 1 && ggml_nrows(b) > 1) { - return false; - } - } + ggml_type src0_type = op->src[0]->type; - if (src0_type == GGML_TYPE_BF16 ) { - // TODO: support GGML_TYPE_BF16 - // FIXME: keep a list of supported types to avoid breaking the backend when a new type is added - return false; - } // TODO: The configuration below needs more work to be supported with oneDNN if (ggml_is_permuted(a) && !ggml_is_contiguous(a) && diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp 
b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 566958b3a9..221e6fa04e 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -16048,6 +16048,7 @@ static uint32_t ggml_vk_intel_shader_core_count(const vk::PhysicalDevice& vkdev) case 0xE20C: // B570 return 18; case 0xE20B: // B580 + case 0xE211: // Pro B60 return 20; default: return 0; diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 0a032e9039..9383644abf 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -301,6 +301,8 @@ class Keys: IMAGE_SIZE = "clip.vision.image_size" IMAGE_MIN_PIXELS = "clip.vision.image_min_pixels" IMAGE_MAX_PIXELS = "clip.vision.image_max_pixels" + PREPROC_MIN_TILES = "clip.vision.preproc_min_tiles" + PREPROC_MAX_TILES = "clip.vision.preproc_max_tiles" PREPROC_IMAGE_SIZE = "clip.vision.preproc_image_size" PATCH_SIZE = "clip.vision.patch_size" EMBEDDING_LENGTH = "clip.vision.embedding_length" @@ -3869,6 +3871,8 @@ class LlamaFileType(IntEnum): # MOSTLY_Q4_0_8_8 = 35 # removed from gguf files, use Q4_0 and runtime repack MOSTLY_TQ1_0 = 36 # except 1d tensors MOSTLY_TQ2_0 = 37 # except 1d tensors + MOSTLY_MXFP4_MOE = 38 # except 1d tensors + MOSTLY_NVFP4 = 39 # except 1d tensors GUESSED = 1024 # not specified in the model file diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 57f9fd1a52..010dfeea1c 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -1156,6 +1156,12 @@ class GGUFWriter: def add_vision_min_pixels(self, value: int) -> None: self.add_uint32(Keys.ClipVision.IMAGE_MIN_PIXELS, value) + def add_vision_preproc_max_tiles(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.PREPROC_MAX_TILES, value) + + def add_vision_preproc_min_tiles(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.PREPROC_MIN_TILES, value) + def add_vision_preproc_image_size(self, value: int) -> None: self.add_uint32(Keys.ClipVision.PREPROC_IMAGE_SIZE, value) @@ 
-1300,7 +1306,7 @@ class GGUFWriter: else: raise ValueError("Invalid GGUF metadata value type or value") - return kv_data + return bytes(kv_data) @staticmethod def format_n_bytes_to_str(num: int) -> str: diff --git a/gguf-py/gguf/lazy.py b/gguf-py/gguf/lazy.py index c126f09c50..acbc79258a 100644 --- a/gguf-py/gguf/lazy.py +++ b/gguf-py/gguf/lazy.py @@ -138,7 +138,7 @@ class LazyBase(ABC, metaclass=LazyMeta): if isinstance(meta_noop, tuple): dtype, shape = meta_noop assert callable(shape) - res = cls.meta_with_dtype_and_shape(dtype, shape(res.shape)) + res = cls.meta_with_dtype_and_shape(dtype, shape(res.shape)) # ty: ignore[call-top-callable] else: res = cls.meta_with_dtype_and_shape(meta_noop, res.shape) diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py index 1cd519981a..1d9d9ab7d7 100644 --- a/gguf-py/gguf/quants.py +++ b/gguf-py/gguf/quants.py @@ -91,11 +91,11 @@ class __Quant(ABC): def __init_subclass__(cls, qtype: GGMLQuantizationType) -> None: cls.qtype = qtype cls.block_size, cls.type_size = GGML_QUANT_SIZES[qtype] - cls.__quantize_lazy = LazyNumpyTensor._wrap_fn( + cls.__quantize_lazy: Any = LazyNumpyTensor._wrap_fn( cls.__quantize_array, meta_noop=(np.uint8, cls.__shape_to_bytes) ) - cls.__dequantize_lazy = LazyNumpyTensor._wrap_fn( + cls.__dequantize_lazy: Any = LazyNumpyTensor._wrap_fn( cls.__dequantize_array, meta_noop=(np.float32, cls.__shape_from_bytes) ) diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index 028e5748e4..e4ab5e1e4b 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -11,33 +11,33 @@ from typing import Any, Callable, Sequence, Mapping, Iterable, Protocol, ClassVa try: from sentencepiece import SentencePieceProcessor except ImportError: - SentencePieceProcessor = None + SentencePieceProcessor: Any = None try: - from mistral_common.tokens.tokenizers.mistral import MistralTokenizer # pyright: ignore[reportMissingImports] - from mistral_common.tokens.tokenizers.tekken import Tekkenizer # pyright: 
ignore[reportMissingImports] - from mistral_common.tokens.tokenizers.utils import ( # pyright: ignore[reportMissingImports] + from mistral_common.tokens.tokenizers.mistral import MistralTokenizer # type: ignore[import-not-found] + from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found] + from mistral_common.tokens.tokenizers.utils import ( # type: ignore[import-not-found] _filter_valid_tokenizer_files, ) - from mistral_common.tokens.tokenizers.sentencepiece import ( # pyright: ignore[reportMissingImports] + from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found] SentencePieceTokenizer, ) except ImportError: _mistral_common_installed = False - MistralTokenizer = None - Tekkenizer = None - SentencePieceTokenizer = None - _filter_valid_tokenizer_files = None + MistralTokenizer: Any = None + Tekkenizer: Any = None + SentencePieceTokenizer: Any = None + _filter_valid_tokenizer_files: Any = None else: _mistral_common_installed = True try: - from mistral_common.tokens.tokenizers.utils import ( # pyright: ignore[reportMissingImports] + from mistral_common.tokens.tokenizers.utils import ( # type: ignore[import-not-found] get_one_valid_tokenizer_file, ) except ImportError: # We still want the conversion to work with older mistral-common versions. - get_one_valid_tokenizer_file = None + get_one_valid_tokenizer_file: Any = None import gguf @@ -703,7 +703,7 @@ class MistralVocab(Vocab): tokenizer_file_path = base_path / tokenizer_file - self.tokenizer = MistralTokenizer.from_file( + self.tokenizer: Any = MistralTokenizer.from_file( tokenizer_file_path ).instruct_tokenizer.tokenizer self.tokenizer_type = ( diff --git a/models/templates/HuggingFaceTB-SmolLM3-3B.jinja b/models/templates/HuggingFaceTB-SmolLM3-3B.jinja new file mode 100644 index 0000000000..b605d93439 --- /dev/null +++ b/models/templates/HuggingFaceTB-SmolLM3-3B.jinja @@ -0,0 +1,61 @@ +{#- Copyright 2025-present the Unsloth team. 
All rights reserved. #} +{#- Licensed under the Apache License, Version 2.0 (the "License") #} +{#- Edits made by Unsloth to make it work for most inference engines #} +{# ───── defaults ───── #} +{%- if enable_thinking is not defined -%} + {%- set enable_thinking = true -%} +{%- endif -%} +{# ───── reasoning mode ───── #} +{%- if enable_thinking -%} + {%- set reasoning_mode = "/think" -%} +{%- else -%} + {%- set reasoning_mode = "/no_think" -%} +{%- endif -%} +{# ───── header (system message) ───── #} +{{- "<|im_start|>system\n" -}} +{%- if messages[0].role == "system" -%} + {%- set system_message = messages[0].content -%} + {%- if "/no_think" in system_message -%} + {%- set reasoning_mode = "/no_think" -%} + {%- elif "/think" in system_message -%} + {%- set reasoning_mode = "/think" -%} + {%- endif -%} + {%- set custom_instructions = system_message.replace("/no_think", "") -%} + {%- set custom_instructions = custom_instructions.replace("/think", "") -%} + {%- set custom_instructions = custom_instructions.rstrip() -%} +{%- endif -%} +{{- "## Metadata\n\n" -}} +{{- "Knowledge Cutoff Date: June 2025\n" -}} +{{- "Reasoning Mode: " + reasoning_mode + "\n\n" -}} +{{- "## Custom Instructions\n\n" -}} +{%- if custom_instructions -%} + {{- custom_instructions + "\n\n" -}} +{%- elif reasoning_mode == "/think" -%} + {{- "You are a helpful AI assistant named SmolLM, trained by Hugging Face. Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracking, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution using the specified format: Thought section Solution section. In the Thought section, detail your reasoning process in steps. 
Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed to reach the conclusion.\n\n" -}} +{%- else -%} + {{- "You are a helpful AI assistant named SmolLM, trained by Hugging Face.\n\n" -}} +{%- endif -%} +{{- "<|im_end|>\n" -}} +{# ───── main loop ───── #} +{%- for message in messages -%} + {%- set content = message.content if message.content is string else "" -%} + {%- if message.role == "user" -%} + {{ "<|im_start|>" + message.role + "\n" + content + "<|im_end|>\n" }} + {%- elif message.role == "assistant" -%} + {%- if reasoning_mode == "/think" -%} + {{ "<|im_start|>assistant\n" + content.lstrip("\n") + "<|im_end|>\n" }} + {%- else -%} + {{ "<|im_start|>assistant\n" + "\n\n\n" + content.lstrip("\n") + "<|im_end|>\n" }} + {%- endif -%} + {%- elif message.role == "tool" -%} + {{ "<|im_start|>" + "user\n" + content + "<|im_end|>\n" }} + {%- endif -%} +{%- endfor -%} +{# ───── generation prompt ───── #} +{%- if add_generation_prompt -%} + {%- if reasoning_mode == "/think" -%} + {{ "<|im_start|>assistant\n" }} + {%- else -%} + {{ "<|im_start|>assistant\n" + "\n\n\n" }} + {%- endif -%} +{%- endif -%} diff --git a/pyrightconfig.json b/pyrightconfig.json index a7bc007bdc..14d84fdbe7 100644 --- a/pyrightconfig.json +++ b/pyrightconfig.json @@ -1,5 +1,5 @@ { - "extraPaths": ["gguf-py", "examples/model-conversion/scripts"], + "extraPaths": ["gguf-py", "examples/model-conversion/scripts", "examples/model-conversion/scripts/utils"], "pythonVersion": "3.9", "pythonPlatform": "All", "reportUnusedImport": "warning", diff --git 
a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py index 14e75117c4..f43d24ebf1 100755 --- a/scripts/compare-llama-bench.py +++ b/scripts/compare-llama-bench.py @@ -684,6 +684,7 @@ else: sys.exit(1) +assert isinstance(hexsha8_baseline, str) name_baseline = bench_data.get_commit_name(hexsha8_baseline) hexsha8_compare = name_compare = None @@ -717,6 +718,7 @@ else: parser.print_help() sys.exit(1) +assert isinstance(hexsha8_compare, str) name_compare = bench_data.get_commit_name(hexsha8_compare) # Get tool-specific configuration diff --git a/scripts/jinja/jinja-tester.py b/scripts/jinja/jinja-tester.py index a489305ee7..4f79b8da3d 100755 --- a/scripts/jinja/jinja-tester.py +++ b/scripts/jinja/jinja-tester.py @@ -241,10 +241,10 @@ class CodeEditor(QPlainTextEdit): if not self.isReadOnly(): selection = QTextEdit.ExtraSelection() line_color = QColorConstants.Yellow.lighter(160) - selection.format.setBackground(line_color) # pyright: ignore[reportAttributeAccessIssue] - selection.format.setProperty(QTextFormat.Property.FullWidthSelection, True) # pyright: ignore[reportAttributeAccessIssue] - selection.cursor = self.textCursor() # pyright: ignore[reportAttributeAccessIssue] - selection.cursor.clearSelection() # pyright: ignore[reportAttributeAccessIssue] + selection.format.setBackground(line_color) # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + selection.format.setProperty(QTextFormat.Property.FullWidthSelection, True) # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + selection.cursor = self.textCursor() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + selection.cursor.clearSelection() # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] extra_selections.append(selection) self.setExtraSelections(extra_selections) @@ -262,8 +262,8 @@ class CodeEditor(QPlainTextEdit): ) extra = QTextEdit.ExtraSelection() - 
extra.format.setBackground(color.lighter(160)) # pyright: ignore[reportAttributeAccessIssue] - extra.cursor = cursor # pyright: ignore[reportAttributeAccessIssue] + extra.format.setBackground(color.lighter(160)) # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + extra.cursor = cursor # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] self.setExtraSelections(self.extraSelections() + [extra]) @@ -274,8 +274,8 @@ class CodeEditor(QPlainTextEdit): cursor.select(QTextCursor.SelectionType.LineUnderCursor) extra = QTextEdit.ExtraSelection() - extra.format.setBackground(color.lighter(160)) # pyright: ignore[reportAttributeAccessIssue] - extra.cursor = cursor # pyright: ignore[reportAttributeAccessIssue] + extra.format.setBackground(color.lighter(160)) # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] + extra.cursor = cursor # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute] self.setExtraSelections(self.extraSelections() + [extra]) @@ -395,8 +395,8 @@ class JinjaTester(QMainWindow): ensure_ascii=ensure_ascii, ) ) - env.globals["strftime_now"] = lambda format: datetime.now().strftime(format) - env.globals["raise_exception"] = raise_exception + env.globals["strftime_now"] = lambda format: datetime.now().strftime(format) # ty: ignore[invalid-assignment] + env.globals["raise_exception"] = raise_exception # ty: ignore[invalid-assignment] try: template = env.from_string(template_str) output = template.render(context) diff --git a/scripts/server-bench.py b/scripts/server-bench.py index 202c35a486..1b557a495a 100755 --- a/scripts/server-bench.py +++ b/scripts/server-bench.py @@ -189,6 +189,7 @@ def benchmark( data: list[dict] = [] + assert isinstance(prompts, list) for i, p in enumerate(prompts): if seed_offset >= 0: random.seed(3 * (seed_offset + 1000 * i) + 1) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 8f25d47786..6aa73630c9 100644 --- 
a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1347,8 +1347,11 @@ int llama_context::encode(const llama_batch & batch_inp) { const llama_seq_id seq_id = ubatch.seq_id_unq[s]; const int32_t seq_idx = ubatch.seq_idx[seq_id]; - embd_seq_out[seq_id].resize(n_embd); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_idx)*sizeof(float), n_embd*sizeof(float)); + // use n_embd_out (not n_embd_inp) - the pooled embedding has the model's + // output dimension, which differs from input dimension for deepstack models (e.g. qwen3vl) + const uint32_t n_embd_out = hparams.n_embd_out(); + embd_seq_out[seq_id].resize(n_embd_out); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd_out*seq_idx)*sizeof(float), n_embd_out*sizeof(float)); } } break; case LLAMA_POOLING_TYPE_RANK: @@ -1769,12 +1772,16 @@ int llama_context::decode(const llama_batch & batch_inp) { // extract sequence embeddings (cleared before processing each batch) auto & embd_seq_out = embd_seq; + // use n_embd_out (not n_embd_inp) - the pooled embedding has the model's + // output dimension, which differs from input dimension for deepstack models (e.g. 
qwen3vl) + const uint32_t n_embd_out = hparams.n_embd_out(); + for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) { const llama_seq_id seq_id = ubatch.seq_id_unq[s]; const int32_t seq_idx = ubatch.seq_idx[seq_id]; - embd_seq_out[seq_id].resize(n_embd); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_idx)*sizeof(float), n_embd*sizeof(float)); + embd_seq_out[seq_id].resize(n_embd_out); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd_out*seq_idx)*sizeof(float), n_embd_out*sizeof(float)); } } break; case LLAMA_POOLING_TYPE_RANK: diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp index aac0d41f2b..badcbfd0fb 100644 --- a/src/llama-grammar.cpp +++ b/src/llama-grammar.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #define MAX_REPETITION_THRESHOLD 2000 @@ -454,6 +455,7 @@ const char * llama_grammar_parser::parse_sequence( bool is_nested) { size_t last_sym_start = rule.size(); const char * pos = src; + uint64_t n_prev_rules = 1; // use UINT64_MAX as the empty value because we aligned to the proper uint64_t type so -1 can't be used // (though it's technically the same as -1 now) @@ -481,6 +483,18 @@ const char * llama_grammar_parser::parse_sequence( // S' ::= S | llama_grammar_rule prev_rule(rule.begin() + last_sym_start, rule.end()); + // Calculate the total number of rules that will be generated by this repetition + uint64_t total_rules = 1; // Start with 1 for the original rule + if (!no_max && max_times > 0) { + total_rules = max_times; + } else if (min_times > 0) { + total_rules = min_times; + } + + if (n_prev_rules * total_rules >= MAX_REPETITION_THRESHOLD) { + throw std::runtime_error("number of rules that are going to be repeated multiplied by the new repetition exceeds sane defaults, please reduce the number of repetitions or rule complexity"); + } + if (min_times == 0) { rule.resize(last_sym_start); } else { @@ -508,12 +522,15 @@ const char * 
llama_grammar_parser::parse_sequence( if (n_opt > 0) { rule.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id}); } + n_prev_rules *= total_rules; + GGML_ASSERT(n_prev_rules >= 1); }; while (*pos) { if (*pos == '"') { // literal string pos++; last_sym_start = rule.size(); + n_prev_rules = 1; while (*pos != '"') { if (!*pos) { throw std::runtime_error("unexpected end of input"); @@ -531,6 +548,7 @@ const char * llama_grammar_parser::parse_sequence( start_type = LLAMA_GRETYPE_CHAR_NOT; } last_sym_start = rule.size(); + n_prev_rules = 1; while (*pos != ']') { if (!*pos) { throw std::runtime_error("unexpected end of input"); @@ -561,6 +579,7 @@ const char * llama_grammar_parser::parse_sequence( auto token_pair = parse_token(vocab, pos); const char * token_end = token_pair.second; last_sym_start = rule.size(); + n_prev_rules = 1; rule.push_back({type, token_pair.first}); pos = parse_space(token_end, is_nested); } else if (is_word_char(*pos)) { // rule reference @@ -568,12 +587,15 @@ const char * llama_grammar_parser::parse_sequence( uint32_t ref_rule_id = get_symbol_id(pos, name_end - pos); pos = parse_space(name_end, is_nested); last_sym_start = rule.size(); + n_prev_rules = 1; rule.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id}); } else if (*pos == '(') { // grouping // parse nested alternates into synthesized rule pos = parse_space(pos + 1, true); + uint32_t n_rules_before = symbol_ids.size(); uint32_t sub_rule_id = generate_symbol_id(rule_name); pos = parse_alternates(pos, rule_name, sub_rule_id, true); + n_prev_rules = std::max(1u, (uint32_t)symbol_ids.size() - n_rules_before); last_sym_start = rule.size(); // output reference to synthesized rule rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id}); @@ -583,6 +605,7 @@ const char * llama_grammar_parser::parse_sequence( pos = parse_space(pos + 1, is_nested); } else if (*pos == '.') { // any char last_sym_start = rule.size(); + n_prev_rules = 1; rule.push_back({LLAMA_GRETYPE_CHAR_ANY, 0}); pos = parse_space(pos + 
1, is_nested); } else if (*pos == '*') { @@ -830,32 +853,54 @@ static bool llama_grammar_match_token( static void llama_grammar_advance_stack( const llama_grammar_rules & rules, const llama_grammar_stack & stack, - llama_grammar_stacks & new_stacks) { - if (stack.empty()) { - if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) { - new_stacks.emplace_back(stack); + llama_grammar_stacks & new_stacks) { + std::vector todo; + todo.push_back(stack); + + auto stack_cmp = [](const llama_grammar_stack & a, const llama_grammar_stack & b) { + return std::lexicographical_compare(a.begin(), a.end(), b.begin(), b.end(), + [](const llama_grammar_element * pa, const llama_grammar_element * pb) { + return pa < pb; // Compare pointer addresses + } + ); + }; + + std::set seen(stack_cmp); + + while (!todo.empty()) { + llama_grammar_stack curr_stack = std::move(todo.back()); + todo.pop_back(); + + if (seen.find( curr_stack) != seen.end()) { + continue; } - return; - } + seen.insert(curr_stack); - const llama_grammar_element * pos = stack.back(); + if (curr_stack.empty()) { + if (std::find(new_stacks.begin(), new_stacks.end(), curr_stack) == new_stacks.end()) { + new_stacks.emplace_back(std::move(curr_stack)); + } + continue; + } - switch (pos->type) { + const llama_grammar_element * pos = curr_stack.back(); + + switch (pos->type) { case LLAMA_GRETYPE_RULE_REF: { const size_t rule_id = static_cast(pos->value); const llama_grammar_element * subpos = rules[rule_id].data(); do { // init new stack without the top (pos) - llama_grammar_stack new_stack(stack.begin(), stack.end() - 1); + llama_grammar_stack next_stack(curr_stack.begin(), curr_stack.end() - 1); if (!llama_grammar_is_end_of_sequence(pos + 1)) { // if this rule ref is followed by another element, add that to stack - new_stack.push_back(pos + 1); + next_stack.push_back(pos + 1); } if (!llama_grammar_is_end_of_sequence(subpos)) { // if alternate is nonempty, add to stack - new_stack.push_back(subpos); + 
next_stack.push_back(subpos); } - llama_grammar_advance_stack(rules, new_stack, new_stacks); + todo.push_back(std::move(next_stack)); while (!llama_grammar_is_end_of_sequence(subpos)) { // scan to end of alternate def subpos++; @@ -874,9 +919,9 @@ static void llama_grammar_advance_stack( case LLAMA_GRETYPE_CHAR_ANY: case LLAMA_GRETYPE_TOKEN: case LLAMA_GRETYPE_TOKEN_NOT: - if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) { + if (std::find(new_stacks.begin(), new_stacks.end(), curr_stack) == new_stacks.end()) { // only add the stack if it's not a duplicate of one we already have - new_stacks.emplace_back(stack); + new_stacks.emplace_back(std::move(curr_stack)); } break; default: @@ -884,6 +929,7 @@ static void llama_grammar_advance_stack( // (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on // those GGML_ABORT("fatal error"); + } } } diff --git a/tests/test-chat-auto-parser.cpp b/tests/test-chat-auto-parser.cpp index 0ba51ba235..347ad94bd8 100644 --- a/tests/test-chat-auto-parser.cpp +++ b/tests/test-chat-auto-parser.cpp @@ -62,6 +62,9 @@ static void test_nemotron_tool_format(testing & t); static void test_cohere_reasoning_detection(testing & t); static void test_cohere_analysis(testing & t); +// SmolLM3 template analysis tests +static void test_smollm3_analysis(testing & t); + // Marker separation static void test_marker_separation(testing & t); @@ -96,6 +99,7 @@ int main(int argc, char * argv[]) { t.test("seed_oss_diffs", test_seed_oss_tool_analysis); t.test("cohere", test_cohere_analysis); t.test("nemotron", test_nemotron_analysis); + t.test("smollm3", test_smollm3_analysis); t.test("standard_json_tools", test_standard_json_tools_formats); t.test("normalize_quotes_to_json", test_normalize_quotes_to_json); t.test("tagged_args_embedded_quotes", test_tagged_args_with_embedded_quotes); @@ -1448,6 +1452,47 @@ static void test_tool_format_cohere(testing & t) { t.assert_true("tools_array_wrapped should 
be true", analysis.tools.format.tools_array_wrapped); } +// ============================================================================ +// SmolLM3 Template Analysis Tests +// Tests for templates that change system message when enable_thinking flips +// and prefill an empty block in no-think mode. +// ============================================================================ +static common_chat_template load_smollm3_template(testing & t) { + return load_template(t, "models/templates/HuggingFaceTB-SmolLM3-3B.jinja"); +} + +static void test_smollm3_reasoning_detection(testing & t); + +static void test_smollm3_analysis(testing & t) { + t.test("SmolLM3 reasoning detection", test_smollm3_reasoning_detection); +} + +static void test_smollm3_reasoning_detection(testing & t) { + common_chat_template tmpl = load_smollm3_template(t); + + // Run differential analysis + struct autoparser analysis; + analysis.analyze_template(tmpl); + + // SmolLM3 uses / reasoning tags. + // The template changes the entire system message when enable_thinking flips, + // so the analyzer must compare isolated generation prompts (not full outputs). 
+ t.assert_equal("reasoning_start should be ''", "", analysis.reasoning.start); + t.assert_equal("reasoning_end should be ''", "", analysis.reasoning.end); + t.assert_equal("reasoning should be TAG_BASED", reasoning_mode::TAG_BASED, analysis.reasoning.mode); + + // Content should remain plain (no wrappers) + t.assert_equal("content start should be empty", "", analysis.content.start); + t.assert_equal("content end should be empty", "", analysis.content.end); + t.assert_equal("content should be PLAIN", content_mode::PLAIN, analysis.content.mode); + + // Preserved tokens should include the reasoning markers + bool has_think_start = std::find(analysis.preserved_tokens.begin(), analysis.preserved_tokens.end(), "") != analysis.preserved_tokens.end(); + bool has_think_end = std::find(analysis.preserved_tokens.begin(), analysis.preserved_tokens.end(), "") != analysis.preserved_tokens.end(); + t.assert_true("preserved_tokens should contain ''", has_think_start); + t.assert_true("preserved_tokens should contain ''", has_think_end); +} + // ============================================================================ // standard_json_tools Format Tests // ============================================================================ diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp index 526470a224..4d5d13dd0d 100644 --- a/tests/test-grammar-integration.cpp +++ b/tests/test-grammar-integration.cpp @@ -788,6 +788,24 @@ static void test_quantifiers() { "0xFF 0x12 0xAB 0x00 0x00 0x00", } ); + test_grammar( + "segfault", + // Grammar + R"""( + root ::= ( [x]* )* + )""", + // Passing strings + { + "", + "x", + "xx" + }, + // Failing strings + { + "y", + "yy" + } + ); } static void test_failure_missing_root() { diff --git a/tests/test-grammar-parser.cpp b/tests/test-grammar-parser.cpp index 03ae78ff73..6abc43461b 100644 --- a/tests/test-grammar-parser.cpp +++ b/tests/test-grammar-parser.cpp @@ -145,6 +145,10 @@ int main() root ::= "a"{,}" )"""); + 
verify_failure(R"""( + root ::= (((((([^x]*){0,99}){0,99}){0,99}){0,99}){0,99}){0,99} + )"""); + verify_failure(R"""( root ::= "a"{,10}" )"""); diff --git a/tests/test-jinja.cpp b/tests/test-jinja.cpp index ef9c8f73c8..1550627bf0 100644 --- a/tests/test-jinja.cpp +++ b/tests/test-jinja.cpp @@ -2264,6 +2264,7 @@ static void test_fuzzing(testing & t) { t.test("malformed templates (should error, not crash)", [&](testing & t) { const std::vector malformed = { + "", "{{ x", "{% if %}", "{% for %}", @@ -2284,6 +2285,11 @@ static void test_fuzzing(testing & t) { for (const auto & tmpl : malformed) { t.assert_true("malformed: " + tmpl, fuzz_test_template(tmpl, json::object())); } + std::string tmpl = "{% for message in messages %}{{ message.role | string }} : {{ message.content if ('content' in message and message.content is not none) }}{% endfor %"; + while (tmpl.length() > 0) { + t.assert_true("malformed: " + tmpl, fuzz_test_template(tmpl, json::object())); + tmpl.pop_back(); + } }); t.test("type coercion edge cases", [&](testing & t) { diff --git a/tests/test-llama-grammar.cpp b/tests/test-llama-grammar.cpp index fd45d5ada8..25f432a2f5 100644 --- a/tests/test-llama-grammar.cpp +++ b/tests/test-llama-grammar.cpp @@ -123,25 +123,27 @@ int main() std::vector> expected_stacks = { { - {LLAMA_GRETYPE_RULE_REF, 5}, + {LLAMA_GRETYPE_CHAR, 61}, + {LLAMA_GRETYPE_RULE_REF, 7}, + {LLAMA_GRETYPE_CHAR, 40}, + }, + { + {LLAMA_GRETYPE_CHAR, 61}, + {LLAMA_GRETYPE_RULE_REF, 7}, + {LLAMA_GRETYPE_RULE_REF, 3}, + {LLAMA_GRETYPE_CHAR, 48}, + }, + { + {LLAMA_GRETYPE_CHAR, 61}, + {LLAMA_GRETYPE_RULE_REF, 7}, + {LLAMA_GRETYPE_RULE_REF, 3}, + {LLAMA_GRETYPE_CHAR, 48}, + }, + { {LLAMA_GRETYPE_CHAR, 61}, {LLAMA_GRETYPE_RULE_REF, 7}, {LLAMA_GRETYPE_CHAR, 97}, }, - { - {LLAMA_GRETYPE_RULE_REF, 5}, - {LLAMA_GRETYPE_CHAR, 61}, - {LLAMA_GRETYPE_RULE_REF, 7}, - {LLAMA_GRETYPE_RULE_REF, 3}, - {LLAMA_GRETYPE_CHAR, 48}, - }, - { - {LLAMA_GRETYPE_RULE_REF, 5}, - {LLAMA_GRETYPE_CHAR, 61}, - 
{LLAMA_GRETYPE_RULE_REF, 7}, - {LLAMA_GRETYPE_RULE_REF, 3}, - {LLAMA_GRETYPE_CHAR, 48}, - }, { {LLAMA_GRETYPE_RULE_REF, 5}, {LLAMA_GRETYPE_CHAR, 61}, @@ -149,26 +151,24 @@ int main() {LLAMA_GRETYPE_CHAR, 40}, }, { + {LLAMA_GRETYPE_RULE_REF, 5}, + {LLAMA_GRETYPE_CHAR, 61}, + {LLAMA_GRETYPE_RULE_REF, 7}, + {LLAMA_GRETYPE_RULE_REF, 3}, + {LLAMA_GRETYPE_CHAR, 48}, + }, + { + {LLAMA_GRETYPE_RULE_REF, 5}, + {LLAMA_GRETYPE_CHAR, 61}, + {LLAMA_GRETYPE_RULE_REF, 7}, + {LLAMA_GRETYPE_RULE_REF, 3}, + {LLAMA_GRETYPE_CHAR, 48}, + }, + { + {LLAMA_GRETYPE_RULE_REF, 5}, {LLAMA_GRETYPE_CHAR, 61}, {LLAMA_GRETYPE_RULE_REF, 7}, {LLAMA_GRETYPE_CHAR, 97}, - }, - { - {LLAMA_GRETYPE_CHAR, 61}, - {LLAMA_GRETYPE_RULE_REF, 7}, - {LLAMA_GRETYPE_RULE_REF, 3}, - {LLAMA_GRETYPE_CHAR, 48}, - }, - { - {LLAMA_GRETYPE_CHAR, 61}, - {LLAMA_GRETYPE_RULE_REF, 7}, - {LLAMA_GRETYPE_RULE_REF, 3}, - {LLAMA_GRETYPE_CHAR, 48}, - }, - { - {LLAMA_GRETYPE_CHAR, 61}, - {LLAMA_GRETYPE_RULE_REF, 7}, - {LLAMA_GRETYPE_CHAR, 40}, }}; auto index = 0; @@ -195,9 +195,9 @@ int main() } std::vector next_candidates; - next_candidates.resize(24); + next_candidates.resize(23); - for (size_t i = 0; i < 24; ++i) + for (size_t i = 0; i < 23; ++i) { uint32_t *cp = new uint32_t[2]; // dynamically allocate memory for code_point cp[0] = 37 + i; @@ -210,7 +210,6 @@ int main() {0, 37}, {1, 38}, {2, 39}, - {3, 40}, {4, 41}, {5, 42}, {6, 43}, @@ -268,6 +267,7 @@ int main() {0, 37}, {1, 38}, {2, 39}, + {3, 40}, {4, 41}, {5, 42}, {6, 43}, @@ -287,13 +287,11 @@ int main() {20, 57}, {21, 58}, {22, 59}, - {23, 60}, }, { {0, 37}, {1, 38}, {2, 39}, - {3, 40}, {4, 41}, {5, 42}, {6, 43}, @@ -351,6 +349,7 @@ int main() {0, 37}, {1, 38}, {2, 39}, + {3, 40}, {4, 41}, {5, 42}, {6, 43}, @@ -370,7 +369,6 @@ int main() {20, 57}, {21, 58}, {22, 59}, - {23, 60}, }, }; diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py index 93e697607e..25af4ee63b 100644 --- a/tests/test-tokenizer-random.py +++ b/tests/test-tokenizer-random.py @@ 
-16,8 +16,7 @@ import random import unicodedata from pathlib import Path -from typing import Any, Iterator, cast -from typing_extensions import Buffer +from typing import Any, Iterator import cffi from transformers import AutoTokenizer, PreTrainedTokenizer @@ -114,7 +113,7 @@ class LibLlamaModel: while num < 0 and len(self.text_buff) < (16 << 20): self.text_buff = self.ffi.new("uint8_t[]", -2 * num) num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special) - return str(cast(Buffer, self.ffi.buffer(self.text_buff, num)), encoding="utf-8", errors="replace") # replace errors with '\uFFFD' + return str(self.ffi.buffer(self.text_buff, num), encoding="utf-8", errors="replace") # replace errors with '\uFFFD' # pyright: ignore[reportArgumentType] class Tokenizer: @@ -438,7 +437,7 @@ def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLl decode_errors = 0 MAX_ERRORS = 10 - logger.info("%s: %s" % (generator.__qualname__, "ini")) + logger.info("%s: %s" % (getattr(generator, "__qualname__", ""), "ini")) for text in generator: # print(repr(text), text.encode()) # print(repr(text), hex(ord(text[0])), text.encode()) @@ -477,7 +476,7 @@ def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLl break t_total = time.perf_counter() - t_start - logger.info(f"{generator.__qualname__}: end, {t_encode1=:.3f} {t_encode2=:.3f} {t_decode1=:.3f} {t_decode2=:.3f} {t_total=:.3f}") + logger.info(f"{getattr(generator, '__qualname__', '')}: end, {t_encode1=:.3f} {t_encode2=:.3f} {t_decode1=:.3f} {t_decode2=:.3f} {t_total=:.3f}") def main(argv: list[str] | None = None): diff --git a/tools/cli/README.md b/tools/cli/README.md index 22d3fc87e9..840976a884 100644 --- a/tools/cli/README.md +++ b/tools/cli/README.md @@ -83,7 +83,7 @@ | `-m, --model FNAME` | model path to load
(env: LLAMA_ARG_MODEL) | | `-mu, --model-url MODEL_URL` | model download url (default: unused)
(env: LLAMA_ARG_MODEL_URL) | | `-dr, --docker-repo [/][:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.
example: gemma3
(default: unused)
(env: LLAMA_ARG_DOCKER_REPO) | -| `-hf, -hfr, --hf-repo /[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.
mmproj is also downloaded automatically if available. to disable, add --no-mmproj
example: unsloth/phi-4-GGUF:q4_k_m
(default: unused)
(env: LLAMA_ARG_HF_REPO) | +| `-hf, -hfr, --hf-repo /[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.
mmproj is also downloaded automatically if available. to disable, add --no-mmproj
example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M
(default: unused)
(env: LLAMA_ARG_HF_REPO) | | `-hfd, -hfrd, --hf-repo-draft /[:quant]` | Same as --hf-repo, but for the draft model (default: unused)
(env: LLAMA_ARG_HFD_REPO) | | `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)
(env: LLAMA_ARG_HF_FILE) | | `-hfv, -hfrv, --hf-repo-v /[:quant]` | Hugging Face model repository for the vocoder model (default: unused)
(env: LLAMA_ARG_HF_REPO_V) | @@ -134,7 +134,7 @@ | `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.10) | | `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.00) | | `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' | -| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') | +| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) | | `--grammar-file FNAME` | file to read grammar from | | `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | | `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | @@ -147,7 +147,8 @@ | -------- | ----------- | | `--display-prompt, --no-display-prompt` | whether to print prompt at generation (default: true) | | `-co, --color [on\|off\|auto]` | Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')
'auto' enables colors when output is to a terminal | -| `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | +| `-ctxcp, --ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 32)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | +| `-cpent, --checkpoint-every-n-tokens N` | create a checkpoint every n tokens during prefill (processing), -1 to disable (default: 8192)
(env: LLAMA_ARG_CHECKPOINT_EVERY_NT) | | `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)
(env: LLAMA_ARG_CACHE_RAM) | | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_CONTEXT_SHIFT) | | `-sys, --system-prompt PROMPT` | system prompt to use with model (if applicable, depending on chat template) | @@ -172,9 +173,12 @@ | `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'
(env: LLAMA_CHAT_TEMPLATE_KWARGS) | | `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)
(env: LLAMA_ARG_JINJA) | | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `` tags in `message.content` while also populating `message.reasoning_content`
(default: auto)
(env: LLAMA_ARG_THINK) | -| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | +| `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))
(env: LLAMA_ARG_REASONING) | +| `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | +| `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)
(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) | | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)
(env: LLAMA_ARG_SKIP_CHAT_PARSING) | | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles | | `--draft, --draft-n, --draft-max N` | number of tokens to draft for speculative decoding (default: 16)
(env: LLAMA_ARG_DRAFT_MAX) | | `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)
(env: LLAMA_ARG_DRAFT_MIN) | diff --git a/tools/completion/README.md b/tools/completion/README.md index f868c2c7d7..25884ed92d 100644 --- a/tools/completion/README.md +++ b/tools/completion/README.md @@ -166,7 +166,7 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `-m, --model FNAME` | model path to load
(env: LLAMA_ARG_MODEL) | | `-mu, --model-url MODEL_URL` | model download url (default: unused)
(env: LLAMA_ARG_MODEL_URL) | | `-dr, --docker-repo [/][:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.
example: gemma3
(default: unused)
(env: LLAMA_ARG_DOCKER_REPO) | -| `-hf, -hfr, --hf-repo /[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.
mmproj is also downloaded automatically if available. to disable, add --no-mmproj
example: unsloth/phi-4-GGUF:q4_k_m
(default: unused)
(env: LLAMA_ARG_HF_REPO) | +| `-hf, -hfr, --hf-repo /[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.
mmproj is also downloaded automatically if available. to disable, add --no-mmproj
example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M
(default: unused)
(env: LLAMA_ARG_HF_REPO) | | `-hfd, -hfrd, --hf-repo-draft /[:quant]` | Same as --hf-repo, but for the draft model (default: unused)
(env: LLAMA_ARG_HFD_REPO) | | `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)
(env: LLAMA_ARG_HF_FILE) | | `-hfv, -hfrv, --hf-repo-v /[:quant]` | Hugging Face model repository for the vocoder model (default: unused)
(env: LLAMA_ARG_HF_REPO_V) | @@ -217,7 +217,7 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.10) | | `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.00) | | `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' | -| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') | +| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) | | `--grammar-file FNAME` | file to read grammar from | | `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | | `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | @@ -252,9 +252,12 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `-gaw, --grp-attn-w N` | group-attention width (default: 512)
(env: LLAMA_ARG_GRP_ATTN_W) | | `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: disabled)
(env: LLAMA_ARG_JINJA) | | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `` tags in `message.content` while also populating `message.reasoning_content`
(default: auto)
(env: LLAMA_ARG_THINK) | -| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | +| `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))
(env: LLAMA_ARG_REASONING) | +| `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | +| `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)
(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) | | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)
(env: LLAMA_ARG_SKIP_CHAT_PARSING) | | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles | diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index b0f1d6b936..21173576cc 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -418,7 +418,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -m, --model (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); printf(" -hf, -hfr, --hf-repo /[:quant] Hugging Face model repository; quant is optional, case-insensitive\n"); printf(" default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"); - printf(" example: unsloth/phi-4-GGUF:Q4_K_M\n"); + printf(" example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M\n"); printf(" (default: unused)\n"); printf(" -hff, --hf-file Hugging Face model file. If specified, it will override the quant in --hf-repo\n"); printf(" (default: unused)\n"); diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 3eb66f9145..bf55cec7ef 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -38,6 +38,8 @@ #define KEY_IMAGE_SIZE "clip.vision.image_size" #define KEY_IMAGE_MIN_PIXELS "clip.vision.image_min_pixels" #define KEY_IMAGE_MAX_PIXELS "clip.vision.image_max_pixels" +#define KEY_PREPROC_MIN_TILES "clip.vision.preproc_min_tiles" +#define KEY_PREPROC_MAX_TILES "clip.vision.preproc_max_tiles" #define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size" #define KEY_PATCH_SIZE "clip.vision.patch_size" #define KEY_IMAGE_MEAN "clip.vision.image_mean" diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index eeb8da58e0..265a17130f 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -42,6 +42,9 @@ struct clip_hparams { int32_t image_max_pixels = -1; int32_t n_merge = 0; // number of patch merges **per-side** + int32_t preproc_min_tiles = 0; + int32_t preproc_max_tiles = 0; + float image_mean[3]; 
float image_std[3]; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 44a19189ea..a47f1f495d 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1138,6 +1138,16 @@ struct clip_model_loader { } } break; case PROJECTOR_TYPE_INTERNVL: + { + // older version of internvl doesn't have min/max tiles, we need to provide default values for them to avoid issues + hparams.preproc_min_tiles = 1; + hparams.preproc_max_tiles = 12; + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); + get_u32(KEY_PREPROC_MIN_TILES, hparams.preproc_min_tiles, false); + get_u32(KEY_PREPROC_MAX_TILES, hparams.preproc_max_tiles, false); + GGML_ASSERT(hparams.preproc_min_tiles <= hparams.preproc_max_tiles && hparams.preproc_max_tiles < INT32_MAX); + set_internvl_dhr_res_candidates(model); + } break; case PROJECTOR_TYPE_NEMOTRON_V2_VL: { get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); @@ -1161,7 +1171,6 @@ struct clip_model_loader { hparams.set_warmup_n_tokens(16*16); } break; case PROJECTOR_TYPE_PIXTRAL: - case PROJECTOR_TYPE_LIGHTONOCR: { // ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json // TODO: verify the image_min_tokens @@ -1171,6 +1180,15 @@ struct clip_model_loader { hparams.set_limit_image_tokens(8, 1024); hparams.set_warmup_n_tokens(256); // avoid OOM on warmup } break; + case PROJECTOR_TYPE_LIGHTONOCR: + { + hparams.n_merge = 1; + hparams.rope_theta = 10000.0f; + get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); + hparams.image_longest_edge = hparams.image_size; + get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false); + hparams.set_warmup_n_tokens(256); // avoid OOM on warmup + } break; case PROJECTOR_TYPE_KIMIVL: { hparams.rope_theta = 10000.0f; @@ -2180,6 +2198,27 @@ struct clip_model_loader { } } } + + static void set_internvl_dhr_res_candidates(clip_model & model) { + auto & hparams = model.hparams; + int min_num = hparams.preproc_min_tiles; + int max_num = hparams.preproc_max_tiles; + 
if (min_num < 1) { + return; // avoid divide by 0 + } + for (int a = min_num; a <= max_num; ++a) { + int b_lo = (min_num + a - 1) / a; + int b_hi = max_num / a; + b_lo = std::max(b_lo, min_num); + b_hi = std::min(b_hi, max_num); + for (int b = b_lo; b <= b_hi; ++b) { + hparams.image_res_candidates.push_back(clip_image_size { + a*hparams.image_size, + b*hparams.image_size, + }); + } + } + } }; struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params) { @@ -2726,17 +2765,22 @@ struct llava_uhd { return res; } - static std::vector slice_image(const clip_image_u8 * img, const slice_instructions & inst) { + static std::vector slice_image(const clip_image_u8 * img, const slice_instructions & inst, bool overview_first = true) { std::vector output; // resize to overview size clip_image_u8_ptr resized_img(clip_image_u8_init()); img_tool::resize(*img, *resized_img, inst.overview_size, inst.interpolation_overview, inst.padding_overview, inst.pad_color_overview); - output.push_back(std::move(resized_img)); + if (overview_first) { + output.push_back(std::move(resized_img)); + } if (inst.slices.empty()) { // no slices, just return the resized image + if (!overview_first) { + output.push_back(std::move(resized_img)); + } return output; } @@ -2757,6 +2801,10 @@ struct llava_uhd { output.push_back(std::move(img_slice)); } + if (!overview_first) { + output.push_back(std::move(resized_img)); + } + return output; } @@ -3141,10 +3189,20 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str res_imgs->grid_x = instructions.grid_size.width; res_imgs->grid_y = instructions.grid_size.height; } break; + case PROJECTOR_TYPE_INTERNVL: // support dynamic high-resolution + { + GGML_ASSERT(!params.image_res_candidates.empty()); + auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); + std::vector imgs = llava_uhd::slice_image(img, inst, false); + for (size_t i = 0; i < imgs.size(); ++i) { + clip_image_f32_ptr 
res(clip_image_f32_init()); + normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); + } + } break; case PROJECTOR_TYPE_GLM_EDGE: case PROJECTOR_TYPE_GEMMA3: - case PROJECTOR_TYPE_INTERNVL: // TODO @ngxson : support dynamic resolution case PROJECTOR_TYPE_NEMOTRON_V2_VL: { clip_image_u8 resized_image; @@ -3180,7 +3238,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str case PROJECTOR_TYPE_PHI4: case PROJECTOR_TYPE_PIXTRAL: - case PROJECTOR_TYPE_LIGHTONOCR: { GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0); clip_image_u8 resized_image; @@ -3196,6 +3253,19 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std); res_imgs->entries.push_back(std::move(img_f32)); } break; + case PROJECTOR_TYPE_LIGHTONOCR: + { + GGML_ASSERT(params.image_longest_edge > 0); + clip_image_u8 resized_image; + const clip_image_size target_size = img_tool::calc_size_preserved_ratio( + original_size, + params.patch_size * params.n_merge, + params.image_longest_edge); + img_tool::resize(*img, resized_image, target_size, img_tool::RESIZE_ALGO_BICUBIC); + clip_image_f32_ptr img_f32(clip_image_f32_init()); + normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(img_f32)); + } break; case PROJECTOR_TYPE_LLAMA4: { diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index f66c07345e..456ce7b73c 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -851,13 +851,15 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__); return 1; } + auto proj_type = clip_get_projector_type(ctx_clip); int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip); 
ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd); bool ok = false; if (clip_is_llava(ctx_clip) || clip_is_minicpmv(ctx_clip) - || clip_is_glm(ctx_clip)) { + || clip_is_glm(ctx_clip) + || proj_type == PROJECTOR_TYPE_INTERNVL) { // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode() const auto & entries = image_tokens->batch_f32.entries; for (size_t i = 0; i < entries.size(); i++) { diff --git a/tools/server/README.md b/tools/server/README.md index fbf6ec67a9..cb53678416 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -100,7 +100,7 @@ For the full list of features, please refer to [server's changelog](https://gith | `-m, --model FNAME` | model path to load
(env: LLAMA_ARG_MODEL) | | `-mu, --model-url MODEL_URL` | model download url (default: unused)
(env: LLAMA_ARG_MODEL_URL) | | `-dr, --docker-repo [/][:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.
example: gemma3
(default: unused)
(env: LLAMA_ARG_DOCKER_REPO) | -| `-hf, -hfr, --hf-repo /[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.
mmproj is also downloaded automatically if available. to disable, add --no-mmproj
example: unsloth/phi-4-GGUF:q4_k_m
(default: unused)
(env: LLAMA_ARG_HF_REPO) | +| `-hf, -hfr, --hf-repo /[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.
mmproj is also downloaded automatically if available. to disable, add --no-mmproj
example: ggml-org/GLM-4.7-Flash-GGUF:Q4_K_M
(default: unused)
(env: LLAMA_ARG_HF_REPO) | | `-hfd, -hfrd, --hf-repo-draft /[:quant]` | Same as --hf-repo, but for the draft model (default: unused)
(env: LLAMA_ARG_HFD_REPO) | | `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)
(env: LLAMA_ARG_HF_FILE) | | `-hfv, -hfrv, --hf-repo-v /[:quant]` | Hugging Face model repository for the vocoder model (default: unused)
(env: LLAMA_ARG_HF_REPO_V) | @@ -151,7 +151,7 @@ For the full list of features, please refer to [server's changelog](https://gith | `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.10) | | `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.00) | | `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' | -| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') | +| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) | | `--grammar-file FNAME` | file to read grammar from | | `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | | `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | @@ -164,7 +164,8 @@ For the full list of features, please refer to [server's changelog](https://gith | -------- | ----------- | | `-lcs, --lookup-cache-static FNAME` | path to static lookup cache to use for lookup decoding (not updated by generation) | | `-lcd, --lookup-cache-dynamic FNAME` | path to dynamic lookup cache to use for lookup decoding (updated by generation) | -| `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | +| `-ctxcp, --ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 32)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | +| `-cpent, --checkpoint-every-n-tokens N` | create a checkpoint every n tokens during prefill (processing), -1 to disable (default: 8192)
(env: LLAMA_ARG_CHECKPOINT_EVERY_NT) | | `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)
(env: LLAMA_ARG_CACHE_RAM) | | `-kvu, --kv-unified, -no-kvu, --no-kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)
(env: LLAMA_ARG_KV_UNIFIED) | | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_CONTEXT_SHIFT) | @@ -192,6 +193,8 @@ For the full list of features, please refer to [server's changelog](https://gith | `--api-prefix PREFIX` | prefix path the server serves from, without the trailing slash (default: )
(env: LLAMA_ARG_API_PREFIX) | | `--webui-config JSON` | JSON that provides default WebUI settings (overrides WebUI defaults)
(env: LLAMA_ARG_WEBUI_CONFIG) | | `--webui-config-file PATH` | JSON file that provides default WebUI settings (overrides WebUI defaults)
(env: LLAMA_ARG_WEBUI_CONFIG_FILE) | +| `--webui-mcp-proxy, --no-webui-mcp-proxy` | experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)
(env: LLAMA_ARG_WEBUI_MCP_PROXY) | +| `--tools TOOL1,TOOL2,...` | experimental: whether to enable built-in tools for AI agents - do not enable in untrusted environments (default: no tools)
specify "all" to enable all tools
available tools: read_file, file_glob_search, grep_search, exec_shell_command, write_file, edit_file, apply_diff
(env: LLAMA_ARG_TOOLS) | | `--webui, --no-webui` | whether to enable the Web UI (default: enabled)
(env: LLAMA_ARG_WEBUI) | | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
(env: LLAMA_ARG_EMBEDDINGS) | | `--rerank, --reranking` | enable reranking endpoint on server (default: disabled)
(env: LLAMA_ARG_RERANKING) | @@ -215,11 +218,12 @@ For the full list of features, please refer to [server's changelog](https://gith | `--models-autoload, --no-models-autoload` | for router server, whether to automatically load models (default: enabled)
(env: LLAMA_ARG_MODELS_AUTOLOAD) | | `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)
(env: LLAMA_ARG_JINJA) | | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `` tags in `message.content` while also populating `message.reasoning_content`
(default: auto)
(env: LLAMA_ARG_THINK) | -| `-rea, --resoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))
(env: LLAMA_ARG_REASONING) | +| `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))
(env: LLAMA_ARG_REASONING) | | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)
(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) | | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)
(env: LLAMA_ARG_SKIP_CHAT_PARSING) | | `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)
when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled

(env: LLAMA_ARG_PREFILL_ASSISTANT) | | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled) | | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) | @@ -234,7 +238,7 @@ For the full list of features, please refer to [server's changelog](https://gith | `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) | | `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)
(env: LLAMA_ARG_MODEL_DRAFT) | | `--spec-replace TARGET DRAFT` | translate the string in TARGET into DRAFT if the draft model and main model are not compatible | -| `--spec-type [none\|ngram-cache\|ngram-simple\|ngram-map-k\|ngram-map-k4v\|ngram-mod]` | type of speculative decoding to use when no draft model is provided (default: none) | +| `--spec-type [none\|ngram-cache\|ngram-simple\|ngram-map-k\|ngram-map-k4v\|ngram-mod]` | type of speculative decoding to use when no draft model is provided (default: none)

(env: LLAMA_ARG_SPEC_TYPE) | | `--spec-ngram-size-n N` | ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram (default: 12) | | `--spec-ngram-size-m N` | ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: 48) | | `--spec-ngram-min-hits N` | minimum hits for ngram-map speculative decoding (default: 1) | @@ -290,6 +294,12 @@ It is currently available in the following endpoints: For more details, please refer to [multimodal documentation](../../docs/multimodal.md) +### Built-in tools support + +The server includes a set of built-in tools that enable the LLM to access the local file system directly from the Web UI. + +To use this feature, start the server with `--tools all`. You can also enable only specific tools by passing a comma-separated list: `--tools name1,name2,...`. Run `--help` for the full list of available tool names. + ## Build `llama-server` is built alongside everything else from the root of the project @@ -1642,6 +1652,13 @@ The `status` object can be: } ``` +```json +"status": { + "value": "sleeping", + "args": ["llama-server", "-ctx", "4096"] +} +``` + ### POST `/models/load`: Load a model Load a model diff --git a/tools/server/bench/bench.py b/tools/server/bench/bench.py index 0c57a2df04..c816816eaf 100644 --- a/tools/server/bench/bench.py +++ b/tools/server/bench/bench.py @@ -285,7 +285,7 @@ def start_server_background(args): } server_process = subprocess.Popen( args, - **pkwargs) # pyright: ignore[reportArgumentType, reportCallIssue] + **pkwargs) # pyright: ignore[reportArgumentType, reportCallIssue] # ty: ignore[no-matching-overload] def server_log(in_stream, out_stream): for line in iter(in_stream.readline, b''): diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 9de554e900..b79a5270b5 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -3033,6 +3033,9 @@ struct server_res_generator : server_http_res { } 
}; +void server_context::on_sleeping_changed(std::function callback) { + impl->queue_tasks.on_sleeping_state(std::move(callback)); +} // diff --git a/tools/server/server-context.h b/tools/server/server-context.h index 75f3d2de56..a4d2201cbe 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -74,6 +74,10 @@ struct server_context { // get server metadata (read-only), can only be called after load_model() // not thread-safe, should only be used from the main thread server_context_meta get_meta() const; + + // register a callback to be called when sleeping state changes + // must be set before load_model() is called + void on_sleeping_changed(std::function callback); }; diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp index 129022a711..3466512d0c 100644 --- a/tools/server/server-http.cpp +++ b/tools/server/server-http.cpp @@ -227,11 +227,17 @@ bool server_http_context::init(const common_params & params) { int n_threads_http = params.n_threads_http; if (n_threads_http < 1) { - // +2 threads for monitoring endpoints - n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1); + // +4 threads for monitoring, health and some threads reserved for MCP and other tasks in the future + n_threads_http = std::max(params.n_parallel + 4, (int32_t) std::thread::hardware_concurrency() - 1); } LOG_INF("%s: using %d threads for HTTP server\n", __func__, n_threads_http); - srv->new_task_queue = [n_threads_http] { return new httplib::ThreadPool(n_threads_http); }; + srv->new_task_queue = [n_threads_http] { + // spawn n_threads_http fixed thread (always alive), while allow up to 1024 max possible additional threads + // when n_threads_http is used, server will create new "dynamic" threads that will be destroyed after processing each request + // ref: https://github.com/yhirose/cpp-httplib/pull/2368 + size_t max_threads = (size_t)n_threads_http + 1024; + return new 
httplib::ThreadPool(n_threads_http, max_threads); + }; // // Web UI setup diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 4ac55cd158..7e61844f08 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -39,7 +39,8 @@ extern char **environ; #define DEFAULT_STOP_TIMEOUT 10 // seconds #define CMD_ROUTER_TO_CHILD_EXIT "cmd_router_to_child:exit" -#define CMD_CHILD_TO_ROUTER_READY "cmd_child_to_router:ready" +#define CMD_CHILD_TO_ROUTER_READY "cmd_child_to_router:ready" // also sent when waking up from sleep +#define CMD_CHILD_TO_ROUTER_SLEEP "cmd_child_to_router:sleep" // address for child process, this is needed because router may run on 0.0.0.0 // ref: https://github.com/ggml-org/llama.cpp/issues/17862 @@ -380,7 +381,7 @@ void server_models::update_meta(const std::string & name, const server_model_met if (it != mapping.end()) { it->second.meta = meta; } - cv.notify_all(); // notify wait_until_loaded + cv.notify_all(); // notify wait_until_loading_finished } bool server_models::has_model(const std::string & name) { @@ -503,7 +504,7 @@ void server_models::unload_lru() { { std::unique_lock lk(mutex); for (const auto & m : mapping) { - if (m.second.meta.is_active()) { + if (m.second.meta.is_running()) { count_active++; if (m.second.meta.last_used < lru_last_used) { lru_model_name = m.first; @@ -546,7 +547,7 @@ void server_models::load(const std::string & name) { if (base_params.models_max > 0) { size_t count_active = 0; for (const auto & m : mapping) { - if (m.second.meta.is_active()) { + if (m.second.meta.is_running()) { count_active++; } } @@ -605,15 +606,15 @@ void server_models::load(const std::string & name) { std::thread log_thread([&]() { // read stdout/stderr and forward to main server log // also handle status report from child process - bool state_received = false; // true if child state received if (stdout_file) { char buffer[4096]; while (fgets(buffer, sizeof(buffer), stdout_file) != nullptr) { 
LOG("[%5d] %s", port, buffer); - if (!state_received && std::strstr(buffer, CMD_CHILD_TO_ROUTER_READY) != nullptr) { - // child process is ready + std::string str(buffer); + if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) { this->update_status(name, SERVER_MODEL_STATUS_LOADED, 0); - state_received = true; + } else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_SLEEP)) { + this->update_status(name, SERVER_MODEL_STATUS_SLEEPING, 0); } } } else { @@ -706,13 +707,13 @@ void server_models::unload(const std::string & name) { std::lock_guard lk(mutex); auto it = mapping.find(name); if (it != mapping.end()) { - if (it->second.meta.is_active()) { - SRV_INF("unloading model instance name=%s\n", name.c_str()); + if (it->second.meta.is_running()) { + SRV_INF("stopping model instance name=%s\n", name.c_str()); stopping_models.insert(name); cv_stop.notify_all(); // status change will be handled by the managing thread } else { - SRV_WRN("model instance name=%s is not loaded\n", name.c_str()); + SRV_WRN("model instance name=%s is not running\n", name.c_str()); } } } @@ -722,8 +723,8 @@ void server_models::unload_all() { { std::lock_guard lk(mutex); for (auto & [name, inst] : mapping) { - if (inst.meta.is_active()) { - SRV_INF("unloading model instance name=%s\n", name.c_str()); + if (inst.meta.is_running()) { + SRV_INF("stopping model instance name=%s\n", name.c_str()); stopping_models.insert(name); cv_stop.notify_all(); // status change will be handled by the managing thread @@ -750,7 +751,7 @@ void server_models::update_status(const std::string & name, server_model_status cv.notify_all(); } -void server_models::wait_until_loaded(const std::string & name) { +void server_models::wait_until_loading_finished(const std::string & name) { std::unique_lock lk(mutex); cv.wait(lk, [this, &name]() { auto it = mapping.find(name); @@ -761,22 +762,25 @@ void server_models::wait_until_loaded(const std::string & name) { }); } -bool server_models::ensure_model_loaded(const 
std::string & name) { +bool server_models::ensure_model_ready(const std::string & name) { auto meta = get_meta(name); if (!meta.has_value()) { throw std::runtime_error("model name=" + name + " is not found"); } - if (meta->status == SERVER_MODEL_STATUS_LOADED) { - return false; // already loaded + if (meta->is_ready()) { + return false; // ready for taking requests + } + if (meta->status == SERVER_MODEL_STATUS_SLEEPING) { + return false; // child is sleeping but still running; new request will wake it up } if (meta->status == SERVER_MODEL_STATUS_UNLOADED) { SRV_INF("model name=%s is not loaded, loading...\n", name.c_str()); load(name); } - // for loading state + // wait for loading to complete SRV_INF("waiting until model name=%s is fully loaded...\n", name.c_str()); - wait_until_loaded(name); + wait_until_loading_finished(name); // check final status meta = get_meta(name); @@ -792,8 +796,8 @@ server_http_res_ptr server_models::proxy_request(const server_http_req & req, co if (!meta.has_value()) { throw std::runtime_error("model name=" + name + " is not found"); } - if (meta->status != SERVER_MODEL_STATUS_LOADED) { - throw std::invalid_argument("model name=" + name + " is not loaded"); + if (!meta->is_running()) { + throw std::invalid_argument("model name=" + name + " is not running"); } if (update_last_used) { std::unique_lock lk(mutex); @@ -819,6 +823,11 @@ server_http_res_ptr server_models::proxy_request(const server_http_req & req, co return proxy; } +bool server_models::is_child_server() { + const char * router_port = std::getenv("LLAMA_SERVER_ROUTER_PORT"); + return router_port != nullptr; +} + std::thread server_models::setup_child_server(const std::function & shutdown_handler) { // send a notification to the router server that a model instance is ready common_log_pause(common_log_main()); @@ -852,6 +861,13 @@ std::thread server_models::setup_child_server(const std::function & s }); } +void server_models::notify_router_sleeping_state(bool is_sleeping) { + 
common_log_pause(common_log_main()); + fflush(stdout); + fprintf(stdout, "%s\n", is_sleeping ? CMD_CHILD_TO_ROUTER_SLEEP : CMD_CHILD_TO_ROUTER_READY); + fflush(stdout); + common_log_resume(common_log_main()); +} // @@ -881,9 +897,9 @@ static bool router_validate_model(std::string & name, server_models & models, bo // resolve alias to canonical model name name = meta->name; if (models_autoload) { - models.ensure_model_loaded(name); + models.ensure_model_ready(name); } else { - if (meta->status != SERVER_MODEL_STATUS_LOADED) { + if (!meta->is_running()) { res_err(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST)); return false; } @@ -956,8 +972,8 @@ void server_models_routes::init_routes() { res_err(res, format_error_response("model is not found", ERROR_TYPE_NOT_FOUND)); return res; } - if (meta->status == SERVER_MODEL_STATUS_LOADED) { - res_err(res, format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST)); + if (meta->is_running()) { + res_err(res, format_error_response("model is already running", ERROR_TYPE_INVALID_REQUEST)); return res; } models.load(meta->name); @@ -1015,8 +1031,8 @@ void server_models_routes::init_routes() { res_err(res, format_error_response("model is not found", ERROR_TYPE_INVALID_REQUEST)); return res; } - if (!model->is_active()) { - res_err(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST)); + if (!model->is_running()) { + res_err(res, format_error_response("model is not running", ERROR_TYPE_INVALID_REQUEST)); return res; } models.unload(model->name); @@ -1181,7 +1197,8 @@ server_http_proxy::server_http_proxy( continue; } if (key == "Host" || key == "host") { - req.set_header(key, host); + bool is_default_port = (scheme == "https" && port == 443) || (scheme == "http" && port == 80); + req.set_header(key, is_default_port ? 
host : host + ":" + std::to_string(port)); } else { req.set_header(key, value); } diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 2b392f299a..1db34b6c4d 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -14,17 +14,18 @@ /** * state diagram: * - * UNLOADED ──► LOADING ──► LOADED - * ▲ │ │ - * └───failed───┘ │ - * ▲ │ + * UNLOADED ──► LOADING ──► LOADED ◄──── SLEEPING + * ▲ │ │ ▲ + * └───failed───┘ │ │ + * ▲ └──sleeping─────┘ * └────────unloaded─────────┘ */ enum server_model_status { // TODO: also add downloading state when the logic is added SERVER_MODEL_STATUS_UNLOADED, SERVER_MODEL_STATUS_LOADING, - SERVER_MODEL_STATUS_LOADED + SERVER_MODEL_STATUS_LOADED, + SERVER_MODEL_STATUS_SLEEPING }; static server_model_status server_model_status_from_string(const std::string & status_str) { @@ -37,6 +38,9 @@ static server_model_status server_model_status_from_string(const std::string & s if (status_str == "loaded") { return SERVER_MODEL_STATUS_LOADED; } + if (status_str == "sleeping") { + return SERVER_MODEL_STATUS_SLEEPING; + } throw std::runtime_error("invalid server model status"); } @@ -45,6 +49,7 @@ static std::string server_model_status_to_string(server_model_status status) { case SERVER_MODEL_STATUS_UNLOADED: return "unloaded"; case SERVER_MODEL_STATUS_LOADING: return "loading"; case SERVER_MODEL_STATUS_LOADED: return "loaded"; + case SERVER_MODEL_STATUS_SLEEPING: return "sleeping"; default: return "unknown"; } } @@ -61,8 +66,12 @@ struct server_model_meta { int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED) int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown - bool is_active() const { - return status == SERVER_MODEL_STATUS_LOADED || status == SERVER_MODEL_STATUS_LOADING; + bool is_ready() const { + return status == SERVER_MODEL_STATUS_LOADED; + } + + bool is_running() const { + return status == 
SERVER_MODEL_STATUS_LOADED || status == SERVER_MODEL_STATUS_LOADING || status == SERVER_MODEL_STATUS_SLEEPING; } bool is_failed() const { @@ -130,19 +139,26 @@ public: void update_status(const std::string & name, server_model_status status, int exit_code); // wait until the model instance is fully loaded (thread-safe) - // return when the model is loaded or failed to load - void wait_until_loaded(const std::string & name); + // return when the model no longer in "loading" state + void wait_until_loading_finished(const std::string & name); - // load the model if not loaded, otherwise do nothing (thread-safe) - // return false if model is already loaded; return true otherwise (meta may need to be refreshed) - bool ensure_model_loaded(const std::string & name); + // ensure the model is in ready state (thread-safe) + // return false if model is ready + // otherwise, load the model and blocking wait until it's ready, then return true (meta may need to be refreshed) + bool ensure_model_ready(const std::string & name); // proxy an HTTP request to the model instance server_http_res_ptr proxy_request(const server_http_req & req, const std::string & method, const std::string & name, bool update_last_used); + // return true if the current process is a child server instance + static bool is_child_server(); + // notify the router server that a model instance is ready // return the monitoring thread (to be joined by the caller) static std::thread setup_child_server(const std::function & shutdown_handler); + + // notify the router server that the sleeping state has changed + static void notify_router_sleeping_state(bool sleeping); }; struct server_models_routes { diff --git a/tools/server/server-queue.h b/tools/server/server-queue.h index 164f09b195..35f010401f 100644 --- a/tools/server/server-queue.h +++ b/tools/server/server-queue.h @@ -95,11 +95,19 @@ public: callback_update_slots = std::move(callback); } - // Register callback for sleeping state change + // Register callback 
for sleeping state change; multiple callbacks are allowed // note: when entering sleeping state, the callback is called AFTER sleeping is set to true // when leaving sleeping state, the callback is called BEFORE sleeping is set to false void on_sleeping_state(std::function callback) { - callback_sleeping_state = std::move(callback); + if (callback_sleeping_state) { + auto prev_callback = std::move(callback_sleeping_state); + callback_sleeping_state = [prev_callback, callback](bool sleeping) { + prev_callback(sleeping); + callback(sleeping); + }; + } else { + callback_sleeping_state = std::move(callback); + } } private: diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 2a0cf1bcf9..ef54a46b19 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -271,6 +271,12 @@ int main(int argc, char ** argv) { // load the model LOG_INF("%s: loading model\n", __func__); + if (server_models::is_child_server()) { + ctx_server.on_sleeping_changed([&](bool sleeping) { + server_models::notify_router_sleeping_state(sleeping); + }); + } + if (!ctx_server.load_model(params)) { clean_up(); if (ctx_http.thread.joinable()) { @@ -321,9 +327,8 @@ int main(int argc, char ** argv) { LOG_INF("%s: starting the main loop...\n", __func__); // optionally, notify router server that this instance is ready - const char * router_port = std::getenv("LLAMA_SERVER_ROUTER_PORT"); std::thread monitor_thread; - if (router_port != nullptr) { + if (server_models::is_child_server()) { monitor_thread = server_models::setup_child_server(shutdown_handler); } diff --git a/tools/server/tests/unit/test_tool_call.py b/tools/server/tests/unit/test_tool_call.py index ba41cd44ea..b1a5ab9da4 100755 --- a/tools/server/tests/unit/test_tool_call.py +++ b/tools/server/tests/unit/test_tool_call.py @@ -9,6 +9,7 @@ sys.path.insert(0, str(path)) from utils import * from enum import Enum +from typing import TypedDict server: ServerProcess @@ -29,56 +30,73 @@ class CompletionMode(Enum): NORMAL = 
"normal" STREAMED = "streamed" -TEST_TOOL = { - "type":"function", - "function": { - "name": "test", - "description": "", - "parameters": { - "type": "object", - "properties": { - "success": {"type": "boolean", "const": True}, - }, - "required": ["success"] - } - } -} +class ToolParameters(TypedDict): + type: str + properties: dict[str, dict] + required: list[str] -PYTHON_TOOL = { - "type": "function", - "function": { - "name": "python", - "description": "Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.", - "parameters": { - "type": "object", - "properties": { +class ToolFunction(TypedDict): + name: str + description: str + parameters: ToolParameters + +class ToolDefinition(TypedDict): + type: str + function: ToolFunction + +TEST_TOOL = ToolDefinition( + type = "function", + function = ToolFunction( + name = "test", + description = "", + parameters = ToolParameters( + type = "object", + properties = { + "success": { + "type": "boolean", + "const": True, + }, + }, + required = ["success"], + ), + ), +) + +PYTHON_TOOL = ToolDefinition( + type = "function", + function = ToolFunction( + name = "python", + description = "Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.", + parameters = ToolParameters( + type = "object", + properties = { "code": { "type": "string", - "description": "The code to run in the ipython interpreter." - } + "description": "The code to run in the ipython interpreter.", + }, }, - "required": ["code"] - } - } -} + required = ["code"], + ), + ), +) -WEATHER_TOOL = { - "type":"function", - "function":{ - "name":"get_current_weather", - "description":"Get the current weather in a given location", - "parameters":{ - "type":"object", - "properties":{ - "location":{ - "type":"string", - "description":"The city and country/state, e.g. 
'San Francisco, CA', or 'Paris, France'" - } - }, - "required":["location"] - } - } -} +WEATHER_TOOL = ToolDefinition( + type = "function", + function = ToolFunction( + name = "get_current_weather", + description = "Get the current weather in a given location", + parameters = ToolParameters( + type = "object", + properties = { + "location": { + "type": "string", + "description": "The city and country/state, e.g. 'San Francisco, CA', or 'Paris, France'", + }, + }, + required = ["location"], + ), + ), +) def do_test_completion_with_required_tool_tiny(server: ServerProcess, tool: dict, argument_key: str | None, n_predict, **kwargs): body = server.make_any_request("POST", "/v1/chat/completions", data={ diff --git a/tools/server/webui/src/lib/constants/settings-config.ts b/tools/server/webui/src/lib/constants/settings-config.ts index 39aaf561bb..ae9dd3ce8f 100644 --- a/tools/server/webui/src/lib/constants/settings-config.ts +++ b/tools/server/webui/src/lib/constants/settings-config.ts @@ -127,7 +127,7 @@ export const SETTING_CONFIG_INFO: Record = { fullHeightCodeBlocks: 'Always display code blocks at their full natural height, overriding any height limits.', showRawModelNames: - 'Display full raw model identifiers (e.g. "unsloth/Qwen3.5-27B-GGUF:BF16") instead of parsed names with badges.', + 'Display full raw model identifiers (e.g. "ggml-org/GLM-4.7-Flash-GGUF:Q8_0") instead of parsed names with badges.', mcpServers: 'Configure MCP servers as a JSON list. 
Use the form in the MCP Client settings section to edit.', mcpServerUsageStats: diff --git a/tools/server/webui/src/lib/services/parameter-sync.service.ts b/tools/server/webui/src/lib/services/parameter-sync.service.ts index 1acb5ce453..9a290129eb 100644 --- a/tools/server/webui/src/lib/services/parameter-sync.service.ts +++ b/tools/server/webui/src/lib/services/parameter-sync.service.ts @@ -159,6 +159,74 @@ export const SYNCABLE_PARAMETERS: SyncableParameter[] = [ serverKey: 'fullHeightCodeBlocks', type: SyncableParameterType.BOOLEAN, canSync: true + }, + { + key: 'systemMessage', + serverKey: 'systemMessage', + type: SyncableParameterType.STRING, + canSync: true + }, + { + key: 'showSystemMessage', + serverKey: 'showSystemMessage', + type: SyncableParameterType.BOOLEAN, + canSync: true + }, + { key: 'theme', serverKey: 'theme', type: SyncableParameterType.STRING, canSync: true }, + { + key: 'copyTextAttachmentsAsPlainText', + serverKey: 'copyTextAttachmentsAsPlainText', + type: SyncableParameterType.BOOLEAN, + canSync: true + }, + { + key: 'showRawOutputSwitch', + serverKey: 'showRawOutputSwitch', + type: SyncableParameterType.BOOLEAN, + canSync: true + }, + { + key: 'alwaysShowSidebarOnDesktop', + serverKey: 'alwaysShowSidebarOnDesktop', + type: SyncableParameterType.BOOLEAN, + canSync: true + }, + { + key: 'autoShowSidebarOnNewChat', + serverKey: 'autoShowSidebarOnNewChat', + type: SyncableParameterType.BOOLEAN, + canSync: true + }, + { + key: 'showRawModelNames', + serverKey: 'showRawModelNames', + type: SyncableParameterType.BOOLEAN, + canSync: true + }, + { key: 'mcpServers', serverKey: 'mcpServers', type: SyncableParameterType.STRING, canSync: true }, + { + key: 'agenticMaxTurns', + serverKey: 'agenticMaxTurns', + type: SyncableParameterType.NUMBER, + canSync: true + }, + { + key: 'agenticMaxToolPreviewLines', + serverKey: 'agenticMaxToolPreviewLines', + type: SyncableParameterType.NUMBER, + canSync: true + }, + { + key: 'showToolCallInProgress', + 
serverKey: 'showToolCallInProgress', + type: SyncableParameterType.BOOLEAN, + canSync: true + }, + { + key: 'alwaysShowAgenticTurns', + serverKey: 'alwaysShowAgenticTurns', + type: SyncableParameterType.BOOLEAN, + canSync: true } ]; diff --git a/tools/server/webui/src/lib/stores/models.svelte.ts b/tools/server/webui/src/lib/stores/models.svelte.ts index a6d7d6572f..50c32034a6 100644 --- a/tools/server/webui/src/lib/stores/models.svelte.ts +++ b/tools/server/webui/src/lib/stores/models.svelte.ts @@ -457,7 +457,7 @@ class ModelsStore { /** * Select a model by its model name (used for syncing with conversation model) - * @param modelName - Model name to select (e.g., "unsloth/gemma-3-12b-it-GGUF:latest") + * @param modelName - Model name to select (e.g., "ggml-org/GLM-4.7-Flash-GGUF") */ selectModelByName(modelName: string): void { const option = this.models.find((model) => model.model === modelName); diff --git a/tools/server/webui/src/lib/stores/settings.svelte.ts b/tools/server/webui/src/lib/stores/settings.svelte.ts index 2fbff8312f..9d5e77adf2 100644 --- a/tools/server/webui/src/lib/stores/settings.svelte.ts +++ b/tools/server/webui/src/lib/stores/settings.svelte.ts @@ -287,8 +287,12 @@ class SettingsStore { */ resetParameterToServerDefault(key: string): void { const serverDefaults = this.getServerDefaults(); + const webuiSettings = serverStore.webuiSettings; - if (serverDefaults[key] !== undefined) { + if (webuiSettings && key in webuiSettings) { + // UI setting from admin config: write actual value + setConfigValue(this.config, key, webuiSettings[key]); + } else if (serverDefaults[key] !== undefined) { // sampling param known by server: clear it, let server decide setConfigValue(this.config, key, ''); } else if (key in SETTING_CONFIG_DEFAULT) { @@ -327,6 +331,17 @@ class SettingsStore { } } + // webui settings need actual values in config (no placeholder mechanism), + // so write them for non-overridden keys + const webuiSettings = serverStore.webuiSettings; + 
if (webuiSettings) { + for (const [key, value] of Object.entries(webuiSettings)) { + if (!this.userOverrides.has(key) && value !== undefined) { + setConfigValue(this.config, key, value); + } + } + } + this.saveConfig(); console.log('User overrides after sync:', Array.from(this.userOverrides)); } @@ -338,8 +353,14 @@ class SettingsStore { */ forceSyncWithServerDefaults(): void { const propsDefaults = this.getServerDefaults(); + const webuiSettings = serverStore.webuiSettings; + for (const key of ParameterSyncService.getSyncableParameterKeys()) { - if (propsDefaults[key] !== undefined) { + if (webuiSettings && key in webuiSettings) { + // UI setting from admin config: write actual value + setConfigValue(this.config, key, webuiSettings[key]); + } else if (propsDefaults[key] !== undefined) { + // sampling param: clear it, let server decide setConfigValue(this.config, key, ''); } else if (key in SETTING_CONFIG_DEFAULT) { setConfigValue(this.config, key, getConfigValue(SETTING_CONFIG_DEFAULT, key)); diff --git a/ty.toml b/ty.toml new file mode 100644 index 0000000000..bcd23db9b8 --- /dev/null +++ b/ty.toml @@ -0,0 +1,30 @@ +[environment] +extra-paths = ["./gguf-py", "./examples/model-conversion/scripts", "./tools/server/tests"] +python-version = "3.10" + +[rules] +deprecated = "warn" + +[src] +exclude = [ + "./tools/mtmd/legacy-models/**", +] + +[[overrides]] +include = [ + "./tools/server/tests/**", +] + +[overrides.rules] +unresolved-reference = "ignore" +unresolved-import = "ignore" +unresolved-attribute = "ignore" + +[[overrides]] +include = [ + "./examples/pydantic_models_to_grammar.py", +] + +[overrides.rules] +unsupported-operator = "ignore" +not-subscriptable = "ignore"