diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1e24293645..7db8552865 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -676,6 +676,35 @@ jobs: -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO + macOS-latest-cmake-visionos: + runs-on: macos-latest + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + + - name: Dependencies + id: depends + continue-on-error: true + run: | + brew update + + - name: Build + id: cmake_build + run: | + sysctl -a + cmake -B build -G Xcode \ + -DGGML_METAL_USE_BF16=ON \ + -DGGML_METAL_EMBED_LIBRARY=ON \ + -DLLAMA_BUILD_EXAMPLES=OFF \ + -DLLAMA_BUILD_TESTS=OFF \ + -DLLAMA_BUILD_SERVER=OFF \ + -DCMAKE_SYSTEM_NAME=visionOS \ + -DCMAKE_OSX_DEPLOYMENT_TARGET=1.0 \ + -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml + cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO + macOS-latest-swift: runs-on: macos-latest @@ -1379,7 +1408,7 @@ jobs: id: pack_artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} run: | - zip -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework + zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework - name: Upload artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} diff --git a/CMakeLists.txt b/CMakeLists.txt index e239291792..a5eb992db6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,6 +29,8 @@ else() set(LLAMA_STANDALONE OFF) endif() +option(LLAMA_USE_SYSTEM_GGML "Use system libggml" OFF) + if (EMSCRIPTEN) set(BUILD_SHARED_LIBS_DEFAULT OFF) @@ -146,7 +148,13 @@ endif() # 3rd-party # -if (NOT TARGET ggml) +if (LLAMA_USE_SYSTEM_GGML) + message(STATUS "Using system-provided libggml, skipping ggml build") + find_package(ggml REQUIRED) + add_library(ggml ALIAS ggml::ggml) +endif() + +if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML) add_subdirectory(ggml) # ... otherwise assume ggml is added by a parent CMakeLists.txt endif() diff --git a/build-xcframework.sh b/build-xcframework.sh index 37833dc4ea..2ce3939c43 100755 --- a/build-xcframework.sh +++ b/build-xcframework.sh @@ -432,8 +432,8 @@ cmake -B build-visionos -G Xcode \ -DCMAKE_SYSTEM_NAME=visionOS \ -DCMAKE_OSX_SYSROOT=xros \ -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \ - -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_CXX_FLAGS}" \ + -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \ + -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \ -S . 
cmake --build build-visionos --config Release -- -quiet @@ -445,8 +445,8 @@ cmake -B build-visionos-sim -G Xcode \ -DCMAKE_SYSTEM_NAME=visionOS \ -DCMAKE_OSX_SYSROOT=xrsimulator \ -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \ -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_C_FLAGS}" \ -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_CXX_FLAGS}" \ + -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \ + -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \ -S . cmake --build build-visionos-sim --config Release -- -quiet diff --git a/cmake/common.cmake b/cmake/common.cmake index 0f54871e41..a5bb787f15 100644 --- a/cmake/common.cmake +++ b/cmake/common.cmake @@ -1,3 +1,5 @@ +include("ggml/cmake/common.cmake") + function(llama_add_compile_flags) if (LLAMA_FATAL_WARNINGS) if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") diff --git a/common/arg.cpp b/common/arg.cpp index fe6a1eece7..b6bfe6f89b 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -853,6 +853,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } } ).set_excludes({LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"-sysf", "--system-prompt-file"}, "FNAME", + "a file containing the system prompt (default: none)", + [](common_params & params, const std::string & value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); + } + std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.system_prompt)); + if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') { + params.system_prompt.pop_back(); + } + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"--in-file"}, "FNAME", "an input file (repeat to specify multiple files)", @@ -1875,7 +1889,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.out_file = value; } - ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA})); + ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS})); add_opt(common_arg( {"-ofreq", "--output-frequency"}, "N", string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq), diff --git a/common/common.cpp b/common/common.cpp index 8487e3834b..18ffb4e738 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1033,6 +1033,8 @@ struct common_init_result common_init_from_params(common_params & params) { if (params.warmup) { LOG_WRN("%s: warming up the model with an empty run - please wait ...
(--no-warmup to disable)\n", __func__); + llama_set_warmup(lctx, true); + std::vector<llama_token> tmp; llama_token bos = llama_vocab_bos(vocab); llama_token eos = llama_vocab_eos(vocab); @@ -1063,6 +1065,7 @@ struct common_init_result common_init_from_params(common_params & params) { llama_kv_self_clear(lctx); llama_synchronize(lctx); llama_perf_context_reset(lctx); + llama_set_warmup(lctx, false); } iparams.model.reset(model); diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index b5d95bd563..d21edce16b 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -180,7 +180,8 @@ class Model: extra = sorted(tensor_names_from_parts.difference(self.tensor_names)) missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map)) if len(extra) == 0 and len(missing_files) > 0: - raise ValueError(f"Missing or incomplete model files: {missing_files}") + raise ValueError(f"Missing or incomplete model files: {missing_files}\n" + f"Missing tensors: {missing}") else: raise ValueError("Mismatch between weight map and model parts for tensor names:\n" f"Missing tensors: {missing}\n" @@ -528,6 +529,8 @@ class Model: reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} added_vocab = tokenizer.get_added_vocab() + added_tokens_decoder = tokenizer.added_tokens_decoder + for i in range(vocab_size): if i not in reverse_vocab: tokens.append(f"[PAD{i}]") @@ -537,13 +540,13 @@ class Model: if token in added_vocab: # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized. # To avoid unexpected issues - we make sure to normalize non-normalized tokens - if not tokenizer.added_tokens_decoder[i].normalized: + if not added_tokens_decoder[i].normalized: previous_token = token token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) if previous_token != token: logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer") - if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token): + if added_tokens_decoder[i].special or self.does_token_look_special(token): toktypes.append(gguf.TokenType.CONTROL) else: # NOTE: this was added for Gemma.
@@ -908,6 +911,40 @@ class Model: special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) + def _set_vocab_rwkv_world(self): + assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file() + vocab_size = self.hparams.get("vocab_size", 65536) + + tokens: list[bytes] = ['<s>'.encode("utf-8")] + toktypes: list[int] = [gguf.TokenType.CONTROL] + + with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f: + lines = f.readlines() + for line in lines: + parts = line.split(' ') + assert len(parts) >= 3 + token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1]) + token = token.encode("utf-8") if isinstance(token, str) else token + assert isinstance(token, bytes) + assert len(token) == token_len + token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff" + tokens.append(token_text.encode("utf-8")) + toktypes.append(gguf.TokenType.NORMAL) + remainder = vocab_size - len(tokens) + assert remainder >= 0 + for i in range(len(tokens), vocab_size): + tokens.append(f"[PAD{i}]".encode("utf-8")) + toktypes.append(gguf.TokenType.UNUSED) + + self.gguf_writer.add_tokenizer_model("rwkv") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + special_vocab.chat_template = "rwkv-world" + # hack: Add '\n\n' as the EOT token to make it chat normally + special_vocab._set_special_token("eot", 261) + special_vocab.add_to_gguf(self.gguf_writer) + def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int): tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf" logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") @@ -1065,13 +1102,6 @@ class BloomModel(Model): tensors.append((self.map_tensor_name(name), data_torch)) - if name == "word_embeddings.weight": - assert self.tensor_names is not None - - # TODO: tie them at runtime, don't duplicate in the model file - if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")): - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) - return tensors @@ -1713,6 +1743,25 @@ class LlamaModel(Model): raise ValueError(f"Unprocessed experts: {experts}") +@Model.register("Mistral3ForConditionalGeneration") +class Mistral3Model(LlamaModel): + model_arch = gguf.MODEL_ARCH.LLAMA + + # we need to merge the text_config into the root level of hparams + def __init__(self, *args, **kwargs): + hparams = Model.load_hparams(kwargs["dir_model"]) + if "text_config" in hparams: + hparams = {**hparams, **hparams["text_config"]} + kwargs["hparams"] = hparams + super().__init__(*args, **kwargs) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): + name = name.replace("language_model.", "") + if "multi_modal_projector" in name or "vision_tower" in name: + return [] + return super().modify_tensors(data_torch, name, bid) + + @Model.register("DeciLMForCausalLM") class DeciModel(Model): model_arch = gguf.MODEL_ARCH.DECI @@ -2370,10 +2419,6 @@ class GPT2Model(Model): tensors.append((new_name, data_torch)) - # note: GPT2 output is tied to (same as) wte in original model - if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) - return tensors @@ -2703,21 +2748,26 @@ class CodeShellModel(Model):
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) self.gguf_writer.add_rope_scaling_factor(1.0) + _has_tok_embd = False + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused + output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) + tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD) + new_name = self.map_tensor_name(name) - tensors: list[tuple[str, Tensor]] = [(new_name, data_torch)] + # assuming token_embd.weight is seen before output.weight + if not self._has_tok_embd and new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT): + # even though the tensor file(s) does not contain the word embeddings they are still in the weight map + if self.tensor_names and "transformer.wte.weight" in self.tensor_names: + logger.debug(f"{tok_embd_name} not found before {output_name}, assuming they are tied") + self.tensor_names.remove("transformer.wte.weight") + elif new_name == tok_embd_name: + self._has_tok_embd = True - if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): - assert self.tensor_names is not None - - if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")): - # copy tok_embd.weight to output.weight - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) - - return tensors + return [(new_name, data_torch)] @Model.register("InternLM2ForCausalLM") @@ -3412,38 +3462,7 @@ class Rwkv6Model(Model): model_arch = gguf.MODEL_ARCH.RWKV6 def set_vocab(self): - assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file() - vocab_size = self.hparams.get("vocab_size", 65536) - - tokens: list[bytes] = ['<s>'.encode("utf-8")] - toktypes: list[int] = [gguf.TokenType.CONTROL] - - with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f: - lines = f.readlines() - for line in lines: - parts = line.split(' ') - assert len(parts) >= 3 - token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1]) - token = token.encode("utf-8") if isinstance(token, str) else token - assert isinstance(token, bytes) - assert len(token) == token_len - token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff" - tokens.append(token_text.encode("utf-8")) - toktypes.append(gguf.TokenType.NORMAL) - remainder = vocab_size - len(tokens) - assert remainder >= 0 - for i in range(len(tokens), vocab_size): - tokens.append(f"[PAD{i}]".encode("utf-8")) - toktypes.append(gguf.TokenType.UNUSED) - - self.gguf_writer.add_tokenizer_model("rwkv") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) - special_vocab.chat_template = "rwkv-world" - # hack: Add '\n\n' as the EOT token to make it chat normally - special_vocab._set_special_token("eot", 261) - special_vocab.add_to_gguf(self.gguf_writer) + self._set_vocab_rwkv_world() def set_gguf_parameters(self): block_count = self.hparams["num_hidden_layers"] @@ -3565,6 +3584,168 @@ class RWKV6Qwen2Model(Rwkv6Model): yield (new_name, data) +@Model.register("Rwkv7ForCausalLM", "RWKV7ForCausalLM") +class Rwkv7Model(Model): + model_arch = gguf.MODEL_ARCH.RWKV7 + + def set_vocab(self): + self._set_vocab_rwkv_world() + + def calc_lora_rank(self, hidden_size, exponent, multiplier): + return max(1, round(hidden_size ** exponent * multiplier / 32)) * 32 + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + try: + head_size = self.hparams["head_size"] +
layer_norm_eps = self.hparams["layer_norm_epsilon"] + except KeyError: + head_size = self.hparams["head_dim"] + layer_norm_eps = self.hparams["norm_eps"] + hidden_size = self.hparams["hidden_size"] + intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else (hidden_size * 4) + + # ICLR: In-Context-Learning-Rate + try: + lora_rank_decay = self.hparams["lora_rank_decay"] if self.hparams["lora_rank_decay"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8) + lora_rank_iclr = self.hparams["lora_rank_iclr"] if self.hparams["lora_rank_iclr"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8) + lora_rank_value_residual_mix = self.hparams["lora_rank_value_residual_mix"] if self.hparams["lora_rank_value_residual_mix"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3) + lora_rank_gate = self.hparams["lora_rank_gate"] if self.hparams["lora_rank_gate"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6) + except KeyError: + lora_rank_decay = self.hparams["decay_low_rank_dim"] if self.hparams["decay_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8) + lora_rank_iclr = self.hparams["a_low_rank_dim"] if self.hparams["a_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8) + lora_rank_value_residual_mix = self.hparams["v_low_rank_dim"] if self.hparams["v_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3) + lora_rank_gate = self.hparams["gate_low_rank_dim"] if self.hparams["gate_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6) + + # RWKV isn't context limited + self.gguf_writer.add_context_length(1048576) + self.gguf_writer.add_embedding_length(hidden_size) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_layer_norm_eps(layer_norm_eps) + self.gguf_writer.add_wkv_head_size(head_size) + self.gguf_writer.add_decay_lora_rank(lora_rank_decay) + self.gguf_writer.add_iclr_lora_rank(lora_rank_iclr) + self.gguf_writer.add_value_residual_mix_lora_rank(lora_rank_value_residual_mix) + self.gguf_writer.add_gate_lora_rank(lora_rank_gate) + self.gguf_writer.add_feed_forward_length(intermediate_size) + self.gguf_writer.add_file_type(self.ftype) + + # required by llama.cpp, unused + self.gguf_writer.add_head_count(0) + + lerp_weights: dict[int, dict[str, Tensor]] = {} + lora_needs_transpose: bool = True + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # unify tensor names here to make life easier + name = name.replace("blocks", "layers").replace("ffn", "feed_forward") + name = name.replace("self_attn", "attention").replace("attn", "attention") + name = name.replace("time_mixer.", "") + # lora layer names in fla-hub's impl + if "_lora.lora" in name: + self.lora_needs_transpose = False + name = name.replace("_lora.lora.0.weight", "1.weight") + name = name.replace("_lora.lora.2.weight", "2.weight") + name = name.replace("_lora.lora.2.bias", "0.weight") + + name = name.replace("feed_forward_norm", "ln2") + name = name.replace("g_norm", "ln_x") + + if "attention.v" in name and "value" not in self.map_tensor_name(name) and bid == 0: + # some models have dummy v0/v1/v2 on first layer while others don't + # ignore them all since they are not used + return + + wkv_has_gate = self.hparams.get("wkv_has_gate", True) + lerp_list = ["r", "w", "k", "v", "a", "g"] if wkv_has_gate else ["r", "w", "k", "v", "a"] + + if bid is not None and "attention.x_" in name: + if 
"attention.x_x" in name: + # already concatenated + new_name = f"blk.{bid}.time_mix_lerp_fused.weight" + data = data_torch.reshape(len(lerp_list), 1, 1, -1) + yield (new_name, data) + else: + try: + self.lerp_weights[bid][name] = data_torch + except KeyError: + self.lerp_weights[bid] = {name: data_torch} + if all(f"model.layers.{bid}.attention.x_{i}" in self.lerp_weights[bid].keys() for i in lerp_list): + new_name = f"blk.{bid}.time_mix_lerp_fused.weight" + data = torch.stack([self.lerp_weights[bid][f"model.layers.{bid}.attention.x_{i}"] for i in lerp_list], dim=0) + yield (new_name, data) + return + else: + data_torch = data_torch.squeeze() + new_name = self.map_tensor_name(name) + + if not (new_name.endswith(".weight") or new_name.endswith(".bias")): + new_name += ".weight" + + if self.lora_needs_transpose and any( + new_name.endswith(t) for t in [ + "time_mix_w1.weight", "time_mix_w2.weight", + "time_mix_a1.weight", "time_mix_a2.weight", + "time_mix_v1.weight", "time_mix_v2.weight", + "time_mix_g1.weight", "time_mix_g2.weight", + ] + ): + data_torch = data_torch.transpose(0, 1) + + if 'r_k' in new_name: + data_torch = data_torch.flatten() + + if bid == 0 and "time_mix_a" in new_name: + # dummy v0/v1/v2 on first layer + # easist way to make llama happy + yield (new_name.replace("time_mix_a", "time_mix_v"), data_torch) + + yield (new_name, data_torch) + + +@Model.register("RwkvHybridForCausalLM") +class ARwkv7Model(Rwkv7Model): + model_arch = gguf.MODEL_ARCH.ARWKV7 + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + hidden_size = self.hparams["hidden_size"] + head_size = self.hparams["head_size"] + rms_norm_eps = self.hparams["rms_norm_eps"] + intermediate_size = self.hparams["intermediate_size"] + wkv_has_gate = self.hparams["wkv_has_gate"] + assert self.hparams["wkv_version"] == 7 + + # ICLR: In-Context-Learning-Rate + lora_rank_decay = 64 + lora_rank_iclr = 64 + lora_rank_value_residual_mix = 32 + lora_rank_gate = 128 if wkv_has_gate else 0 + + # RWKV isn't context limited + self.gguf_writer.add_context_length(1048576) + self.gguf_writer.add_embedding_length(hidden_size) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) + self.gguf_writer.add_wkv_head_size(head_size) + self.gguf_writer.add_decay_lora_rank(lora_rank_decay) + self.gguf_writer.add_iclr_lora_rank(lora_rank_iclr) + self.gguf_writer.add_value_residual_mix_lora_rank(lora_rank_value_residual_mix) + self.gguf_writer.add_gate_lora_rank(lora_rank_gate) + self.gguf_writer.add_feed_forward_length(intermediate_size) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_token_shift_count(1) + + # required by llama.cpp, unused + self.gguf_writer.add_head_count(0) + + @Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM") class MambaModel(Model): model_arch = gguf.MODEL_ARCH.MAMBA diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 5da439e94e..f97e696aaa 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -237,6 +237,15 @@ cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENAB cmake --build buildWithCublas --config Release ``` +**oneDNN**: The current oneDNN releases *(shipped with the oneAPI base-toolkit)* do not include the NVIDIA backend. 
Therefore, oneDNN must be compiled from source to enable the NVIDIA target: + +```sh +git clone https://github.com/oneapi-src/oneDNN.git +cd oneDNN +cmake -GNinja -Bbuild-nvidia -DDNNL_CPU_RUNTIME=DPCPP -DDNNL_GPU_RUNTIME=DPCPP -DDNNL_GPU_VENDOR=NVIDIA -DONEDNN_BUILD_GRAPH=OFF -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx +cmake --build build-nvidia --config Release +``` + - **Adding support to AMD GPUs** **oneAPI Plugin**: In order to enable SYCL support on AMD GPUs, please install the [Codeplay oneAPI Plugin for AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/download). As with Nvidia GPUs, the user should also make sure the plugin version matches the installed base toolkit. @@ -327,10 +336,10 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR GGML_SYCL_DEVICE_ARCH=sm_80 # Example architecture # Option 1: Use FP32 (recommended for better performance in most cases) -cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx +cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DDNNL_DIR=/path/to/oneDNN/build-nvidia/install/lib/cmake/dnnl # Option 2: Use FP16 -cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON +cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DDNNL_DIR=/path/to/oneDNN/build-nvidia/install/lib/cmake/dnnl # build all binary cmake --build build --config Release -j -v @@ -660,8 +669,9 @@ use 1 SYCL GPUs: [0] with Max compute units:512 |--------------------|---------------------------------------|---------------------------------------------| | GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.
FP32 path - recommended for better performance than FP16 on quantized model| | GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. | -| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. | +| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. | | GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. | +| GGML_SYCL_GRAPH | ON *(default)* \|OFF *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). | | CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. | | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. | @@ -671,6 +681,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512 |-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------| | GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG | | GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimized features based on Intel GPU type, to compare the performance increase | +| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. | | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.
Recommended to use when --split-mode = layer | diff --git a/examples/main/README.md b/examples/main/README.md index f7c2497294..e4b3590b5d 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -27,12 +27,24 @@ Once downloaded, place your model in the models folder in llama.cpp. ##### Input prompt (One-and-done) ```bash -./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --prompt "Once upon a time" +./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf -no-cnv --prompt "Once upon a time" ``` ##### Conversation mode (Allow for continuous interaction with the model) ```bash -./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf -cnv --chat-template gemma +./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --chat-template gemma +``` + +##### Conversation mode using built-in jinja chat template + +```bash +./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --jinja +``` + +##### One-and-done query using jinja with custom system prompt and a starting prompt + +```bash +./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --jinja --single-turn -sys "You are a helpful assistant" -p "Hello" ``` ##### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it): @@ -44,12 +56,24 @@ Once downloaded, place your model in the models folder in llama.cpp. ##### Input prompt (One-and-done) ```powershell -./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --prompt "Once upon a time" +./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf -no-cnv --prompt "Once upon a time" ``` ##### Conversation mode (Allow for continuous interaction with the model) ```powershell -./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf -cnv --chat-template gemma +./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --chat-template gemma +``` + +##### Conversation mode using built-in jinja chat template + +```powershell +./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --jinja +``` + +##### One-and-done query using jinja with custom system prompt and a starting prompt + +```powershell +./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --jinja --single-turn -sys "You are a helpful assistant" -p "Hello" ``` #### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it): @@ -77,6 +101,8 @@ The `llama-cli` program provides several ways to interact with the LLaMA models - `--prompt PROMPT`: Provide a prompt directly as a command-line option. - `--file FNAME`: Provide a file containing a prompt or multiple prompts. +- `--system-prompt PROMPT`: Provide a system prompt (otherwise the default one from the chat template is used, if provided). +- `--system-prompt-file FNAME`: Provide a file containing a system prompt. - `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.) ## Interaction @@ -89,7 +115,10 @@ In interactive mode, users can participate in text generation by injecting their - `-i, --interactive`: Run the program in interactive mode, allowing users to engage in real-time conversations or provide specific instructions to the model. - `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
-- `-cnv, --conversation`: Run the program in conversation mode (does not print special tokens and suffix/prefix, use default chat template) (default: false) +- `-cnv, --conversation`: Run the program in conversation mode (does not print special tokens and suffix/prefix, use default or provided chat template) (default: true if chat template found) +- `-no-cnv`: Disable conversation mode (default: false) +- `-st, --single-turn`: Only process a single conversation turn (user input) and then exit. +- `--jinja`: Enable jinja chat template parser, will use the model's built-in template or a user-provided one (default: false) - `--color`: Enable colorized output to visually distinguish between prompts, user input, and generated text. By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs. @@ -125,6 +154,8 @@ When --in-prefix or --in-suffix options are enabled the chat template ( --chat-t Example usage: `--chat-template gemma` +`--chat-template-file FNAME`: Load a custom jinja chat template from an external file, useful if the model's built-in template is outdated or incompatible; some examples can be found in models/templates. Up-to-date chat templates can be downloaded from Hugging Face using scripts/get_chat_template.py + ## Context Management During text generation, LLaMA models have a limited context size, which means they can only consider a certain number of tokens from the input and generated text. When the context fills up, the model resets internally, potentially losing some information from the beginning of the conversation or instructions. Context management options help maintain continuity and coherence in these situations. diff --git a/examples/run/run.cpp b/examples/run/run.cpp index 437f2533e5..462a6d1519 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -79,6 +79,7 @@ class Opt { ctx_params = llama_context_default_params(); model_params = llama_model_default_params(); context_size_default = ctx_params.n_batch; + n_threads_default = ctx_params.n_threads; ngl_default = model_params.n_gpu_layers; common_params_sampling sampling; temperature_default = sampling.temp; @@ -104,6 +105,7 @@ class Opt { ctx_params.n_batch = context_size >= 0 ? context_size : context_size_default; ctx_params.n_ctx = ctx_params.n_batch; + ctx_params.n_threads = ctx_params.n_threads_batch = n_threads >= 0 ? n_threads : n_threads_default; model_params.n_gpu_layers = ngl >= 0 ? ngl : ngl_default; temperature = temperature >= 0 ?
temperature : temperature_default; @@ -116,12 +118,12 @@ class Opt { std::string chat_template_file; std::string user; bool use_jinja = false; - int context_size = -1, ngl = -1; + int context_size = -1, ngl = -1, n_threads = -1; float temperature = -1; bool verbose = false; private: - int context_size_default = -1, ngl_default = -1; + int context_size_default = -1, ngl_default = -1, n_threads_default = -1; float temperature_default = -1; bool help = false; @@ -159,53 +161,94 @@ class Opt { return 0; } + int parse_options_with_value(int argc, const char ** argv, int & i, bool & options_parsing) { + if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) { + if (handle_option_with_value(argc, argv, i, context_size) == 1) { + return 1; + } + } else if (options_parsing && + (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "-ngl") == 0 || strcmp(argv[i], "--ngl") == 0)) { + if (handle_option_with_value(argc, argv, i, ngl) == 1) { + return 1; + } + } else if (options_parsing && (strcmp(argv[i], "-t") == 0 || strcmp(argv[i], "--threads") == 0)) { + if (handle_option_with_value(argc, argv, i, n_threads) == 1) { + return 1; + } + } else if (options_parsing && strcmp(argv[i], "--temp") == 0) { + if (handle_option_with_value(argc, argv, i, temperature) == 1) { + return 1; + } + } else if (options_parsing && strcmp(argv[i], "--chat-template-file") == 0) { + if (handle_option_with_value(argc, argv, i, chat_template_file) == 1) { + return 1; + } + use_jinja = true; + } else { + return 2; + } + + return 0; + } + + int parse_options(const char ** argv, int & i, bool & options_parsing) { + if (options_parsing && (parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) { + verbose = true; + } else if (options_parsing && strcmp(argv[i], "--jinja") == 0) { + use_jinja = true; + } else if (options_parsing && parse_flag(argv, i, "-h", "--help")) { + help = true; + return 0; + } else if (options_parsing && strcmp(argv[i], "--") == 0) { + options_parsing = false; + } else { + return 2; + } + + return 0; + } + + int parse_positional_args(const char ** argv, int & i, int & positional_args_i) { + if (positional_args_i == 0) { + if (!argv[i][0] || argv[i][0] == '-') { + return 1; + } + + ++positional_args_i; + model_ = argv[i]; + } else if (positional_args_i == 1) { + ++positional_args_i; + user = argv[i]; + } else { + user += " " + std::string(argv[i]); + } + + return 0; + } + int parse(int argc, const char ** argv) { bool options_parsing = true; for (int i = 1, positional_args_i = 0; i < argc; ++i) { - if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) { - if (handle_option_with_value(argc, argv, i, context_size) == 1) { - return 1; - } - } else if (options_parsing && - (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "-ngl") == 0 || strcmp(argv[i], "--ngl") == 0)) { - if (handle_option_with_value(argc, argv, i, ngl) == 1) { - return 1; - } - } else if (options_parsing && strcmp(argv[i], "--temp") == 0) { - if (handle_option_with_value(argc, argv, i, temperature) == 1) { - return 1; - } - } else if (options_parsing && - (parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) { - verbose = true; - } else if (options_parsing && strcmp(argv[i], "--jinja") == 0) { - use_jinja = true; - } else if (options_parsing && strcmp(argv[i], "--chat-template-file") == 0){ - if (handle_option_with_value(argc, argv, i, chat_template_file) == 1) { - return 1; - } - use_jinja = true; - } else 
if (options_parsing && parse_flag(argv, i, "-h", "--help")) { - help = true; - return 0; - } else if (options_parsing && strcmp(argv[i], "--") == 0) { - options_parsing = false; - } else if (positional_args_i == 0) { - if (!argv[i][0] || argv[i][0] == '-') { - return 1; - } + int ret = parse_options_with_value(argc, argv, i, options_parsing); + if (ret == 0) { + continue; + } else if (ret == 1) { + return ret; + } - ++positional_args_i; - model_ = argv[i]; - } else if (positional_args_i == 1) { - ++positional_args_i; - user = argv[i]; - } else { - user += " " + std::string(argv[i]); + ret = parse_options(argv, i, options_parsing); + if (ret == 0) { + continue; + } else if (ret == 1) { + return ret; + } + + if (parse_positional_args(argv, i, positional_args_i)) { + return 1; } } - if (model_.empty()){ + if (model_.empty()) { return 1; } @@ -232,6 +275,8 @@ class Opt { " Number of GPU layers (default: %d)\n" " --temp <value>\n" " Temperature (default: %.1f)\n" + " -t, --threads <value>\n" + " Number of threads to use during generation (default: %d)\n" " -v, --verbose, --log-verbose\n" " Set verbosity level to infinity (i.e. log all messages, useful for debugging)\n" " -h, --help\n" @@ -260,7 +305,7 @@ class Opt { " llama-run file://some-file3.gguf\n" " llama-run --ngl 999 some-file4.gguf\n" " llama-run --ngl 999 some-file5.gguf Hello World\n", - context_size_default, ngl_default, temperature_default); + context_size_default, ngl_default, temperature_default, n_threads_default); } }; diff --git a/examples/server/public/index.html.gz b/examples/server/public/index.html.gz index c7a3c426b6..d0e6da8e4a 100644 Binary files a/examples/server/public/index.html.gz and b/examples/server/public/index.html.gz differ diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 71e053b202..c2f1afeca4 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1872,6 +1872,10 @@ struct server_context { params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers; params_dft.n_parallel = 1; + // force F16 KV cache for the draft model for extra performance + params_dft.cache_type_k = GGML_TYPE_F16; + params_dft.cache_type_v = GGML_TYPE_F16; + llama_init_dft = common_init_from_params(params_dft); model_dft = llama_init_dft.model.get(); @@ -1892,10 +1896,6 @@ struct server_context { cparams_dft = common_context_params_to_llama(params_dft); cparams_dft.n_batch = n_ctx_dft; - // force F16 KV cache for the draft model for extra performance - cparams_dft.type_k = GGML_TYPE_F16; - cparams_dft.type_v = GGML_TYPE_F16; - // the context is not needed - we will create one for each slot llama_init_dft.context.reset(); } diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 36ad276fd3..58cdd6af92 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -621,7 +621,9 @@ static json oaicompat_completion_params_parse( llama_params["chat_format"] = static_cast<int>(chat_params.format); llama_params["prompt"] = chat_params.prompt; - llama_params["grammar"] = chat_params.grammar; + if (!chat_params.grammar.empty()) { + llama_params["grammar"] = chat_params.grammar; + } llama_params["grammar_lazy"] = chat_params.grammar_lazy; auto grammar_triggers = json::array(); for (const auto & trigger : chat_params.grammar_triggers) { diff --git a/examples/server/webui/src/components/ChatScreen.tsx b/examples/server/webui/src/components/ChatScreen.tsx index 79de305326..d12b06e125 100644 --- a/examples/server/webui/src/components/ChatScreen.tsx +++ 
b/examples/server/webui/src/components/ChatScreen.tsx @@ -99,13 +99,9 @@ export default function ChatScreen() { canvasData, replaceMessageAndGenerate, } = useAppContext(); - const [inputMsg, setInputMsg] = useState(prefilledMsg.content()); - const inputRef = useRef<HTMLTextAreaElement>(null); + const textarea = useOptimizedTextarea(prefilledMsg.content()); - const { extraContext, clearExtraContext } = useVSCodeContext( - inputRef, - setInputMsg - ); + const { extraContext, clearExtraContext } = useVSCodeContext(textarea); // TODO: improve this when we have "upload file" feature const currExtra: Message['extra'] = extraContext ? [extraContext] : undefined; @@ -135,9 +131,10 @@ export default function ChatScreen() { }; const sendNewMessage = async () => { - if (inputMsg.trim().length === 0 || isGenerating(currConvId ?? '')) return; - const lastInpMsg = inputMsg; - setInputMsg(''); + const lastInpMsg = textarea.value(); + if (lastInpMsg.trim().length === 0 || isGenerating(currConvId ?? '')) + return; + textarea.setValue(''); scrollToBottom(false); setCurrNodeId(-1); // get the last message node @@ -146,13 +143,13 @@ export default function ChatScreen() { !(await sendMessage( currConvId, lastMsgNodeId, - inputMsg, + lastInpMsg, currExtra, onChunk )) ) { // restore the input message if failed - setInputMsg(lastInpMsg); + textarea.setValue(lastInpMsg); } // OK clearExtraContext(); @@ -195,16 +192,13 @@ export default function ChatScreen() { // send the prefilled message if needed sendNewMessage(); } else { - // otherwise, focus on the input and move the cursor to the end - if (inputRef.current) { - inputRef.current.focus(); - inputRef.current.selectionStart = inputRef.current.value.length; - } + // otherwise, focus on the input + textarea.focus(); } prefilledMsg.clear(); // no need to keep track of sendNewMessage // eslint-disable-next-line react-hooks/exhaustive-deps - }, [inputRef]); + }, [textarea.ref]); // due to some timing issues of StorageUtils.appendMsg(), we need to make sure the pendingMsg is not duplicated upon rendering (i.e. appears once in the saved conversation and once in the pendingMsg) const pendingMsgDisplay: MessageDisplay[] = @@ -258,9 +252,7 @@ export default function ChatScreen() {