Merge branch 'master' into quantize

This commit is contained in:
Ed Addario 2025-10-25 12:18:54 +01:00
commit 8da14c0fc8
No known key found for this signature in database
GPG Key ID: E7875815A3230993
101 changed files with 15414 additions and 428 deletions

View File

@ -1305,6 +1305,81 @@ jobs:
cd examples/llama.android
./gradlew build --no-daemon
android-ndk-build:
runs-on: ubuntu-latest
env:
OPENCL_VERSION: 2025.07.22
strategy:
matrix:
include:
- build: 'arm64-cpu'
defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_CURL=OFF -D GGML_OPENMP=OFF'
- build: 'arm64-snapdragon'
defines: '--preset arm64-android-snapdragon-release'
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Install OpenCL Headers and Libs
id: install_opencl
if: ${{ matrix.build == 'arm64-snapdragon' }}
run: |
mkdir opencl
curl -L -o opencl/clhpp.tar.gz https://github.com/KhronosGroup/OpenCL-CLHPP/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
curl -L -o opencl/headers.tar.gz https://github.com/KhronosGroup/OpenCL-Headers/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
curl -L -o opencl/icd-loader.tar.gz https://github.com/KhronosGroup/OpenCL-ICD-Loader/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
tar -xaf opencl/headers.tar.gz -C opencl
tar -xaf opencl/clhpp.tar.gz -C opencl
tar -xaf opencl/icd-loader.tar.gz -C opencl
sudo cp -r opencl/OpenCL-Headers-${OPENCL_VERSION}/CL ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
sudo cp -r opencl/OpenCL-CLHPP-${OPENCL_VERSION}/include/CL/* ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include/CL
cd opencl/OpenCL-ICD-Loader-${OPENCL_VERSION}
cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -DOPENCL_ICD_LOADER_HEADERS_DIR=${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=31 -DANDROID_STL=c++_shared
cmake --build build
sudo cp build/libOpenCL.so ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
rm -rf opencl
- name: Install Hexagon SDK
id: install_hexsdk
if: ${{ matrix.build == 'arm64-snapdragon' }}
env:
HEXSDK_VER: 6.4.0.2
HEXTLS_VER: 19.0.04
run: |
curl -L -o hex-sdk.tar.gz https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v$HEXSDK_VER/hexagon-sdk-v$HEXSDK_VER-amd64-lnx.tar.xz
mkdir hex-sdk
tar -xaf hex-sdk.tar.gz -C hex-sdk
ls -l hex-sdk
sudo mv hex-sdk /opt/hexagon
echo "HEXAGON_SDK_ROOT=/opt/hexagon/$HEXSDK_VER" >> "$GITHUB_ENV"
echo "HEXAGON_TOOLS_ROOT=/opt/hexagon/$HEXSDK_VER/tools/HEXAGON_Tools/$HEXTLS_VER" >> "$GITHUB_ENV"
echo "DEFAULT_HLOS_ARCH=64" >> "$GITHUB_ENV"
echo "DEFAULT_TOOLS_VARIANT=toolv19" >> "$GITHUB_ENV"
echo "DEFAULT_NO_QURT_INC=0" >> "$GITHUB_ENV"
echo "DEFAULT_DSP_ARCH=v73" >> "$GITHUB_ENV"
- name: Update CMake presets
id: update_presets
if: ${{ matrix.build == 'arm64-snapdragon' }}
run: |
cp docs/backend/hexagon/CMakeUserPresets.json .
- name: Build
id: ndk_build
run: |
cmake ${{ matrix.defines }} -B build
cmake --build build
cmake --install build --prefix pkg-adb/llama.cpp
- name: Test
id: cmake_test
run: |
echo "FIXME: test on devices"
openEuler-latest-cmake-cann:
if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }}
defaults:

View File

@ -65,6 +65,7 @@
/ggml/src/ggml-impl.h @ggerganov @slaren
/ggml/src/ggml-metal/ @ggerganov
/ggml/src/ggml-opencl/ @lhez @max-krasnyansky
/ggml/src/ggml-hexagon/ @max-krasnyansky
/ggml/src/ggml-opt.cpp @JohannesGaessler
/ggml/src/ggml-quants.* @ggerganov
/ggml/src/ggml-rpc/ @rgerganov

View File

@ -280,6 +280,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
| [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE |
| [WebGPU [In Progress]](docs/build.md#webgpu) | All |
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
| [Hexagon [In Progress]](docs/backend/hexagon/README.md) | Snapdragon |
## Obtaining and quantizing models

View File

@ -3435,7 +3435,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.use_jinja = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
add_opt(common_arg(
{"--reasoning-format"}, "FORMAT",
"controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"

View File

@ -29,13 +29,30 @@ if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
from gguf.vocab import MistralTokenizerType, MistralVocab
from mistral_common.tokens.tokenizers.base import TokenizerVersion
from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN, DATASET_STD
from mistral_common.tokens.tokenizers.tekken import Tekkenizer
from mistral_common.tokens.tokenizers.sentencepiece import (
try:
from mistral_common.tokens.tokenizers.base import TokenizerVersion # pyright: ignore[reportMissingImports]
from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # pyright: ignore[reportMissingImports]
from mistral_common.tokens.tokenizers.tekken import Tekkenizer # pyright: ignore[reportMissingImports]
from mistral_common.tokens.tokenizers.sentencepiece import ( # pyright: ignore[reportMissingImports]
SentencePieceTokenizer,
)
_mistral_common_installed = True
_mistral_import_error_msg = ""
except ImportError:
_MISTRAL_COMMON_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
_MISTRAL_COMMON_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
_mistral_common_installed = False
TokenizerVersion = None
Tekkenizer = None
SentencePieceTokenizer = None
_mistral_import_error_msg = (
"Mistral format requires `mistral-common` to be installed. Please run "
"`pip install mistral-common[image,audio]` to install it."
)
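# Note (illustrative): the fallback mean/std above are the standard CLIP
# image-normalization constants, matching mistral-common's DATASET_MEAN and
# DATASET_STD, so mmproj conversion keeps working without the optional
# dependency installed.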
logger = logging.getLogger("hf-to-gguf")
@ -73,10 +90,8 @@ class ModelBase:
use_temp_file: bool
lazy: bool
dry_run: bool
part_names: list[str]
is_safetensors: bool
hparams: dict[str, Any]
tensor_names: set[str] | None
model_tensors: dict[str, Callable[[], Tensor]]
gguf_writer: gguf.GGUFWriter
model_name: str | None
metadata_override: Path | None
@ -107,6 +122,9 @@ class ModelBase:
type(self) is MmprojModel:
raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
if self.is_mistral_format and not _mistral_common_installed:
raise ImportError(_mistral_import_error_msg)
self.dir_model = dir_model
self.ftype = ftype
self.fname_out = fname_out
@ -117,25 +135,8 @@ class ModelBase:
self.dry_run = dry_run
self.remote_hf_model_id = remote_hf_model_id
self.sentence_transformers_dense_modules = sentence_transformers_dense_modules
if remote_hf_model_id is not None:
self.is_safetensors = True
def get_remote_tensors() -> Iterator[tuple[str, Tensor]]:
logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}")
remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id)
self.tensor_names = set(name for name in remote_tensors.keys())
for name, remote_tensor in remote_tensors.items():
yield (name, LazyTorchTensor.from_remote_tensor(remote_tensor))
self.get_tensors = get_remote_tensors
else:
prefix = "model" if not self.is_mistral_format else "consolidated"
self.part_names = ModelBase.get_model_part_names(self.dir_model, prefix, ".safetensors")
self.is_safetensors = len(self.part_names) > 0
if not self.is_safetensors:
self.part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
self.hparams = ModelBase.load_hparams(self.dir_model, self.is_mistral_format) if hparams is None else hparams
self.tensor_names = None
self.model_tensors = self.index_tensors(remote_hf_model_id=remote_hf_model_id)
self.metadata_override = metadata_override
self.model_name = model_name
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
@ -151,6 +152,8 @@ class ModelBase:
logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})")
self.ftype = gguf.LlamaFileType.MOSTLY_BF16
self.dequant_model()
# Configure GGUF Writer
self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)
@ -172,59 +175,79 @@ class ModelBase:
return None
raise KeyError(f"could not find any of: {keys}")
def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
tensor_names_from_parts: set[str] = set()
def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]:
tensors: dict[str, Callable[[], Tensor]] = {}
if remote_hf_model_id is not None:
is_safetensors = True
logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}")
remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id)
for name, remote_tensor in remote_tensors.items():
tensors[name] = lambda r=remote_tensor: LazyTorchTensor.from_remote_tensor(r)
return tensors
prefix = "model" if not self.is_mistral_format else "consolidated"
part_names: list[str] = ModelBase.get_model_part_names(self.dir_model, prefix, ".safetensors")
is_safetensors: bool = len(part_names) > 0
if not is_safetensors:
part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
tensor_names_from_index: set[str] = set()
if not self.is_mistral_format:
index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
index_name = "model.safetensors" if is_safetensors else "pytorch_model.bin"
index_name += ".index.json"
index_file = self.dir_model / index_name
if index_file.is_file():
self.tensor_names = set()
logger.info(f"gguf: loading model weight map from '{index_name}'")
with open(index_file, "r", encoding="utf-8") as f:
index: dict[str, Any] = json.load(f)
weight_map = index.get("weight_map")
if weight_map is None or not isinstance(weight_map, dict):
raise ValueError(f"Can't load 'weight_map' from {index_name!r}")
self.tensor_names.update(weight_map.keys())
tensor_names_from_index.update(weight_map.keys())
else:
self.tensor_names = tensor_names_from_parts
weight_map = {}
else:
self.tensor_names = tensor_names_from_parts
weight_map = {}
for part_name in self.part_names:
logger.info(f"gguf: loading model part '{part_name}'")
for part_name in part_names:
logger.info(f"gguf: indexing model part '{part_name}'")
ctx: ContextManager[Any]
if self.is_safetensors:
if is_safetensors:
from safetensors import safe_open
ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
else:
ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))
with ctx as model_part:
tensor_names_from_parts.update(model_part.keys())
assert model_part is not None
for name in model_part.keys():
if self.is_safetensors:
if is_safetensors:
if self.lazy:
data = model_part.get_slice(name)
data = LazyTorchTensor.from_safetensors_slice(data)
data_gen = lambda data=data: LazyTorchTensor.from_safetensors_slice(data) # noqa: E731
else:
data = model_part.get_tensor(name)
data_gen = lambda data=data: data # noqa: E731
else:
data = model_part[name]
if self.lazy:
data = LazyTorchTensor.from_eager(data)
yield name, data
data_gen = lambda data=data: LazyTorchTensor.from_eager(data) # noqa: E731
else:
data_gen = lambda data=data: data # noqa: E731
tensors[name] = data_gen
# verify tensor name presence and identify potentially missing files
if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0:
missing = sorted(self.tensor_names.difference(tensor_names_from_parts))
extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
if len(tensor_names_from_index) > 0:
tensor_names_from_parts = set(tensors.keys())
if len(tensor_names_from_parts.symmetric_difference(tensor_names_from_index)) > 0:
missing = sorted(tensor_names_from_index.difference(tensor_names_from_parts))
extra = sorted(tensor_names_from_parts.difference(tensor_names_from_index))
missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
if len(extra) == 0 and len(missing_files) > 0:
raise ValueError(f"Missing or incomplete model files: {missing_files}\n"
@ -234,6 +257,134 @@ class ModelBase:
f"Missing tensors: {missing}\n"
f"Extra tensors: {extra}")
return tensors
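# Note (illustrative): each entry maps a tensor name to a zero-argument
# callable; in lazy mode the weights are not materialized until the callable
# runs, which lets dequant_model rewrap entries without forcing eager loads.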
def dequant_model(self):
tensors_to_remove: list[str] = []
new_tensors: dict[str, Callable[[], Tensor]] = {}
if (quant_config := self.hparams.get("quantization_config")) and isinstance(quant_config, dict):
quant_method = quant_config.get("quant_method")
def dequant_bitnet(weight: Tensor, scale: Tensor) -> Tensor:
weight = weight.view(torch.uint8)
orig_shape = weight.shape
shift = torch.tensor([0, 2, 4, 6], dtype=torch.uint8).reshape((4, *(1 for _ in range(len(orig_shape)))))
data = weight.unsqueeze(0).expand((4, *orig_shape)) >> shift
data = data & 3
data = (data.float() - 1).reshape((orig_shape[0] * 4, *orig_shape[1:]))
# The scale is inverted
return data / scale.float()
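# Example (illustrative): a packed byte 0b00011011 unpacks via the shifts
# (0, 2, 4, 6) to the 2-bit codes (3, 2, 1, 0); subtracting 1 yields
# (2, 1, 0, -1), and the final division applies the inverted scale.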
def dequant_simple(weight: Tensor, scale: Tensor) -> Tensor:
scale = scale.float()
if (weight_block_size := quant_config.get("weight_block_size")):
# TODO: make sure it's a list of integers
for i, size in enumerate(weight_block_size):
scale = scale.repeat_interleave(size, i)
# unpad the scale (e.g. when the tensor size isn't a multiple of the block size)
scale = scale[tuple(slice(0, size) for size in weight.shape)]
return weight.float() * scale
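# Example (illustrative): with weight_block_size == [128, 128], a scale of
# shape (2, 3) expands via repeat_interleave to (256, 384) and is then sliced
# back down to the weight's (possibly non-multiple-of-128) shape.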
# ref: https://github.com/ModelCloud/GPTQModel/blob/037c5c0f6c9e33c500d975b038d02e7ca437546d/gptqmodel/nn_modules/qlinear/__init__.py#L437-L476
def dequant_gptq(g_idx: Tensor, qweight: Tensor, qzeros: Tensor, scales: Tensor) -> Tensor:
bits = quant_config["bits"]
assert bits in (2, 3, 4, 8)
assert qweight.dtype == qzeros.dtype
maxq = (2 ** bits) - 1
weight = None
zeros = None
pack_dtype_bits = qweight.dtype.itemsize * 8
if bits in [2, 4, 8]:
pack_factor = pack_dtype_bits // bits
wf = torch.tensor(list(range(0, pack_dtype_bits, bits)), dtype=torch.int32).unsqueeze(0)
if self.lazy:
wf = LazyTorchTensor.from_eager(wf)
zeros = torch.bitwise_right_shift(
qzeros.unsqueeze(2).expand(-1, -1, pack_factor),
wf.unsqueeze(0)
).to(torch.int16 if bits == 8 else torch.int8)
zeros = torch.bitwise_and(zeros, maxq).reshape(scales.shape)
weight = torch.bitwise_and(
torch.bitwise_right_shift(
qweight.unsqueeze(1).expand(-1, pack_factor, -1),
wf.unsqueeze(-1)
).to(torch.int16 if bits == 8 else torch.int8),
maxq
)
elif bits == 3:
raise NotImplementedError("3-bit gptq dequantization is not yet implemented")
assert weight is not None
assert zeros is not None
weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2])
# gptq_v2 doesn't need to offset zeros
if quant_config.get("checkpoint_format", "gptq") == "gptq":
zeros += 1
return (scales[g_idx].float() * (weight - zeros[g_idx]).float()).T
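# Example (illustrative): with bits == 4 and int32 packing, pack_factor is 8
# and wf == [0, 4, ..., 28], so every 32-bit word of qweight/qzeros unpacks
# into eight 4-bit values; g_idx then picks the per-row group scale and zero.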
if quant_method == "bitnet":
for name in self.model_tensors.keys():
if name.endswith(".weight_scale"):
weight_name = name.removesuffix("_scale")
w = self.model_tensors[weight_name]
s = self.model_tensors[name]
self.model_tensors[weight_name] = lambda w=w, s=s: dequant_bitnet(w(), s())
tensors_to_remove.append(name)
elif quant_method == "fp8":
for name in self.model_tensors.keys():
if name.endswith(".weight_scale_inv"):
weight_name = name.removesuffix("_scale_inv")
w = self.model_tensors[weight_name]
s = self.model_tensors[name]
self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s())
tensors_to_remove.append(name)
elif quant_method == "gptq":
for name in self.model_tensors.keys():
if name.endswith(".qweight"):
base_name = name.removesuffix(".qweight")
g_idx = self.model_tensors[base_name + ".g_idx"]
qweight = self.model_tensors[base_name + ".qweight"]
qzeros = self.model_tensors[base_name + ".qzeros"]
scales = self.model_tensors[base_name + ".scales"]
new_tensors[base_name + ".weight"] = (
lambda g=g_idx, z=qzeros, w=qweight, s=scales: dequant_gptq(
g(), w(), z(), s()
)
)
tensors_to_remove += [
base_name + n
for n in (
".g_idx",
".qzeros",
".qweight",
".scales",
)
]
else:
raise NotImplementedError(f"Quant method is not yet supported: {quant_method!r}")
for name in tensors_to_remove:
if name in self.model_tensors:
del self.model_tensors[name]
for name, value in new_tensors.items():
self.model_tensors[name] = value
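# Note: the `lambda w=w, s=s: ...` default-argument pattern above binds the
# current callables eagerly; a plain closure would late-bind the loop
# variables and every entry would end up dequantizing the last tensor.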
def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
for name, gen in self.model_tensors.items():
yield name, gen()
def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
if key not in gguf.MODEL_TENSORS[self.model_arch]:
raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}")
@ -1363,8 +1514,8 @@ class MmprojModel(ModelBase):
self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"]))
# preprocessor config
image_mean = DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
image_std = DATASET_STD if self.is_mistral_format else self.preprocessor_config["image_std"]
image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
image_std = _MISTRAL_COMMON_DATASET_STD if self.is_mistral_format else self.preprocessor_config["image_std"]
self.gguf_writer.add_vision_image_mean(image_mean)
self.gguf_writer.add_vision_image_std(image_std)
@ -2033,6 +2184,9 @@ class LlamaModel(TextModel):
self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
def _set_vocab_mistral(self):
if not _mistral_common_installed:
raise ImportError(_mistral_import_error_msg)
vocab = MistralVocab(self.dir_model)
logger.info(
f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}."
@ -4358,27 +4512,6 @@ class CodeShellModel(TextModel):
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(1.0)
_has_tok_embd = False
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD)
new_name = self.map_tensor_name(name)
# assuming token_embd.weight is seen before output.weight
if not self._has_tok_embd and new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
# even though the tensor file(s) does not contain the word embeddings they are still in the weight map
if self.tensor_names and "transformer.wte.weight" in self.tensor_names:
logger.debug(f"{tok_embd_name} not found before {output_name}, assuming they are tied")
self.tensor_names.remove("transformer.wte.weight")
elif new_name == tok_embd_name:
self._has_tok_embd = True
return [(new_name, data_torch)]
@ModelBase.register("InternLM2ForCausalLM")
class InternLM2Model(TextModel):
@ -8810,6 +8943,13 @@ class SmolLM3Model(LlamaModel):
class GptOssModel(TextModel):
model_arch = gguf.MODEL_ARCH.GPT_OSS
# TODO: remove once MXFP4 is supported more generally
def dequant_model(self):
quant_config = self.hparams.get("quantization_config")
if quant_config is not None and quant_config.get("quant_method") == "mxfp4":
return
return super().dequant_model()
def transform_nibble_layout(self, tensor):
assert tensor.dtype == torch.uint8
assert tensor.shape[-1] == 16
@ -9212,7 +9352,7 @@ class MistralModel(LlamaModel):
@staticmethod
def get_community_chat_template(vocab: MistralVocab, templates_dir: Path, is_mistral_format: bool):
assert TokenizerVersion is not None, "mistral_common is not installed"
assert TokenizerVersion is not None and Tekkenizer is not None and SentencePieceTokenizer is not None, _mistral_import_error_msg
assert isinstance(vocab.tokenizer, (Tekkenizer, SentencePieceTokenizer)), (
f"Expected Tekkenizer or SentencePieceTokenizer, got {type(vocab.tokenizer)}"
)
@ -9594,6 +9734,8 @@ def main() -> None:
fname_out = ModelBase.add_prefix_to_filename(fname_out, "mmproj-")
is_mistral_format = args.mistral_format
if is_mistral_format and not _mistral_common_installed:
raise ImportError(_mistral_import_error_msg)
disable_mistral_community_chat_template = args.disable_mistral_community_chat_template
with torch.inference_mode():

View File

@ -0,0 +1,49 @@
{
"version": 4,
"configurePresets": [
{
"name": "arm64-android-snapdragon",
"hidden": true,
"architecture": { "value": "arm64", "strategy": "external" },
"toolset": { "value": "host=x86_64", "strategy": "external" },
"cacheVariables": {
"ANDROID_ABI": "arm64-v8a",
"ANDROID_PLATFORM": "android-31",
"CMAKE_TOOLCHAIN_FILE": "$env{ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake",
"CMAKE_C_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
"CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
"CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG",
"CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG",
"CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
"CMAKE_CXX_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
"HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}",
"PREBUILT_LIB_DIR": "android_aarch64",
"GGML_OPENMP": "OFF",
"GGML_LLAMAFILE": "OFF",
"GGML_OPENCL": "ON",
"GGML_HEXAGON": "ON",
"LLAMA_CURL": "OFF"
}
},
{
"name": "arm64-windows-snapdragon",
"inherits": [ "base", "arm64-windows-llvm" ],
"cacheVariables": {
"HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}",
"PREBUILT_LIB_DIR": "windows_aarch64",
"GGML_OPENMP": "OFF",
"GGML_LLAMAFILE": "OFF",
"GGML_OPENCL": "ON",
"GGML_HEXAGON": "ON",
"LLAMA_CURL": "OFF"
}
},
{ "name": "arm64-android-snapdragon-debug" , "inherits": [ "base", "arm64-android-snapdragon", "debug" ] },
{ "name": "arm64-android-snapdragon-release", "inherits": [ "base", "arm64-android-snapdragon", "release" ] },
{ "name": "arm64-windows-snapdragon-debug" , "inherits": [ "base", "arm64-windows-snapdragon", "debug" ] },
{ "name": "arm64-windows-snapdragon-release", "inherits": [ "base", "arm64-windows-snapdragon", "release" ] }
]
}

View File

@ -0,0 +1,239 @@
# Snapdragon-based Android devices
## How to Build
The easiest way to build llama.cpp for a Snapdragon-based Android device is to use the toolchain Docker image (see github.com/snapdragon-toolchain).
This image includes the Android NDK, OpenCL SDK, Hexagon SDK, CMake, etc.
This method works on Linux, macOS, and Windows. macOS and Windows users should install Docker Desktop.
```
~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.3
[d]/> cd /workspace
```
The rest of the Android build process assumes that you're running inside the toolchain container.
Let's build llama.cpp with CPU, OpenCL, and Hexagon backends via CMake presets:
```
[d]/workspace> cp docs/backend/hexagon/CMakeUserPresets.json .
[d]/workspace> cmake --preset arm64-android-snapdragon-release -B build-snapdragon
Preset CMake variables:
ANDROID_ABI="arm64-v8a"
...
CMAKE_TOOLCHAIN_FILE="/opt/android-ndk-r28b/build/cmake/android.toolchain.cmake"
GGML_HEXAGON="ON"
GGML_OPENCL="ON"
GGML_OPENMP="OFF"
HEXAGON_SDK_ROOT="/opt/hexagon/6.4.0.2"
...
-- Including OpenCL backend
-- Including Hexagon backend
...
-- Build files have been written to: /workspace/build-snapdragon
[d]/workspace> cmake --build build-snapdragon
...
[144/356] Performing build step for 'htp-v73'
[1/16] Generating htp_iface_skel.c, htp_iface_stub.c, htp_iface.h
[2/16] Building C object CMakeFiles/ggml-htp-v73.dir/hvx-sigmoid.c.obj
[3/16] Building C object CMakeFiles/ggml-htp-v73.dir/htp-dma.c.obj
[4/16] Building C object CMakeFiles/ggml-htp-v73.dir/worker-pool.c.obj
...
-- Installing: /workspace/build-snapdragon/ggml/src/ggml-hexagon/libggml-htp-v73.so
-- Installing: /workspace/build-snapdragon/ggml/src/ggml-hexagon/libggml-htp-v75.so
...
```
To generate an installable "package", simply use `cmake --install`:
```
[d]/workspace> cmake --install build-snapdragon --prefix pkg-adb/llama.cpp
-- Install configuration: "Release"
-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-cpu.so
-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-opencl.so
-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-hexagon.so
-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v73.so
-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v75.so
-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v79.so
-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v81.so
-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml.so
...
-- Installing: /workspace/pkg-adb/llama.cpp/bin/llama-bench
-- Installing: /workspace/pkg-adb/llama.cpp/bin/llama-cli
...
```
## How to Install
For this step, your device needs to be configured for on-device development.
Please see https://developer.android.com/studio/debug/dev-options for details.
Once ADB is enabled, use `adb push` to install the `pkg-adb/llama.cpp` package on the device.
**Note that the toolchain Docker image doesn't have ADB and doesn't set up the ADB bridge. Please use native ADB on the host.**
```
~/src/llama.cpp$ adb push pkg-adb/llama.cpp /data/local/tmp/
pkg-adb/llama.cpp/bin/: 67 files pushed, 0 skipped. 190.2 MB/s (919095042 bytes in 4.607s)
pkg-adb/llama.cpp/include/: 19 files pushed, 0 skipped. 20.5 MB/s (255173 bytes in 0.012s)
pkg-adb/llama.cpp/lib/: 16 files pushed, 0 skipped. 144.4 MB/s (43801382 bytes in 0.289s)
102 files pushed, 0 skipped. 186.9 MB/s (963151597 bytes in 4.914s)
```
At this point, you should also install some models:
```
~/src/llama.cpp$ wget https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf
...
2025-10-11 12:04:52 (10.7 MB/s) - Llama-3.2-1B-Instruct-Q4_0.gguf saved [773025920/773025920]
~/src/llama.cpp$ adb push Llama-3.2-1B-Instruct-Q4_0.gguf /data/local/tmp/gguf
Llama-3.2-1B-Instruct-Q4_0.gguf: 1 file pushed, 0 skipped. 38.3 MB/s (773025920 bytes in 19.250s)
```
## How to Run
The easiest way to run the llama.cpp CLI tools is via the provided wrapper scripts, which set up all required environment variables.
llama.cpp supports three backends on Snapdragon-based devices: CPU, Adreno GPU (GPUOpenCL), and Hexagon NPU (HTP0-4).
You can select which backend to run the model on using the `D=` variable, which maps to the `--device` option.
The Hexagon NPU behaves as a "GPU" device when it comes to `-ngl` and other offload-related options.
Here are some examples of running various llama.cpp tools via ADB.
A simple question for Llama-3.2-1B:
```
~/src/llama.cpp$ M=Llama-3.2-1B-Instruct-Q4_0.gguf D=HTP0 ./scripts/snapdragon/adb/run-cli.sh -no-cnv -p "what is the most popular cookie in the world?"
...
ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1
ggml-hex: Hexagon Arch version v79
ggml-hex: allocating new session: HTP0
ggml-hex: new session: HTP0 : session-id 0 domain-id 3 uri file:///libggml-htp-v79.so?htp_iface_skel_handle_invoke&_modver=1.0&_dom=cdsp&_session=0 handle 0xb4000072c7955e50
...
load_tensors: offloading output layer to GPU
load_tensors: offloaded 17/17 layers to GPU
load_tensors: CPU model buffer size = 225.49 MiB
load_tensors: HTP0 model buffer size = 0.26 MiB
load_tensors: HTP0-REPACK model buffer size = 504.00 MiB
...
I hope this helps you understand the world's most popular cookies! [end of text]
...
llama_perf_sampler_print: sampling time = 30.08 ms / 487 runs ( 0.06 ms per token, 16191.77 tokens per second)
llama_perf_context_print: load time = 617.94 ms
llama_perf_context_print: prompt eval time = 80.76 ms / 11 tokens ( 7.34 ms per token, 136.21 tokens per second)
llama_perf_context_print: eval time = 9210.59 ms / 475 runs ( 19.39 ms per token, 51.57 tokens per second)
llama_perf_context_print: total time = 9454.92 ms / 486 tokens
llama_perf_context_print: graphs reused = 473
llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
llama_memory_breakdown_print: | - HTP0 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
llama_memory_breakdown_print: | - Host | 439 = 225 + 136 + 77 |
llama_memory_breakdown_print: | - HTP0-REPACK | 504 = 504 + 0 + 0 |
```
A summary request for OLMoE-1B-7B. This is a large model that requires two HTP sessions/devices:
```
~/src/llama.cpp$ M=OLMoE-1B-7B-0125-Instruct-Q4_0.gguf NDEV=2 D=HTP0,HTP1 ./scripts/snapdragon/adb/run-cli.sh -f surfing.txt -no-cnv
...
ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1
ggml-hex: Hexagon Arch version v81
ggml-hex: allocating new session: HTP0
ggml-hex: allocating new session: HTP1
...
load_tensors: offloading output layer to GPU
load_tensors: offloaded 17/17 layers to GPU
load_tensors: CPU model buffer size = 143.86 MiB
load_tensors: HTP1 model buffer size = 0.23 MiB
load_tensors: HTP1-REPACK model buffer size = 1575.00 MiB
load_tensors: HTP0 model buffer size = 0.28 MiB
load_tensors: HTP0-REPACK model buffer size = 2025.00 MiB
...
llama_context: CPU output buffer size = 0.19 MiB
llama_kv_cache: HTP1 KV buffer size = 238.00 MiB
llama_kv_cache: HTP0 KV buffer size = 306.00 MiB
llama_kv_cache: size = 544.00 MiB ( 8192 cells, 16 layers, 1/1 seqs), K (q8_0): 272.00 MiB, V (q8_0): 272.00 MiB
llama_context: HTP0 compute buffer size = 15.00 MiB
llama_context: HTP1 compute buffer size = 15.00 MiB
llama_context: CPU compute buffer size = 24.56 MiB
...
llama_perf_context_print: prompt eval time = 1730.57 ms / 212 tokens ( 8.16 ms per token, 122.50 tokens per second)
llama_perf_context_print: eval time = 5624.75 ms / 257 runs ( 21.89 ms per token, 45.69 tokens per second)
llama_perf_context_print: total time = 7377.33 ms / 469 tokens
llama_perf_context_print: graphs reused = 255
llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
llama_memory_breakdown_print: | - HTP0 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
llama_memory_breakdown_print: | - HTP1 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
llama_memory_breakdown_print: | - Host | 742 = 144 + 544 + 54 |
llama_memory_breakdown_print: | - HTP1-REPACK | 1575 = 1575 + 0 + 0 |
llama_memory_breakdown_print: | - HTP0-REPACK | 2025 = 2025 + 0 + 0 |
```
Op test for MUL_MAT and a quick `llama-bench` run:
```
~/src/llama.cpp$ HB=0 ./scripts/snapdragon/adb/run-tool.sh test-backend-ops -b HTP0 -o MUL_MAT
...
Backend 2/3: HTP0
Device description: Hexagon
Device memory: 2048 MB (2048 MB free)
MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK
MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=2,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK
MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=3,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK
~/src/llama.cpp-hexagon$ M=Llama-3.2-1B-Instruct-Q4_0.gguf ./scripts/snapdragon/adb/run-bench.sh -p 128 -n 64
...
ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1
ggml-hex: Hexagon Arch version v79
ggml-hex: allocating new session: HTP0
ggml-hex: new session: HTP0 : session-id 0 domain-id 3 uri file:///libggml-htp-v79.so?htp_iface_skel_handle_invoke&_modver=1.0&_dom=cdsp&_session=0 handle 0xb400007d4b231090
| model | size | params | backend | ngl | threads | n_batch | mmap | test | t/s |
| ---------------| ---------: | -----: | ---------- | --: | ------: | ------: | ---: | ----: | ------------: |
| llama 1B Q4_0 | 729.75 MiB | 1.24 B | HTP | 99 | 4 | 128 | 0 | pp128 | 169.42 ± 1.75 |
| llama 1B Q4_0 | 729.75 MiB | 1.24 B | HTP | 99 | 4 | 128 | 0 | tg64 | 51.54 ± 1.13 |
build: 6a8cf8914 (6733)
```
## Environment variables
- `GGML_HEXAGON_NDEV=1`
Controls the number of devices/sessions to allocate. The default is 1.
Most quantized models under 4B fit into a single session; an 8B model needs two, and a 20B model needs four.
- `GGML_HEXAGON_NHVX=0`
Controls the number of HVX hardware threads to use. The default is to use all of them (the actual number varies with the hardware version).
- `GGML_HEXAGON_HOSTBUF=1`
Controls whether the Hexagon backend allocates host buffers. By default, all buffers except for REPACK are host buffers.
Setting it to 0 (e.g. `HB=0` with the wrapper scripts) is required for testing Ops that use REPACK buffers (MUL_MAT and MUL_MAT_ID).
- `GGML_HEXAGON_VERBOSE=1`
Enables verbose logging of Ops from the backend. Example output:
```
ggml-hex: HTP0 graph-compute n_nodes 2
ggml-hex: HTP0 matmul : blk.27.ffn_up.weight x ffn_norm-27 -> ffn_up-27 : 3072:8192 x 3072:1 -> 8192:1 : q4_0 x f32 -> f32 : HTP0 x HTP0 -> HTP0 : flags 0x1
ggml-hex: HTP0 matmul : blk.27.ffn_gate.weight x ffn_norm-27 -> ffn_gate-27 : 3072:8192 x 3072:1 -> 8192:1 : q4_0 x f32 -> f32 : HTP0 x HTP0 -> HTP0 : flags 0x3
ggml-hex: HTP0 graph-compute n_nodes 1
ggml-hex: HTP0 matmul : blk.27.ffn_down.weight x ffn_gate_par-27 -> ffn_out-27 : 8192:3072 x 8192:1 -> 3072:1 : q4_0 x f32 -> f32 : HTP0 x HTP0 -> HTP0 : flags 0x0
ggml-hex: HTP0 get-tensor result_output : data 0x7592487000 offset 0 size 513024
```
- `GGML_HEXAGON_PROFILE=1`
Generates a host-side profile for the ggml-hexagon Ops.
- `GGML_HEXAGON_OPMASK=0x0`
Allows enabling specific stages of the processing pipeline:
- `0x1` Enable Op Queue (i.e., queuing Ops into NPU)
- `0x2` Enable Dynamic Quantizer (if needed for the Op)
- `0x4` Enable Op Compute (MUL_MAT, etc.)
Examples:
`GGML_HEXAGON_OPMASK=0x1 llama-cli ...` - Ops are enqueued but NPU-side processing is stubbed out
`GGML_HEXAGON_OPMASK=0x3 llama-cli ...` - NPU performs dynamic quantization and skips the rest
`GGML_HEXAGON_OPMASK=0x7 llama-cli ...` - Full queuing and processing of Ops (default)
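Since the mask is a plain bitfield, stages compose with bitwise OR. A minimal decoding sketch in Python (illustrative only; the constant and function names are made up for this example):
```
OPMASK_QUEUE    = 0x1  # enqueue Ops into the NPU
OPMASK_QUANTIZE = 0x2  # dynamic quantizer
OPMASK_COMPUTE  = 0x4  # Op compute (MUL_MAT, etc.)

def decode_opmask(mask: int) -> list[str]:
    """Return the pipeline stages enabled by a GGML_HEXAGON_OPMASK value."""
    stages = [(OPMASK_QUEUE, "queue"), (OPMASK_QUANTIZE, "quantize"), (OPMASK_COMPUTE, "compute")]
    return [name for bit, name in stages if mask & bit]

print(decode_opmask(0x3))  # ['queue', 'quantize']
print(decode_opmask(0x7))  # ['queue', 'quantize', 'compute'] (the default)
```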

View File

@ -0,0 +1,109 @@
# Hexagon backend developer details
## Backend libraries
The Hexagon backend consists of two parts:
- `libggml-hexagon`
This is the regular CPU-side GGML backend library, either shared or statically linked.
- `libggml-htp-vNN`
This is the NPU-side (HTP stands for Hexagon Tensor Processor) shared library that contains the Op dispatcher and kernels.
The correct library is selected automatically at runtime based on the HW version.
Here is an example of the build artifacts:
```
~/src/llama.cpp$ ls -l pkg-adb/llama.cpp/lib/libggml*
pkg-adb/llama.cpp/lib/libggml-base.so
pkg-adb/llama.cpp/lib/libggml-cpu.so
pkg-adb/llama.cpp/lib/libggml-hexagon.so <<< CPU library
pkg-adb/llama.cpp/lib/libggml-htp-v73.so <<< HTP op/kernels for Hexagon v73
pkg-adb/llama.cpp/lib/libggml-htp-v75.so
pkg-adb/llama.cpp/lib/libggml-htp-v79.so
pkg-adb/llama.cpp/lib/libggml-htp-v81.so
```
## Memory buffers
The Hexagon NPU backend takes advantage of Snapdragon's unified memory model, where all buffers are fully accessible by the CPU, GPU, and NPU.
The NPU does have a dedicated tightly-coupled memory called VTCM, but that memory is used only for intermediate data (e.g. dynamically
quantized tensors) or temporary data (chunks of the weight tensors fetched via DMA).
Please note that currently the Hexagon backend does not implement SET/GET_ROWS Ops because there is no advantage in offloading those
to the NPU at this point.
The backend does allocate non-host buffers for tensors whose data types require repacking: Q4_0, Q8_0, MXFP4.
From the MMU perspective these are still regular buffers (normally accessible by the CPU); they are marked as non-host simply to force
the repacking.
## Large model handling
A Hexagon NPU session (aka a Process Domain (PD) in the Hexagon docs) is limited to a memory mapping of around 3.5GB.
In llama.cpp/GGML, each Hexagon session is mapped to a single GGML backend device (HTP0, HTP1, etc).
In order to map models larger than 3.5GB, we need to allocate multiple devices and split the model.
For this we take advantage of the llama.cpp/GGML multi-GPU layer-splitting support.
Each Hexagon device behaves like a GPU from the offload and model-splitting perspective.
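As a rough rule of thumb, the session count follows from this limit. A minimal Python sketch (the ~3.5GB per-session figure is the approximation quoted above; the example sizes are assumptions):
```
import math

PD_MAPPING_LIMIT_GIB = 3.5  # approximate per-session mapping limit

def estimate_ndev(model_size_gib: float) -> int:
    """Rough GGML_HEXAGON_NDEV estimate for a model of the given size."""
    return max(1, math.ceil(model_size_gib / PD_MAPPING_LIMIT_GIB))

print(estimate_ndev(0.75))  # 1 -> e.g. Llama-3.2-1B Q4_0
print(estimate_ndev(11.0))  # 4 -> e.g. GPT-OSS-20B Q4_0
```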
Here is an example of running the GPT-OSS-20B model on a newer Snapdragon device with 16GB of DDR:
```
M=gpt-oss-20b-Q4_0.gguf NDEV=4 D=HTP0,HTP1,HTP2,HTP3 P=surfing.txt scripts/snapdragon/adb/run-cli.sh -no-cnv -f surfing.txt -n 32
...
LD_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib
ADSP_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib
GGML_HEXAGON_NDEV=4 ./bin/llama-cli --no-mmap -m /data/local/tmp/llama.cpp/../gguf/gpt-oss-20b-Q4_0.gguf
-t 4 --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on -ngl 99 --device HTP0,HTP1,HTP2,HTP3 -no-cnv -f surfing.txt
...
llama_model_loader: - type f32: 289 tensors
llama_model_loader: - type q4_0: 96 tensors
llama_model_loader: - type q8_0: 2 tensors
llama_model_loader: - type mxfp4: 72 tensors
...
load_tensors: offloaded 25/25 layers to GPU
load_tensors: CPU model buffer size = 1182.09 MiB
load_tensors: HTP1 model buffer size = 6.64 MiB
load_tensors: HTP1-REPACK model buffer size = 2505.94 MiB
load_tensors: HTP3 model buffer size = 5.55 MiB
load_tensors: HTP3-REPACK model buffer size = 2088.28 MiB
load_tensors: HTP0 model buffer size = 7.75 MiB
load_tensors: HTP0-REPACK model buffer size = 2923.59 MiB
load_tensors: HTP2 model buffer size = 6.64 MiB
load_tensors: HTP2-REPACK model buffer size = 2505.94 MiB
...
llama_context: n_ctx_per_seq (8192) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
llama_context: CPU output buffer size = 0.77 MiB
llama_kv_cache_iswa: creating non-SWA KV cache, size = 8192 cells
llama_kv_cache: HTP1 KV buffer size = 25.50 MiB
llama_kv_cache: HTP3 KV buffer size = 25.50 MiB
llama_kv_cache: HTP0 KV buffer size = 25.50 MiB
llama_kv_cache: HTP2 KV buffer size = 25.50 MiB
llama_kv_cache: size = 102.00 MiB ( 8192 cells, 12 layers, 1/1 seqs), K (q8_0): 51.00 MiB, V (q8_0): 51.00 MiB
llama_kv_cache_iswa: creating SWA KV cache, size = 256 cells
llama_kv_cache: HTP1 KV buffer size = 0.80 MiB
llama_kv_cache: HTP3 KV buffer size = 0.53 MiB
llama_kv_cache: HTP0 KV buffer size = 1.06 MiB
llama_kv_cache: HTP2 KV buffer size = 0.80 MiB
llama_kv_cache: size = 3.19 MiB ( 256 cells, 12 layers, 1/1 seqs), K (q8_0): 1.59 MiB, V (q8_0): 1.59 MiB
llama_context: HTP0 compute buffer size = 16.06 MiB
llama_context: HTP1 compute buffer size = 16.06 MiB
llama_context: HTP2 compute buffer size = 16.06 MiB
llama_context: HTP3 compute buffer size = 16.06 MiB
llama_context: CPU compute buffer size = 98.19 MiB
...
llama_perf_context_print: prompt eval time = 3843.67 ms / 197 tokens ( 19.51 ms per token, 51.25 tokens per second)
llama_perf_context_print: eval time = 1686.13 ms / 31 runs ( 54.39 ms per token, 18.39 tokens per second)
llama_perf_context_print: total time = 6266.30 ms / 228 tokens
llama_perf_context_print: graphs reused = 30
llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
llama_memory_breakdown_print: | - HTP0 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
llama_memory_breakdown_print: | - HTP1 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
llama_memory_breakdown_print: | - HTP2 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
llama_memory_breakdown_print: | - HTP3 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
llama_memory_breakdown_print: | - Host | 1476 = 1208 + 105 + 162 |
llama_memory_breakdown_print: | - HTP1-REPACK | 2505 = 2505 + 0 + 0 |
llama_memory_breakdown_print: | - HTP3-REPACK | 2088 = 2088 + 0 + 0 |
llama_memory_breakdown_print: | - HTP0-REPACK | 2923 = 2923 + 0 + 0 |
llama_memory_breakdown_print: | - HTP2-REPACK | 2505 = 2505 + 0 + 0 |
```

View File

@ -72,7 +72,7 @@ Legend:
| OPT_STEP_SGD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
| PAD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ |
| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | | ❌ | ❌ |
| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | | ❌ | ❌ |
| POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |

View File

@ -9379,8 +9379,8 @@
"SYCL0","ACC","type=f32,ne_a=[256,17,1,1],ne_b=[256,16,1,1]","support","1","yes","SYCL"
"SYCL0","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1","support","1","yes","SYCL"
"SYCL0","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0","support","1","yes","SYCL"
"SYCL0","PAD_REFLECT_1D","type=f32,ne_a=[512,34,2,1],pad_0=10,pad_1=9","support","0","no","SYCL"
"SYCL0","PAD_REFLECT_1D","type=f32,ne_a=[3000,384,4,1],pad_0=10,pad_1=9","support","0","no","SYCL"
"SYCL0","PAD_REFLECT_1D","type=f32,ne_a=[3000,384,4,1],pad_0=10,pad_1=9","support","0","yes","SYCL"
"SYCL0","PAD_REFLECT_1D","type=f32,ne_a=[512,34,2,1],pad_0=10,pad_1=9","support","0","yes","SYCL"
"SYCL0","ROLL","shift0=3,shift1=-2,shift3=1,shift4=-1","support","0","no","SYCL"
"SYCL0","ARANGE","type=f32,start=0.000000,stop=10.000000,step=1.000000","support","0","no","SYCL"
"SYCL0","TIMESTEP_EMBEDDING","type=f32,ne_a=[2,1,1,1],dim=320,max_period=10000","support","1","yes","SYCL"


View File

@ -138,7 +138,7 @@ if model_path is None:
"Model path must be specified either via --model-path argument or MODEL_PATH environment variable"
)
config = AutoConfig.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
print("Model type: ", config.model_type)
print("Vocab size: ", config.vocab_size)
@ -148,8 +148,8 @@ print("BOS token id: ", config.bos_token_id)
print("EOS token id: ", config.eos_token_id)
print("Loading model and tokenizer using AutoTokenizer:", model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
if unreleased_model_name:
model_name_lower = unreleased_model_name.lower()
@ -171,7 +171,7 @@ if unreleased_model_name:
exit(1)
else:
model = AutoModelForCausalLM.from_pretrained(
model_path, device_map="auto", offload_folder="offload"
model_path, device_map="auto", offload_folder="offload", trust_remote_code=True
)
for name, module in model.named_modules():

View File

@ -251,6 +251,8 @@ option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adr
set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
"gmml: OpenCL API version to target")
option(GGML_HEXAGON "ggml: enable Hexagon backend" OFF)
# toolchain for vulkan-shaders-gen
set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")

View File

@ -0,0 +1,19 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
// backend API
GGML_BACKEND_API ggml_backend_t ggml_backend_hexagon_init(void);
GGML_BACKEND_API bool ggml_backend_is_hexagon(ggml_backend_t backend);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_hexagon_reg(void);
#ifdef __cplusplus
}
#endif

View File

@ -402,6 +402,7 @@ ggml_add_backend(Vulkan)
ggml_add_backend(WebGPU)
ggml_add_backend(zDNN)
ggml_add_backend(OpenCL)
ggml_add_backend(Hexagon)
foreach (target ggml-base ggml)
target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)

View File

@ -57,6 +57,10 @@
#include "ggml-opencl.h"
#endif
#ifdef GGML_USE_HEXAGON
#include "ggml-hexagon.h"
#endif
#ifdef GGML_USE_BLAS
#include "ggml-blas.h"
#endif
@ -199,6 +203,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_OPENCL
register_backend(ggml_backend_opencl_reg());
#endif
#ifdef GGML_USE_HEXAGON
register_backend(ggml_backend_hexagon_reg());
#endif
#ifdef GGML_USE_CANN
register_backend(ggml_backend_cann_reg());
#endif
@ -598,6 +605,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
ggml_backend_load_best("sycl", silent, dir_path);
ggml_backend_load_best("vulkan", silent, dir_path);
ggml_backend_load_best("opencl", silent, dir_path);
ggml_backend_load_best("hexagon", silent, dir_path);
ggml_backend_load_best("musa", silent, dir_path);
ggml_backend_load_best("cpu", silent, dir_path);
// check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend

View File

@ -1,5 +1,81 @@
#include "argsort.cuh"
#ifdef GGML_CUDA_USE_CUB
# include <cub/cub.cuh>
using namespace cub;
#endif // GGML_CUDA_USE_CUB
static __global__ void init_indices(int * indices, const int ncols, const int nrows) {
const int col = blockIdx.x * blockDim.x + threadIdx.x;
const int row = blockIdx.y;
if (col < ncols && row < nrows) {
indices[row * ncols + col] = col;
}
}
static __global__ void init_offsets(int * offsets, const int ncols, const int nrows) {
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx <= nrows) {
offsets[idx] = idx * ncols;
}
}
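// Example (illustrative): for ncols == 4 and nrows == 3 this fills the
// CSR-style segment offsets {0, 4, 8, 12}; the segmented sort below reads
// segment begins from d_offsets and segment ends from d_offsets + 1.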
#ifdef GGML_CUDA_USE_CUB
static void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
const float * x,
int * dst,
const int ncols,
const int nrows,
ggml_sort_order order,
cudaStream_t stream) {
ggml_cuda_pool_alloc<int> temp_indices_alloc(pool, ncols * nrows);
ggml_cuda_pool_alloc<float> temp_keys_alloc(pool, ncols * nrows);
ggml_cuda_pool_alloc<int> offsets_alloc(pool, nrows + 1);
int * temp_indices = temp_indices_alloc.get();
float * temp_keys = temp_keys_alloc.get();
int * d_offsets = offsets_alloc.get();
static const int block_size = 256;
const dim3 grid_size((ncols + block_size - 1) / block_size, nrows);
init_indices<<<grid_size, block_size, 0, stream>>>(temp_indices, ncols, nrows);
const dim3 offset_grid((nrows + block_size - 1) / block_size);
init_offsets<<<offset_grid, block_size, 0, stream>>>(d_offsets, ncols, nrows);
cudaMemcpyAsync(temp_keys, x, ncols * nrows * sizeof(float), cudaMemcpyDeviceToDevice, stream);
size_t temp_storage_bytes = 0;
if (order == GGML_SORT_ORDER_ASC) {
DeviceSegmentedRadixSort::SortPairs(nullptr, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
temp_indices, dst, // values (indices)
ncols * nrows, nrows, // num items, num segments
d_offsets, d_offsets + 1, 0, sizeof(float) * 8, // all bits
stream);
} else {
DeviceSegmentedRadixSort::SortPairsDescending(nullptr, temp_storage_bytes, temp_keys, temp_keys, temp_indices,
dst, ncols * nrows, nrows, d_offsets, d_offsets + 1, 0,
sizeof(float) * 8, stream);
}
ggml_cuda_pool_alloc<uint8_t> temp_storage_alloc(pool, temp_storage_bytes);
void * d_temp_storage = temp_storage_alloc.get();
if (order == GGML_SORT_ORDER_ASC) {
DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, temp_indices, dst,
ncols * nrows, nrows, d_offsets, d_offsets + 1, 0, sizeof(float) * 8,
stream);
} else {
DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys,
temp_indices, dst, ncols * nrows, nrows, d_offsets, d_offsets + 1,
0, sizeof(float) * 8, stream);
}
}
#endif // GGML_CUDA_USE_CUB
// Bitonic sort implementation
template<typename T>
static inline __device__ void ggml_cuda_swap(T & a, T & b) {
T tmp = a;
@ -65,7 +141,12 @@ static int next_power_of_2(int x) {
return n;
}
static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
static void argsort_f32_i32_cuda_bitonic(const float * x,
int * dst,
const int ncols,
const int nrows,
ggml_sort_order order,
cudaStream_t stream) {
// bitonic sort requires ncols to be power of 2
const int ncols_pad = next_power_of_2(ncols);
@ -77,9 +158,11 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);
if (order == GGML_SORT_ORDER_ASC) {
k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
k_argsort_f32_i32<GGML_SORT_ORDER_ASC>
<<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
} else if (order == GGML_SORT_ORDER_DESC) {
k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
k_argsort_f32_i32<GGML_SORT_ORDER_DESC>
<<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
} else {
GGML_ABORT("fatal error");
}
@ -100,5 +183,18 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
#ifdef GGML_CUDA_USE_CUB
const int ncols_pad = next_power_of_2(ncols);
const size_t shared_mem = ncols_pad * sizeof(int);
const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
if (shared_mem > max_shared_mem || ncols > 1024) {
ggml_cuda_pool & pool = ctx.pool();
argsort_f32_i32_cuda_cub(pool, src0_d, (int *) dst_d, ncols, nrows, order, stream);
} else {
argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream);
}
#else
argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream);
#endif
}
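Both paths above produce the same result; the CUB segmented-radix-sort path simply handles rows too wide for the shared-memory bitonic kernel. A minimal NumPy sketch of the expected per-row semantics (a semantic reference only, not part of the change):
```
import numpy as np

def argsort_rows(x: np.ndarray, descending: bool = False) -> np.ndarray:
    """Per-row argsort of a (nrows, ncols) array, as computed by GGML's ARGSORT."""
    order = np.argsort(x, axis=-1)
    return order[:, ::-1] if descending else order

x = np.array([[0.3, 0.1, 0.2]])
print(argsort_rows(x))                   # [[1 2 0]]
print(argsort_rows(x, descending=True))  # [[0 2 1]]
```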

View File

@ -272,7 +272,7 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
const uint3 ne12 = init_fastdiv_values((uint32_t) cne1[2]);
const uint3 ne13 = init_fastdiv_values((uint32_t) cne1[3]);
if (block_nums.z > 65535) {
if (block_nums.z > 65535 || block_nums.y > 65535) {
int block_num = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size;
const uint3 prod_012 = init_fastdiv_values((uint32_t) (ne0 * ne1 * ne2));
const uint3 prod_01 = init_fastdiv_values((uint32_t) (ne0 * ne1));

View File

@ -895,6 +895,7 @@ void launch_fattn(
const dim3 block_dim(warp_size, nwarps, 1);
int max_blocks_per_sm = 1; // Max. number of active blocks limited by occupancy.
CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, fattn_kernel, block_dim.x * block_dim.y * block_dim.z, nbytes_shared));
GGML_ASSERT(max_blocks_per_sm > 0);
int parallel_blocks = max_blocks_per_sm;
dim3 blocks_num;

View File

@ -2818,18 +2818,15 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
#endif
//TODO: remove special case once ggml_can_fuse can handle empty nodes
std::initializer_list<enum ggml_op> topk_moe_ops = ggml_cuda_topk_moe_ops(false);
std::initializer_list<enum ggml_op> topk_moe_ops_with_norm = ggml_cuda_topk_moe_ops(true);
std::initializer_list<enum ggml_op> topk_moe_ops =
ggml_cuda_topk_moe_ops(/*with_norm*/ false, /*delayed_softmax=*/false);
std::initializer_list<enum ggml_op> topk_moe_ops_with_norm =
ggml_cuda_topk_moe_ops(/*with_norm=*/true, /*delayed_softmax=*/false);
std::initializer_list<enum ggml_op> topk_moe_ops_delayed_softmax =
ggml_cuda_topk_moe_ops(/*with_norm=*/false, /*delayed_softmax=*/true);
if (ops.size() == topk_moe_ops_with_norm.size() && std::equal(ops.begin(), ops.end(), topk_moe_ops_with_norm.begin())) {
if (node_idx + topk_moe_ops_with_norm.size() > (size_t)cgraph->n_nodes) {
return false;
}
for (size_t i = 0; i < topk_moe_ops_with_norm.size(); i++) {
if (cgraph->nodes[node_idx + i]->op != topk_moe_ops_with_norm.begin()[i]) return false;
}
if (ops.size() == topk_moe_ops_with_norm.size() &&
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 8 })) {
ggml_tensor * softmax = cgraph->nodes[node_idx];
ggml_tensor * weights = cgraph->nodes[node_idx+8];
@ -2838,16 +2835,8 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
}
}
if (ops.size() == topk_moe_ops.size() && std::equal(ops.begin(), ops.end(), topk_moe_ops.begin())) {
if (node_idx + topk_moe_ops.size() > (size_t)cgraph->n_nodes) {
return false;
}
for (size_t i = 0; i < topk_moe_ops.size(); i++) {
if (cgraph->nodes[node_idx + i]->op != topk_moe_ops.begin()[i]) return false;
}
if (ops.size() == topk_moe_ops.size() &&
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 4 })) {
ggml_tensor * softmax = cgraph->nodes[node_idx];
ggml_tensor * weights = cgraph->nodes[node_idx+4];
if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
@ -2855,6 +2844,16 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
}
}
if (ops.size() == topk_moe_ops_delayed_softmax.size() &&
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 2, node_idx + 5 })) {
ggml_tensor * softmax = cgraph->nodes[node_idx + 4];
ggml_tensor * weights = cgraph->nodes[node_idx + 5];
if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
return true;
}
}
if (!ggml_can_fuse(cgraph, node_idx, ops)) {
return false;
}
@ -2948,7 +2947,8 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ true), {})) {
ggml_tensor * weights = cgraph->nodes[i+8];
ggml_tensor * selected_experts = cgraph->nodes[i+3];
ggml_cuda_op_topk_moe(*cuda_ctx, node, weights, selected_experts, /*with norm*/ true);
ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, selected_experts, /*with norm*/ true,
/*delayed softmax*/ false);
i += 8;
continue;
}
@ -2956,11 +2956,23 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ false), {})) {
ggml_tensor * weights = cgraph->nodes[i+4];
ggml_tensor * selected_experts = cgraph->nodes[i+3];
ggml_cuda_op_topk_moe(*cuda_ctx, node, weights, selected_experts, /*with norm*/ false);
ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, selected_experts, /*with norm*/ false,
/*delayed softmax*/ false);
i += 4;
continue;
}
if (ggml_cuda_can_fuse(cgraph, i,
ggml_cuda_topk_moe_ops(/*with norm*/ false, /*delayed softmax*/ true), {})) {
ggml_tensor * weights = cgraph->nodes[i + 5];
ggml_tensor * ids = cgraph->nodes[i + 1];
ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, ids, /*with norm*/ false,
/*delayed_softmax*/ true);
i += 5;
continue;
}
if (node->op == GGML_OP_ADD) {
int n_fuse = 0;
ggml_op ops[8];
@ -3630,8 +3642,11 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_SUM:
return ggml_is_contiguous_rows(op->src[0]);
case GGML_OP_ARGSORT:
// TODO: Support arbitrary column width
#ifndef GGML_CUDA_USE_CUB
return op->src[0]->ne[0] <= 1024;
#else
return true;
#endif
case GGML_OP_SUM_ROWS:
case GGML_OP_MEAN:
case GGML_OP_GROUP_NORM:

View File

@ -4,16 +4,61 @@
#include <initializer_list>
// Warp-local softmax used for both the pre-top-k logits and the post-top-k delayed path.
template <int experts_per_thread, bool use_limit>
__device__ void softmax_warp_inplace(float (&vals)[experts_per_thread], const int limit, const int lane) {
float max_val = -INFINITY;
#pragma unroll
for (int i = 0; i < experts_per_thread; i++) {
const int idx = lane + i * WARP_SIZE;
const bool active = !use_limit || (idx < limit);
if (active) {
max_val = max(max_val, vals[i]);
}
}
max_val = warp_reduce_max(max_val);
float sum = 0.f;
#pragma unroll
for (int i = 0; i < experts_per_thread; i++) {
const int idx = lane + i * WARP_SIZE;
const bool active = !use_limit || (idx < limit);
if (active) {
const float val = expf(vals[i] - max_val);
vals[i] = val;
sum += val;
} else {
vals[i] = 0.f;
}
}
sum = warp_reduce_sum(sum);
const float inv_sum = 1.0f / sum;
#pragma unroll
for (int i = 0; i < experts_per_thread; i++) {
const int idx = lane + i * WARP_SIZE;
const bool active = !use_limit || (idx < limit);
if (active) {
vals[i] *= inv_sum;
}
}
}
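// Note (illustrative): the vals array is warp-strided, so lane l holds
// elements l, l + WARP_SIZE, ...; the reductions above combine the partial
// max/sum across the warp before the in-place normalization.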
/*
This kernel does the following:
1. softmax over the logits per token [n_experts, n_tokens]
1. optionally softmax over the logits per token [n_experts, n_tokens]
2. argmax reduce over the top-k (n_experts_used) logits
3. write weights + ids to global memory
4. optionally normalize the weights
4. optionally normalize the weights or apply softmax over the selected logits
It is intended as fusion of softmax->top-k->get_rows pipeline for MoE models
*/
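// Reference semantics (illustrative): per token, optionally softmax the
// n_experts logits, argmax-select the top n_expert_used entries, then either
// normalize the selected weights to sum to 1 (with_norm) or softmax just the
// selected logits (delayed_softmax).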
template <int n_experts, bool with_norm>
template <int n_experts, bool with_norm, bool delayed_softmax = false>
__launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float * logits,
float * weights,
int32_t * ids,
@ -30,51 +75,31 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
constexpr int experts_per_thread = (n_experts > WARP_SIZE) ? n_experts / WARP_SIZE : 1;
float logits_r[experts_per_thread];
float wt[experts_per_thread];
#pragma unroll
for (int i = 0; i < n_experts; i += WARP_SIZE) {
const int expert = i + threadIdx.x;
logits_r[i / WARP_SIZE] = n_experts % WARP_SIZE == 0 || expert < n_experts ? logits[expert] : -INFINITY;
wt[i / WARP_SIZE] = (n_experts % WARP_SIZE == 0 || expert < n_experts) ? logits[expert] : -INFINITY;
}
float max_val = logits_r[0];
#pragma unroll
for (int i = 1; i < experts_per_thread; i++) {
const float val = logits_r[i];
max_val = max(val, max_val);
if constexpr (!delayed_softmax) {
softmax_warp_inplace<experts_per_thread, false>(wt, n_experts, threadIdx.x);
}
max_val = warp_reduce_max(max_val);
float wt[experts_per_thread];
float tmp = 0.f;
#pragma unroll
for (int i = 0; i < experts_per_thread; i++) {
const float val = logits_r[i];
wt[i] = expf(val - max_val);
tmp += wt[i];
}
tmp = warp_reduce_sum(tmp);
const float inv_sum = 1.0f / tmp;
#pragma unroll
for (int i = 0; i < experts_per_thread; i++) {
wt[i] = wt[i] * inv_sum;
}
//at this point, each thread holds a portion of softmax,
//we do the argmax reduce over n_expert_used, each time marking
//at this point, each thread holds either a portion of the softmax distribution
//or the raw logits. We do the argmax reduce over n_expert_used, each time marking
//the expert weight as -inf to exclude from the next iteration
float wt_sum = 0.f;
float output_weights[experts_per_thread];
#pragma unroll
for (int i = 0; i < experts_per_thread; i++) {
output_weights[i] = 0.f;
}
for (int k = 0; k < n_expert_used; k++) {
float max_val = wt[0];
int max_expert = threadIdx.x;
@ -116,11 +141,15 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
wt_sum = warp_reduce_sum(wt_sum);
const float inv_sum = 1.0f / wt_sum;
for (int i = threadIdx.x; i < n_expert_used; i += WARP_SIZE) {
for (int i = 0; i < experts_per_thread; i++) {
output_weights[i] *= inv_sum;
}
}
if constexpr (delayed_softmax) {
softmax_warp_inplace<experts_per_thread, true>(output_weights, n_expert_used, threadIdx.x);
}
#pragma unroll
for (int i = 0; i < experts_per_thread; i++) {
const int idx = i * WARP_SIZE + threadIdx.x;
@ -130,7 +159,7 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
}
}
template <bool with_norm>
template <bool with_norm, bool delayed_softmax = false>
static void launch_topk_moe_cuda(ggml_backend_cuda_context & ctx,
const float * logits,
float * weights,
@ -138,6 +167,8 @@ static void launch_topk_moe_cuda(ggml_backend_cuda_context & ctx,
const int n_rows,
const int n_expert,
const int n_expert_used) {
static_assert(!(with_norm && delayed_softmax), "delayed softmax is not supported with weight normalization");
const int rows_per_block = 4;
dim3 grid_dims((n_rows + rows_per_block - 1) / rows_per_block, 1, 1);
dim3 block_dims(WARP_SIZE, rows_per_block, 1);
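    // one warp per token row; rows_per_block warps per block, matching the
    // __launch_bounds__(4 * WARP_SIZE, 1) annotation on the kernel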
@ -145,43 +176,43 @@ static void launch_topk_moe_cuda(ggml_backend_cuda_context & ctx,
switch (n_expert) {
case 1:
topk_moe_cuda<1, with_norm>
topk_moe_cuda<1, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
break;
case 2:
topk_moe_cuda<2, with_norm>
topk_moe_cuda<2, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
break;
case 4:
topk_moe_cuda<4, with_norm>
topk_moe_cuda<4, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
break;
case 8:
topk_moe_cuda<8, with_norm>
topk_moe_cuda<8, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
break;
case 16:
topk_moe_cuda<16, with_norm>
topk_moe_cuda<16, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
break;
case 32:
topk_moe_cuda<32, with_norm>
topk_moe_cuda<32, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
break;
case 64:
topk_moe_cuda<64, with_norm>
topk_moe_cuda<64, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
break;
case 128:
topk_moe_cuda<128, with_norm>
topk_moe_cuda<128, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
break;
case 256:
topk_moe_cuda<256, with_norm>
topk_moe_cuda<256, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
break;
case 512:
topk_moe_cuda<512, with_norm>
topk_moe_cuda<512, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
break;
default:
@ -194,7 +225,8 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
const ggml_tensor * logits,
ggml_tensor * weights,
ggml_tensor * ids,
const bool with_norm) {
const bool with_norm,
const bool delayed_softmax) {
GGML_ASSERT(logits->type == GGML_TYPE_F32);
GGML_ASSERT(weights->type == GGML_TYPE_F32);
GGML_ASSERT(ids->type == GGML_TYPE_I32);
@ -202,7 +234,7 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
const int n_experts = logits->ne[0];
const int n_rows = logits->ne[1];
const float * logits_d = (const float *) logits->src[0]->data;
const float * logits_d = (const float *) logits->data;
float * weights_d = (float *) weights->data;
int32_t * ids_d = (int32_t *) ids->data;
@ -213,7 +245,11 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
if (with_norm) {
launch_topk_moe_cuda<true>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used);
} else {
launch_topk_moe_cuda<false>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used);
if (delayed_softmax) {
launch_topk_moe_cuda<false, true>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used);
} else {
launch_topk_moe_cuda<false, false>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used);
}
}
}
@ -246,7 +282,7 @@ bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tenso
return true;
}
std::initializer_list<enum ggml_op> ggml_cuda_topk_moe_ops(bool norm) {
std::initializer_list<enum ggml_op> ggml_cuda_topk_moe_ops(bool norm, bool delayed_softmax) {
static std::initializer_list<enum ggml_op> norm_ops = { GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT,
GGML_OP_VIEW, GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
GGML_OP_SUM_ROWS, GGML_OP_DIV, GGML_OP_RESHAPE };
@ -254,8 +290,19 @@ std::initializer_list<enum ggml_op> ggml_cuda_topk_moe_ops(bool norm) {
static std::initializer_list<enum ggml_op> no_norm_ops = { GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT,
GGML_OP_VIEW, GGML_OP_GET_ROWS };
static std::initializer_list<enum ggml_op> delayed_softmax_ops = { GGML_OP_ARGSORT, GGML_OP_VIEW,
GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
GGML_OP_SOFT_MAX, GGML_OP_RESHAPE };
GGML_ASSERT(!norm || !delayed_softmax);
if (delayed_softmax) {
return delayed_softmax_ops;
}
if (norm) {
return norm_ops;
}
return no_norm_ops;
}
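// These op lists describe the node sequences that ggml_cuda_can_fuse matches
// against consecutive graph nodes before the fused top-k MoE kernel is dispatched.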

View File

@ -6,9 +6,10 @@
void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
const ggml_tensor * logits,
ggml_tensor * weights,
ggml_tensor * top_k,
const bool with_norm);
ggml_tensor * ids,
const bool with_norm,
const bool delayed_softmax = false);
bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tensor * weights);
std::initializer_list<enum ggml_op> ggml_cuda_topk_moe_ops(bool with_norm);
std::initializer_list<enum ggml_op> ggml_cuda_topk_moe_ops(bool with_norm, bool delayed_softmax = false);

View File

@ -0,0 +1,68 @@
include(${HEXAGON_SDK_ROOT}/build/cmake/hexagon_fun.cmake)
include(ExternalProject)
option(GGML_HEXAGON_HTP_DEBUG "ggml-hexagon: enable HTP debug output" OFF)
add_library(htp_iface OBJECT
${CMAKE_CURRENT_BINARY_DIR}/htp_iface_stub.c)
set_target_properties(htp_iface PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(htp_iface PUBLIC
${HEXAGON_SDK_ROOT}/incs
${HEXAGON_SDK_ROOT}/incs/stddef
${HEXAGON_SDK_ROOT}/utils/examples
${CMAKE_CURRENT_SOURCE_DIR}/htp
${CMAKE_CURRENT_BINARY_DIR})
build_idl(htp/htp_iface.idl htp_iface)
if (CMAKE_SYSTEM_NAME MATCHES Android)
target_link_options(htp_iface PUBLIC -llog -ldl)
elseif (CMAKE_SYSTEM_NAME MATCHES Windows)
target_precompile_headers(htp_iface PUBLIC <sal.h>)
else()
target_link_options(htp_iface PUBLIC -ldl)
endif()
link_custom_library(htp_iface cdsprpc)
link_custom_library(htp_iface rpcmem)
set(TARGET_NAME ggml-hexagon)
ggml_add_backend_library(${TARGET_NAME}
ggml-hexagon.cpp htp-utils.c htp-utils.h ../../include/ggml-hexagon.h)
target_link_libraries(${TARGET_NAME} PRIVATE htp_iface)
target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/htp ${CMAKE_CURRENT_BINARY_DIR})
# Build HTP bits
set(HTP_CMAKE_ARGS
-DCMAKE_TOOLCHAIN_FILE=${CMAKE_CURRENT_SOURCE_DIR}/htp/cmake-toolchain.cmake
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_INSTALL_LIBDIR=${CMAKE_CURRENT_BINARY_DIR}
-DHEXAGON_SDK_ROOT=$ENV{HEXAGON_SDK_ROOT}
-DHEXAGON_TOOLS_ROOT=$ENV{HEXAGON_TOOLS_ROOT}
-DHEXAGON_HTP_DEBUG=${GGML_HEXAGON_HTP_DEBUG})
ExternalProject_Add(htp-v73
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v73 -DPREBUILT_LIB_DIR="toolv19_v73")
ExternalProject_Add(htp-v75
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v75 -DPREBUILT_LIB_DIR="toolv19_v75")
ExternalProject_Add(htp-v79
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v79 -DPREBUILT_LIB_DIR="toolv19_v79")
ExternalProject_Add(htp-v81
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v81 -DPREBUILT_LIB_DIR="toolv19_v81")
# Install Hexagon skels required at runtime
install(FILES
${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v73.so
${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v75.so
${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v79.so
${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v81.so
TYPE LIB)

File diff suppressed because it is too large

View File

@ -0,0 +1,448 @@
#pragma clang diagnostic ignored "-Wgnu-anonymous-struct"
#pragma clang diagnostic ignored "-Wmissing-prototypes"
#pragma clang diagnostic ignored "-Wsign-compare"
#define GGML_COMMON_IMPL_C
#include "ggml-backend-impl.h"
#include "ggml-common.h"
#include "ggml-hexagon.h"
#include "ggml-impl.h"
#include "htp-utils.h"
#include <domain.h>
#include <remote.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
domain * get_domain(int domain_id) {
int i = 0;
int size = sizeof(supported_domains) / sizeof(domain);
for (i = 0; i < size; i++) {
if (supported_domains[i].id == domain_id) {
return &supported_domains[i];
}
}
return NULL;
}
bool is_valid_domain_id(int domain_id, int compute_only) {
int i = 0;
int size = sizeof(supported_domains) / sizeof(domain);
if (compute_only) {
return is_CDSP(domain_id);
}
for (i = 0; i < size; i++) {
if (supported_domains[i].id == domain_id) {
return true;
}
}
return false;
}
int get_domains_info(char * domain_type, int * num_domains, fastrpc_domain ** domains_info) {
int nErr = AEE_SUCCESS;
int ss_info = 0;
if (domain_type != NULL) {
if (strcmp(domain_type, "LPASS") == 0) {
ss_info = FASTRPC_LPASS;
} else if (strcmp(domain_type, "HPASS") == 0) {
ss_info = FASTRPC_HPASS;
} else {
ss_info = FASTRPC_NSP;
}
}
system_req_payload req = { 0 };
req.id = FASTRPC_GET_DOMAINS;
req.sys.domains = NULL;
fastrpc_domain * domain = NULL;
if (ss_info != 0) {
req.sys.flags = DOMAINS_LIST_FLAGS_SET_TYPE(req.sys.flags, ss_info);
} else {
req.sys.flags = 0;
}
#ifdef _WIN32
nErr = AEE_EUNSUPPORTED;
goto bail;
#endif
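    /* Two-pass query: the first remote_system_request call returns the number of
       domains, then the caller-allocated array is filled by a second call. */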
if (remote_system_request) {
nErr = remote_system_request(&req);
if (nErr != AEE_SUCCESS) {
GGML_LOG_ERROR("Failure in remote_system_request call: %d.\n", nErr);
goto bail;
}
// Allocate memory for domain-info array
req.sys.max_domains = req.sys.num_domains;
if ((req.sys.domains = calloc(req.sys.num_domains, sizeof(fastrpc_domain))) == NULL) {
nErr = AEE_ENOMEMORY;
GGML_LOG_ERROR("Unable to allocate memory for req.sys.domains");
goto bail;
}
nErr = remote_system_request(&req);
if (nErr != AEE_SUCCESS) {
GGML_LOG_ERROR("Failure in remote_system_request call: %d.\n", nErr);
goto bail;
}
for (int i = 0; i < req.sys.num_domains; i++) {
// Verify that only requested type domains were returned
domain = &req.sys.domains[i];
if (domain->type != ss_info && domain_type != NULL) {
nErr = -1;
GGML_LOG_ERROR("Incorrect data received from remote_system_request.\n");
goto bail;
}
}
*domains_info = req.sys.domains;
*num_domains = req.sys.num_domains;
} else {
nErr = AEE_EUNSUPPORTED;
goto bail;
}
bail:
    if (nErr && req.sys.domains) {
        free(req.sys.domains);
    }
return nErr;
}
int get_effective_domain_id(char * domain_name, int session_id, int * effec_domain_id) {
int err = 0;
remote_rpc_effective_domain_id_t sess = { 0 };
sess.domain_name = domain_name;
sess.domain_name_len = strlen(domain_name);
sess.session_id = session_id;
err = remote_session_control(FASTRPC_GET_EFFECTIVE_DOMAIN_ID, &sess, sizeof(sess));
if (err) {
GGML_LOG_ERROR("Error 0x%x: failed to get effective domain id for %s, session id %d\n", err, sess.domain_name,
session_id);
return err;
}
*effec_domain_id = sess.effective_domain_id;
return err;
}
int get_dsp_support(int * domain) {
int nErr = AEE_SUCCESS;
*domain = CDSP_DOMAIN_ID; // DSP domain default value is CDSP_DOMAIN_ID
if (remote_handle_control) {
struct remote_dsp_capability dsp_capability_domain = { CDSP_DOMAIN_ID, DOMAIN_SUPPORT, 0 };
nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability));
if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
goto bail;
}
if (dsp_capability_domain.capability == 0) {
dsp_capability_domain.domain = ADSP_DOMAIN_ID; // Check for ADSP support.
dsp_capability_domain.attribute_ID = DOMAIN_SUPPORT;
dsp_capability_domain.capability = 0;
nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain,
sizeof(struct remote_dsp_capability));
if (dsp_capability_domain.capability) {
*domain = ADSP_DOMAIN_ID; // For targets like Agatti (not having cDSP), domain is ADSP_DOMAIN_ID
}
}
if (nErr != AEE_SUCCESS) {
GGML_LOG_ERROR("\nget_dsp_support failed with Error 0x%x\n", nErr);
goto bail;
}
} else {
nErr = AEE_EUNSUPPORTEDAPI;
GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
}
bail:
return nErr;
}
int get_vtcm_info(int domain, uint32_t * capability, uint32_t attr) {
int nErr = AEE_SUCCESS;
*capability = 0;
    if (attr != VTCM_PAGE && attr != VTCM_COUNT) {
        nErr = AEE_EBADPARM;
        GGML_LOG_ERROR("Unsupported attr. Only VTCM_PAGE and VTCM_COUNT supported\n");
        goto bail;
    }
if (remote_handle_control) {
if (domain == ADSP_DOMAIN_ID || domain == CDSP_DOMAIN_ID) {
/*
* Query the DSP for VTCM information
* Since the ADSP does not have a dedicated VTCM, we expect the output to be 0
*/
struct remote_dsp_capability dsp_capability_vtcm_dsp;
dsp_capability_vtcm_dsp.domain = (uint32_t) domain;
dsp_capability_vtcm_dsp.attribute_ID = attr;
dsp_capability_vtcm_dsp.capability = (uint32_t) 0;
nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_vtcm_dsp,
sizeof(struct remote_dsp_capability));
if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
GGML_LOG_ERROR("Running the usecase without checking the capability\n");
nErr = AEE_SUCCESS;
goto bail;
} else if (nErr == AEE_SUCCESS) {
*capability = dsp_capability_vtcm_dsp.capability;
} else {
GGML_LOG_ERROR("\nget_vtcm_info failed with Error 0x%x\n", nErr);
goto bail;
}
} else {
nErr = AEE_EUNSUPPORTED;
GGML_LOG_ERROR("Unsupported domain %d\n", domain);
goto bail;
}
} else {
nErr = AEE_EUNSUPPORTEDAPI;
GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
}
bail:
return nErr;
}
bool is_unsignedpd_supported(int domain_id) {
int nErr = AEE_SUCCESS;
if (remote_handle_control) {
struct remote_dsp_capability dsp_capability_domain = { domain_id, UNSIGNED_PD_SUPPORT, 0 };
nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability));
if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device. Falling back to signed pd.\n");
return false;
}
if (nErr) {
GGML_LOG_ERROR("\nERROR 0x%x: FastRPC Capability API failed. Falling back to signed pd.", nErr);
return false;
}
if (dsp_capability_domain.capability == 1) {
return true;
}
} else {
nErr = AEE_EUNSUPPORTEDAPI;
GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device. Falling back to signed pd.\n");
return false;
}
return false;
}
bool get_unsignedpd_support(void) {
return is_unsignedpd_supported(CDSP_DOMAIN_ID);
}
bool is_async_fastrpc_supported(int domain) {
int nErr = AEE_SUCCESS;
if (remote_handle_control) {
if (domain == CDSP_DOMAIN_ID) {
/*
* Query the DSP for ASYNC_FASTRPC_SUPPORT information
* Async fastrpc is supported only on CDSP
*/
struct remote_dsp_capability dsp_capability_async_support;
dsp_capability_async_support.domain = (uint32_t) domain;
dsp_capability_async_support.attribute_ID = ASYNC_FASTRPC_SUPPORT;
dsp_capability_async_support.capability = (uint32_t) 0;
nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_async_support,
sizeof(struct remote_dsp_capability));
if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
GGML_LOG_ERROR("Running the usecase without checking the capability\n");
nErr = AEE_SUCCESS;
goto bail;
} else if (dsp_capability_async_support.capability == 1) {
return true;
}
if (nErr != AEE_SUCCESS) {
GGML_LOG_ERROR("\nis_async_fastrpc_supported failed with Error 0x%x\n", nErr);
goto bail;
}
} else {
nErr = AEE_EUNSUPPORTED;
GGML_LOG_ERROR("Async fastrpc is not supported on domain %d\n", domain);
goto bail;
}
} else {
nErr = AEE_EUNSUPPORTEDAPI;
GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
}
bail:
return false;
}
bool is_status_notification_supported(int domain) {
int nErr = AEE_SUCCESS;
if (remote_handle_control) {
/*
* Query the DSP for STATUS_NOTIFICATION_SUPPORT information
* DSP User PD status notification Support
*/
struct remote_dsp_capability dsp_capability_status_notification_support;
dsp_capability_status_notification_support.domain = (uint32_t) domain;
dsp_capability_status_notification_support.attribute_ID = STATUS_NOTIFICATION_SUPPORT;
dsp_capability_status_notification_support.capability = (uint32_t) 0;
nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_status_notification_support,
sizeof(struct remote_dsp_capability));
if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
GGML_LOG_ERROR("Running the usecase without checking the capability\n");
nErr = AEE_SUCCESS;
goto bail;
} else if (dsp_capability_status_notification_support.capability == 1) {
return true;
}
if (nErr != AEE_SUCCESS) {
GGML_LOG_ERROR("\nis_status_notification_supported failed with Error 0x%x\n", nErr);
goto bail;
}
} else {
nErr = AEE_EUNSUPPORTEDAPI;
GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
}
bail:
return false;
}
int get_hmx_support_info(int domain, uint32_t * capability, uint32_t attr) {
int nErr = AEE_SUCCESS;
*capability = 0;
if (attr != HMX_SUPPORT_SPATIAL && attr != HMX_SUPPORT_DEPTH) {
nErr = AEE_EBADPARM;
GGML_LOG_ERROR("Unsupported attr. Only HMX_SUPPORT_SPATIAL and HMX_SUPPORT_DEPTH supported\n");
goto bail;
}
if (remote_handle_control) {
if (domain == CDSP_DOMAIN_ID) {
/*
* Query the DSP for HMX SUPPORT information
* HMX is supported on CDSP only
*/
struct remote_dsp_capability dsp_capability_hmx_dsp;
dsp_capability_hmx_dsp.domain = (uint32_t) domain;
dsp_capability_hmx_dsp.attribute_ID = attr;
dsp_capability_hmx_dsp.capability = (uint32_t) 0;
nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hmx_dsp,
sizeof(struct remote_dsp_capability));
if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
GGML_LOG_ERROR("Running the usecase without checking the capability\n");
nErr = AEE_SUCCESS;
goto bail;
} else if (nErr == AEE_SUCCESS) {
*capability = dsp_capability_hmx_dsp.capability;
} else {
GGML_LOG_ERROR("\nget_hmx_support_info failed with Error 0x%x\n", nErr);
goto bail;
}
} else {
nErr = AEE_EUNSUPPORTED;
GGML_LOG_ERROR("HMX support is not there for domain %d\n", domain);
goto bail;
}
} else {
nErr = AEE_EUNSUPPORTEDAPI;
GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
}
bail:
return nErr;
}
int get_hex_arch_ver(int domain, int * arch) {
if (!remote_handle_control) {
GGML_LOG_ERROR("ggml-hex: remote_handle_control is not supported on this device\n");
return AEE_EUNSUPPORTEDAPI;
}
struct remote_dsp_capability arch_ver;
arch_ver.domain = (uint32_t) domain;
arch_ver.attribute_ID = ARCH_VER;
arch_ver.capability = (uint32_t) 0;
int err = remote_handle_control(DSPRPC_GET_DSP_INFO, &arch_ver, sizeof(arch_ver));
if ((err & 0xff) == (AEE_EUNSUPPORTEDAPI & 0xff)) {
GGML_LOG_ERROR("ggml-hex: FastRPC capability API is not supported on this device\n");
return AEE_EUNSUPPORTEDAPI;
}
if (err != AEE_SUCCESS) {
GGML_LOG_ERROR("ggml-hex: FastRPC capability query failed (err %d)\n", err);
return err;
}
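    // the low byte of the capability encodes the architecture revision (0x73 -> v73, ...)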
switch (arch_ver.capability & 0xff) {
case 0x73:
*arch = 73;
return 0;
case 0x75:
*arch = 75;
return 0;
case 0x79:
*arch = 79;
return 0;
case 0x81:
*arch = 81;
return 0;
}
return -1;
}
int get_hvx_support_info(int domain, uint32_t * capability, uint32_t attr) {
int nErr = AEE_SUCCESS;
*capability = 0;
if (remote_handle_control) {
if (domain == CDSP_DOMAIN_ID) {
/*
* Query the DSP for HVX SUPPORT information
* HVX is supported on CDSP only
*/
struct remote_dsp_capability dsp_capability_hvx_dsp;
dsp_capability_hvx_dsp.domain = (uint32_t) domain;
dsp_capability_hvx_dsp.attribute_ID = attr;
dsp_capability_hvx_dsp.capability = (uint32_t) 0;
nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hvx_dsp,
sizeof(struct remote_dsp_capability));
if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
GGML_LOG_ERROR("Running the usecase without checking the capability\n");
nErr = AEE_SUCCESS;
goto bail;
} else if (nErr == AEE_SUCCESS) {
*capability = dsp_capability_hvx_dsp.capability;
} else {
GGML_LOG_ERROR("\nget_hvx_support_info failed with Error 0x%x\n", nErr);
goto bail;
}
} else {
nErr = AEE_EUNSUPPORTED;
GGML_LOG_ERROR("HVX support is not available on domain %d\n", domain);
goto bail;
}
} else {
nErr = AEE_EUNSUPPORTEDAPI;
GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
}
bail:
return nErr;
}

View File

@ -0,0 +1,219 @@
#ifndef HTP_UTILS_H
#define HTP_UTILS_H
#ifdef __cplusplus
extern "C" {
#endif
#include <AEEStdErr.h>
#include <inttypes.h>
#include <remote.h>
#include <stdbool.h>
/* Offset to differentiate HLOS and Hexagon error codes.
Stores the value of AEE_EOFFSET for Hexagon. */
#ifndef DSP_OFFSET
# define DSP_OFFSET 0x80000400
#endif
/* Errno for connection reset by peer. */
#ifndef ECONNRESET
# ifdef __hexagon__
# define ECONNRESET 104
# endif
#endif
/* Abstraction of different OS specific sleep APIs.
SLEEP accepts input in seconds. */
#ifndef SLEEP
# ifdef __hexagon__
# define SLEEP(x) \
{ /* Do nothing for simulator. */ \
}
# else
# ifdef _WINDOWS
# define SLEEP(x) Sleep(1000 * x) /* Sleep accepts input in milliseconds. */
# else
# define SLEEP(x) sleep(x) /* sleep accepts input in seconds. */
# endif
# endif
#endif
/* Include windows specific header files. */
#ifdef _WINDOWS
# include <sysinfoapi.h>
# include <windows.h>
# define _CRT_SECURE_NO_WARNINGS 1
# define _WINSOCK_DEPRECATED_NO_WARNINGS 1
/* Including this file for custom implementation of getopt function. */
# include "getopt_custom.h"
#endif
/* Includes and defines for all HLOS except windows */
#if !defined(__hexagon__) && !defined(_WINDOWS)
# include "unistd.h"
# include <sys/time.h>
#endif
/* Includes and defines for Hexagon and all HLOS except Windows. */
#if !defined(_WINDOWS)
/* Weak reference to remote symbol for compilation. */
# pragma weak remote_session_control
# pragma weak remote_handle_control
# pragma weak remote_handle64_control
# pragma weak fastrpc_mmap
# pragma weak fastrpc_munmap
#endif
#if !defined(_WINDOWS)
# pragma weak remote_system_request
#endif
/**
* Wrapper for FastRPC Capability API: query DSP support.
*
* @param[out] domain pointer to supported domain.
* @return 0 if query is successful.
* non-zero if error, return value points to the error.
*/
int get_dsp_support(int * domain);
/**
* Wrapper for FastRPC Capability API: query VTCM information.
*
* @param[in] domain value of the domain to be queried.
* @param[out] capability capability value of the attribute queried.
* @param[in] attr value of the attribute to be queried.
* @return 0 if query is successful.
* non-zero if error, return value points to the error.
*/
int get_vtcm_info(int domain, uint32_t * capability, uint32_t attr);
/**
* Wrapper for FastRPC Capability API: query unsigned pd support on CDSP domain.
*
* @return true if unsigned pd is supported.
* false if unsigned pd is not supported, capability query failed.
*/
bool get_unsignedpd_support(void);
/**
* Wrapper for FastRPC Capability API: query unsigned pd support.
*
* @param[in] domain_id value of the domain to be queried.
* @return true if unsigned pd is supported.
* false if unsigned pd is not supported, capability query failed.
*/
bool is_unsignedpd_supported(int domain_id);
/**
* is_valid_domain_id API: check whether a domain id is valid.
*
* @param[in] domain_id value of the domain to be queried.
* @param[in] compute_only when enabled, the domain is validated only against the CDSP domains supported by the target.
* @return true if value of domain is valid.
* false if value of domain is not valid.
*/
bool is_valid_domain_id(int domain_id, int compute_only);
/**
* get_domain API: get domain struct from domain value.
*
* @param[in] domain value of a domain
* @return Returns domain struct of the domain if it is supported or else
* returns NULL.
*
*/
domain * get_domain(int domain_id);
/**
* get_domains_info API: get information for all the domains available on the device
*
* @param[in] domain_type pointer to domain type
* @param[out] num_domains pointer to number of domains
* @param[out] domains_info pointer to save discovered domains information.
* @return 0 if query is successful.
* non-zero if error, return value points to the error.
*
It is the user's responsibility to free the memory used to store the domains info whose address is present in domains_info before closing the application.
*
*/
int get_domains_info(char * domain_type, int * num_domains, fastrpc_domain ** domains_info);
/**
* get_effective_domain_id API: get effective domain id for given session id
*
* @param[in] domain_name pointer to domain name
* @param[in] session_id
* @param[out] effec_domain_id pointer to save the obtained effective domain id.
* @return 0 if query is successful.
* non-zero if error, return value points to the error.
*
*/
int get_effective_domain_id(char * domain_name, int session_id, int * effec_domain_id);
/**
* is_async_fastrpc_supported API: query a domain id has async fastrpc supported or not
*
* @param[in] domain_id value of a domain
* @return Returns true or false stating support of Async FastRPC
*
*/
bool is_async_fastrpc_supported(int domain_id);
/**
* is_status_notification_supported API: query the DSP for STATUS_NOTIFICATION_SUPPORT information
*
* @param[in] domain_id value of a domain
* @return Returns true or false stating status notification support information
*
*/
bool is_status_notification_supported(int domain_id);
/**
* get_hmx_support_info API: query the DSP for HMX SUPPORT information
*
* @param[in] domain value of the domain to be queried.
* @param[out] capability capability value of the attribute queried.
* @param[in] attr value of the attribute to be queried.
* @return 0 if query is successful.
* non-zero if error, return value points to the error.
*
*/
int get_hmx_support_info(int domain, uint32_t * capability, uint32_t attr);
/**
* get_hex_arch_ver API: query the Hexagon processor architecture version information
*
* @param[in] domain value of the domain to be queried.
* @param[out] arch architecture version (73, 75, ...)
* @return 0 if query is successful.
* non-zero if error, return value points to the error.
*
*/
int get_hex_arch_ver(int domain, int * arch);
/**
* get_hvx_support_info API: query the DSP for HVX SUPPORT information
*
* @param[in] domain value of the domain to be queried.
* @param[out] capability capability value of the attribute queried.
* @param[in] attr value of the attribute to be queried.
* @return 0 if query is successful.
* non-zero if error, return value points to the error.
*
*/
int get_hvx_support_info(int domain, uint32_t * capability, uint32_t attr);
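/*
 * Illustrative sketch only (not part of this header): a typical capability
 * probe before opening a FastRPC session might combine these helpers as
 * follows.
 *
 *     int domain = -1, arch = 0;
 *     if (get_dsp_support(&domain) == AEE_SUCCESS &&
 *         get_hex_arch_ver(domain, &arch) == 0) {
 *         bool unsigned_pd = is_unsignedpd_supported(domain);
 *         // pick the matching libggml-htp-v<arch>.so and session flags
 *     }
 */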
#ifdef __cplusplus
}
#endif
#endif // HTP_UTILS_H

View File

@ -0,0 +1,40 @@
cmake_minimum_required(VERSION 3.22.2)
project(ggml-htp C CXX ASM)
include(${HEXAGON_SDK_ROOT}/build/cmake/hexagon_fun.cmake)
include_directories(
${HEXAGON_SDK_ROOT}/incs
${HEXAGON_SDK_ROOT}/incs/stddef
${CMAKE_CURRENT_SOURCE_DIR}/../..
${CMAKE_CURRENT_SOURCE_DIR}/..
${CMAKE_CURRENT_SOURCE_DIR}
${CMAKE_CURRENT_BINARY_DIR})
set(HTP_LIB ggml-htp-${DSP_VERSION})
add_library(${HTP_LIB} SHARED
main.c
htp_iface_skel.c
worker-pool.c
htp-dma.c
hvx-sigmoid.c
hvx-inverse.c
hvx-exp.c
hvx-utils.c
matmul-ops.c
binary-ops.c
unary-ops.c
softmax-ops.c
act-ops.c
rope-ops.c
)
target_compile_definitions(${HTP_LIB} PRIVATE
$<IF:$<BOOL:${HEXAGON_HTP_DEBUG}>,HTP_DEBUG=1,NDEBUG=1>)
build_idl(htp_iface.idl ${HTP_LIB})
set_target_properties(${HTP_LIB} PROPERTIES EXPORT_COMPILE_COMMANDS ON)
install(TARGETS ${HTP_LIB})

View File

@ -0,0 +1,448 @@
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wunused-function"
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
#ifdef HTP_DEBUG
# define FARF_HIGH 1
#endif
#include <HAP_farf.h>
#include <HAP_mem.h>
#include <HAP_perf.h>
#include <HAP_ps.h>
#include <hexagon_protos.h>
#include <hexagon_types.h>
#include <math.h>
#include <qurt_thread.h>
#include <string.h>
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-dma.h"
#include "htp-msg.h"
#include "htp-ops.h"
#include "hvx-utils.h"
#include "ops-utils.h"
#define htp_act_preamble3 \
const uint32_t ne00 = src0->ne[0]; \
const uint32_t ne01 = src0->ne[1]; \
const uint32_t ne02 = src0->ne[2]; \
const uint32_t ne03 = src0->ne[3]; \
\
const uint32_t ne10 = src1->ne[0]; \
const uint32_t ne11 = src1->ne[1]; \
const uint32_t ne12 = src1->ne[2]; \
const uint32_t ne13 = src1->ne[3]; \
\
const uint32_t ne0 = dst->ne[0]; \
const uint32_t ne1 = dst->ne[1]; \
const uint32_t ne2 = dst->ne[2]; \
const uint32_t ne3 = dst->ne[3]; \
\
const uint32_t nb00 = src0->nb[0]; \
const uint32_t nb01 = src0->nb[1]; \
const uint32_t nb02 = src0->nb[2]; \
const uint32_t nb03 = src0->nb[3]; \
\
const uint32_t nb10 = src1->nb[0]; \
const uint32_t nb11 = src1->nb[1]; \
const uint32_t nb12 = src1->nb[2]; \
const uint32_t nb13 = src1->nb[3]; \
\
const uint32_t nb0 = dst->nb[0]; \
const uint32_t nb1 = dst->nb[1]; \
const uint32_t nb2 = dst->nb[2]; \
const uint32_t nb3 = dst->nb[3];
#define htp_act_preamble2 \
const uint32_t ne00 = src0->ne[0]; \
const uint32_t ne01 = src0->ne[1]; \
const uint32_t ne02 = src0->ne[2]; \
const uint32_t ne03 = src0->ne[3]; \
\
const uint32_t ne0 = dst->ne[0]; \
const uint32_t ne1 = dst->ne[1]; \
const uint32_t ne2 = dst->ne[2]; \
const uint32_t ne3 = dst->ne[3]; \
\
const uint32_t nb00 = src0->nb[0]; \
const uint32_t nb01 = src0->nb[1]; \
const uint32_t nb02 = src0->nb[2]; \
const uint32_t nb03 = src0->nb[3]; \
\
const uint32_t nb0 = dst->nb[0]; \
const uint32_t nb1 = dst->nb[1]; \
const uint32_t nb2 = dst->nb[2]; \
const uint32_t nb3 = dst->nb[3];
static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0,
const struct htp_tensor * src1,
struct htp_tensor * dst,
const int32_t * op_params,
struct htp_spad * src0_spad,
struct htp_spad * src1_spad,
struct htp_spad * dst_spad,
uint32_t nth,
uint32_t ith,
uint32_t src0_nrows_per_thread) {
htp_act_preamble3;
size_t src0_row_size = nb01;
size_t src1_row_size = nb11;
size_t dst_row_size = nb1;
const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows
const uint32_t src0_start_row = src0_nrows_per_thread * ith;
const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
// no work for this thread
if (src0_start_row >= src0_end_row) {
return;
}
uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();
int is_aligned = 1;
int opt_path = 0;
if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) {
is_aligned = 0;
FARF(HIGH, "swiglu-f32: unaligned addresses in elementwise op, possibly slower execution\n");
}
if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {
opt_path = 1;
}
const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
uint8_t * restrict data_dst = (uint8_t *) dst->data;
bool src1_valid = src1->ne[0];
if (!src1_valid) {
data_src1 = data_src0;
src1_row_size = src0_row_size;
}
uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size);
uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_row_size);
uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_row_size);
const int32_t swapped = op_params[1];
const int nc = (src1_valid) ? ne0 : ne0 / 2;
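    // Without a separate gate tensor each src0 row packs both operands: nc spans
    // half the row and `swapped` selects which half feeds the activation and
    // which half is the gate.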
for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size));
const float * restrict src1 = (float *) (data_src1 + (ir * src1_row_size));
float * restrict dst = (float *) (data_dst + (ir * dst_row_size));
if (ir + 1 < src0_end_row) {
htp_l2fetch(src0 + src0_row_size, 1, src0_row_size, src0_row_size);
}
if (!src1_valid) {
src0 += swapped ? nc : 0;
src1 += swapped ? 0 : nc;
}
if (1 == opt_path) {
hvx_fast_sigmoid_f32((const uint8_t *) src0, (uint8_t *) src0_spad_data, nc);
hvx_mul_mul_f32_opt((const uint8_t *) src0, (const uint8_t *) src0_spad_data, (const uint8_t *) src1,
(uint8_t *) dst, nc);
} else {
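            // fallback sigmoid composed from HVX primitives (assuming the final
            // flag of hvx_exp_f32 negates the input): 1 / (1 + exp(-x)),
            // then swiglu = x * sigmoid(x) * gate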
hvx_exp_f32((const uint8_t *) src0, src0_spad_data, nc, true);
hvx_add_scalar_f32(src0_spad_data, 1.0, src1_spad_data, nc);
hvx_inverse_f32(src1_spad_data, src0_spad_data, nc);
hvx_mul_f32((const uint8_t *) src0, src0_spad_data, dst_spad_data, nc);
hvx_mul_f32(dst_spad_data, (const uint8_t *) src1, (uint8_t *) dst, nc);
}
}
t2 = HAP_perf_get_qtimer_count();
FARF(HIGH, "swiglu-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path,
ne00, ne01, ne02, ne03, src0_start_row, src0_end_row, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3,
(unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0,
const struct htp_tensor * src1,
struct htp_tensor * dst,
const int32_t * op_params,
struct htp_spad * src0_spad,
struct htp_spad * src1_spad,
struct htp_spad * dst_spad,
uint32_t nth,
uint32_t ith,
uint32_t src0_nrows_per_thread) {
htp_act_preamble3;
uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();
const size_t src0_row_size = nb01;
const size_t src1_row_size = nb11;
const size_t dst_row_size = nb1;
const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows
const uint32_t src0_start_row = src0_nrows_per_thread * ith;
const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
// no work for this thread
if (src0_start_row >= src0_end_row) {
return;
}
if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) {
FARF(HIGH, "act-f32: unaligned addresses in activations op, possibly slower execution\n");
}
const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
uint8_t * restrict data_dst = (uint8_t *) dst->data;
bool src1_valid = src1->ne[0];
if (!src1_valid) {
data_src1 = data_src0;
}
uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size);
uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_row_size);
uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_row_size);
const int32_t swapped = op_params[1];
const float alpha = ((const float *) (op_params))[2];
const float limit = ((const float *) (op_params))[3];
const int nc = (src1_valid) ? ne0 : ne0 / 2;
for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size));
const float * restrict src1 = (float *) (data_src1 + (ir * src1_row_size));
float * restrict dst = (float *) (data_dst + (ir * dst_row_size));
if (ir + 1 < src0_end_row) {
htp_l2fetch(src0 + src0_row_size, 1, src0_row_size, src0_row_size);
}
        if (!src1_valid) {
src0 += swapped ? nc : 0;
src1 += swapped ? 0 : nc;
}
// x (src0_spad_data) = std::min(src0_p[k], limit);
hvx_min_scalar_f32((const uint8_t *) src0, limit, src0_spad_data, nc);
// y1 (src1_spad_data) = std::clamp(src1_p[k], -limit, limit);
hvx_clamp_scalar_f32((const uint8_t *) src1, limit, limit, src1_spad_data, nc);
// y (src1_spad_data) = y1 + 1.f
hvx_add_scalar_f32(src1_spad_data, 1.0, src1_spad_data, nc);
// x1 (dst_spad_data) = alpha * (x)
hvx_mul_scalar_f32(src0_spad_data, alpha, dst_spad_data, nc);
// x2 (dst_spad_data) = expf(-x1)
hvx_exp_f32(dst_spad_data, dst_spad_data, nc, true);
// x3 (dst_spad_data) = x2 + 1.f
hvx_add_scalar_f32(dst_spad_data, 1.0, dst_spad_data, nc);
// x4 (dst_spad_data) = 1 / x3
hvx_inverse_f32(dst_spad_data, dst_spad_data, nc);
// out_glu(dst_spad_data) = x * x4
hvx_mul_f32(src0_spad_data, dst_spad_data, dst_spad_data, nc);
// out = out_glu * (y + 1.f);
hvx_mul_f32(dst_spad_data, src1_spad_data, (uint8_t *) dst, nc);
}
t2 = HAP_perf_get_qtimer_count();
FARF(HIGH, "swiglu-f32 %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, src0->ne[0],
src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], src1->ne[1], src1->ne[2],
src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
static void unary_silu_fp32_per_thread(const struct htp_tensor * src0,
struct htp_tensor * dst,
const int32_t * op_params,
struct htp_spad * src0_spad,
struct htp_spad * dst_spad,
uint32_t nth,
uint32_t ith,
uint32_t src0_nrows_per_thread) {
htp_act_preamble2;
uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();
const size_t src0_row_size = nb01;
const size_t dst_row_size = nb1;
const uint32_t src0_nrows = ne01 * ne02 * ne03;
const uint32_t src0_start_row = src0_nrows_per_thread * ith;
const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
// no work for this thread
if (src0_start_row >= src0_end_row) {
return;
}
int is_aligned = 1;
int opt_path = 0;
if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) {
is_aligned = 0;
FARF(HIGH, "silu-f32: unaligned addresses in elementwise op, possibly slower execution\n");
}
if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {
opt_path = 1;
}
const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
uint8_t * restrict data_dst = (uint8_t *) dst->data;
uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size);
uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_row_size);
for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size));
float * restrict dst = (float *) (data_dst + (ir * dst_row_size));
if (ir + 1 < src0_end_row) {
htp_l2fetch(src0 + src0_row_size, 1, src0_row_size, src0_row_size);
}
if (1 == opt_path) {
hvx_fast_sigmoid_f32((const uint8_t *) src0, (uint8_t *) src0_spad_data, ne0);
hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0);
} else {
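            // same composition as the swiglu fallback: sigmoid(x) = 1 / (1 + exp(-x)),
            // then silu(x) = x * sigmoid(x)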
hvx_exp_f32((const uint8_t *) src0, src0_spad_data, ne0, true);
hvx_add_scalar_f32(src0_spad_data, 1.0, dst_spad_data, ne0);
hvx_inverse_f32(dst_spad_data, src0_spad_data, ne0);
hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0);
}
}
t2 = HAP_perf_get_qtimer_count();
FARF(HIGH, "silu-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path, ne00, ne01, ne02,
ne03, src0_start_row, src0_end_row, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
static void unary_silu_fp32(unsigned int n, unsigned int i, void * data) {
struct htp_ops_context * octx = (struct htp_ops_context *) data;
unary_silu_fp32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i,
octx->src0_nrows_per_thread);
}
static void glu_swiglu_fp32(unsigned int n, unsigned int i, void * data) {
struct htp_ops_context * octx = (struct htp_ops_context *) data;
glu_swiglu_fp32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad,
&octx->src1_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread);
}
static void glu_swiglu_oai_fp32(unsigned int n, unsigned int i, void * data) {
struct htp_ops_context * octx = (struct htp_ops_context *) data;
glu_swiglu_oai_fp32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad,
&octx->src1_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread);
}
static int execute_op_activations_fp32(struct htp_ops_context * octx) {
int err = HTP_STATUS_OK;
const struct htp_tensor * src0 = &octx->src0;
const struct htp_tensor * src1 = &octx->src1;
struct htp_tensor * dst = &octx->dst;
if (((src0->ne[0] * SIZEOF_FP32) != src0->nb[1]) || ((dst->ne[0] * SIZEOF_FP32) != dst->nb[1])) {
FARF(ERROR, "Non-contiguous tensors are not supported at this time \n");
return HTP_STATUS_NO_SUPPORT;
}
worker_callback_t act_op_func;
const char * op_type = NULL;
switch (octx->op) {
case HTP_OP_UNARY_SILU:
act_op_func = unary_silu_fp32;
op_type = "silu-f32";
break;
case HTP_OP_GLU_SWIGLU:
act_op_func = glu_swiglu_fp32;
op_type = "swiglu-f32";
break;
case HTP_OP_GLU_SWIGLU_OAI:
act_op_func = glu_swiglu_oai_fp32;
op_type = "swiglu-oai-f32";
break;
default:
FARF(ERROR, "Unsupported activations Op %u\n", octx->op);
return HTP_STATUS_NO_SUPPORT;
}
const uint32_t n_threads = octx->n_threads;
const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];
const size_t src0_row_size = src0->nb[1];
const size_t src1_row_size = src1->ne[0] ? src1->nb[1] : src0->nb[1];
const size_t dst_row_size = dst->nb[1];
// VTCM scratchpads for all tensors
    // one row per thread, padded to the HVX vector size
octx->dst_spad.size = htp_round_up(dst_row_size, 128) * octx->n_threads;
octx->src0_spad.size = htp_round_up(src0_row_size, 128) * octx->n_threads;
octx->src1_spad.size = htp_round_up(src1_row_size, 128) * octx->n_threads;
size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size;
if (src1->ne[0]) {
FARF(HIGH,
"%s: %ux%ux%ux%u x %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n",
op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2],
src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size,
octx->dst_spad.size);
} else {
FARF(HIGH, "%s: %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", op_type,
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size);
}
// Make sure the reserved vtcm size is sufficient
if (octx->ctx->vtcm_size < spad_size) {
FARF(ERROR, "act-%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size,
spad_size);
return HTP_STATUS_VTCM_TOO_SMALL;
}
octx->src0_spad.data = octx->ctx->vtcm_base;
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size;
if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
uint32_t n_jobs = MIN(n_threads, src0_nrows);
octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;
worker_pool_run_func(octx->ctx->worker_pool, act_op_func, octx, n_jobs);
}
return err;
}
int op_activations(struct htp_ops_context * octx) {
int err = HTP_STATUS_OK;
switch (octx->src0.type) {
case HTP_TYPE_F32:
err = execute_op_activations_fp32(octx);
break;
default:
err = HTP_STATUS_NO_SUPPORT;
break;
}
return err;
}

View File

@ -0,0 +1,344 @@
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wunused-function"
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
#ifdef HTP_DEBUG
# define FARF_HIGH 1
#endif
#include <HAP_farf.h>
#include <HAP_mem.h>
#include <HAP_perf.h>
#include <HAP_ps.h>
#include <hexagon_protos.h>
#include <hexagon_types.h>
#include <math.h>
#include <qurt_thread.h>
#include <string.h>
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-dma.h"
#include "htp-msg.h"
#include "htp-ops.h"
#include "hvx-utils.h"
#include "ops-utils.h"
typedef void (*hvx_elemwise_f32_func)(const uint8_t * src0,
const uint8_t * src1,
uint8_t * data_dst,
const int num_elems);
static hvx_elemwise_f32_func func_table_HVX[] = { hvx_mul_f32, hvx_add_f32, hvx_sub_f32 };
static hvx_elemwise_f32_func func_table_HVX_opt[] = { hvx_mul_f32_opt, hvx_add_f32_opt, hvx_sub_f32_opt };
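// Indexed directly by enum htp_op; this relies on HTP_OP_MUL, HTP_OP_ADD and
// HTP_OP_SUB occupying the first three enumerator slots.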
#define htp_binary_preamble \
const uint32_t ne00 = src0->ne[0]; \
const uint32_t ne01 = src0->ne[1]; \
const uint32_t ne02 = src0->ne[2]; \
const uint32_t ne03 = src0->ne[3]; \
\
const uint32_t ne10 = src1->ne[0]; \
const uint32_t ne11 = src1->ne[1]; \
const uint32_t ne12 = src1->ne[2]; \
const uint32_t ne13 = src1->ne[3]; \
\
const uint32_t ne0 = dst->ne[0]; \
const uint32_t ne1 = dst->ne[1]; \
const uint32_t ne2 = dst->ne[2]; \
const uint32_t ne3 = dst->ne[3]; \
\
const uint32_t nb00 = src0->nb[0]; \
const uint32_t nb01 = src0->nb[1]; \
const uint32_t nb02 = src0->nb[2]; \
const uint32_t nb03 = src0->nb[3]; \
\
const uint32_t nb10 = src1->nb[0]; \
const uint32_t nb11 = src1->nb[1]; \
const uint32_t nb12 = src1->nb[2]; \
const uint32_t nb13 = src1->nb[3]; \
\
const uint32_t nb0 = dst->nb[0]; \
const uint32_t nb1 = dst->nb[1]; \
const uint32_t nb2 = dst->nb[2]; \
const uint32_t nb3 = dst->nb[3];
static void binary_job_f32_per_thread(const struct htp_tensor * src0,
const struct htp_tensor * src1,
struct htp_tensor * dst,
uint8_t * spad_data,
uint32_t nth,
uint32_t ith,
uint32_t src0_nrows_per_thread,
enum htp_op op) {
htp_binary_preamble;
const size_t src0_row_size = nb01;
const size_t src1_row_size = nb11;
const size_t dst_row_size = nb1;
const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows
const uint32_t src1_nrows = ne11 * ne12 * ne13; // src1 rows
const uint32_t src0_start_row = src0_nrows_per_thread * ith;
const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
// no work for this thread
if (src0_start_row >= src0_end_row) {
return;
}
uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();
int is_aligned = 1;
int opt_path = 0;
if ((0 == htp_is_aligned((void *) src0->data, VLEN)) || (0 == htp_is_aligned((void *) src1->data, VLEN)) ||
(0 == htp_is_aligned((void *) dst->data, VLEN))) {
FARF(HIGH, "binary-f32: unaligned addresses in elementwise op, possibly slower execution\n");
is_aligned = 0;
}
if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {
opt_path = 1;
}
hvx_elemwise_f32_func func_HVX = (1 == opt_path) ? func_table_HVX_opt[op] : func_table_HVX[op];
uint8_t * restrict spad_data_th = spad_data + (ith * src0_row_size);
const uint32_t nr0 = ne00 / ne10;
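    // row-broadcast factor: when src1 rows are shorter than src0 rows, each src1
    // row is replicated nr0 times into the scratchpad before the elementwise op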
const uint8_t * restrict src0_ptr = (const uint8_t *) src0->data + (src0_start_row * src0_row_size);
uint8_t * restrict dst_ptr = (uint8_t *) dst->data + (src0_start_row * dst_row_size);
const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
const uint8_t * restrict src1_ptr = NULL;
for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
src1_ptr = data_src1 + (ir % src1_nrows) * src1_row_size;
if (ir + 1 < src0_end_row) {
htp_l2fetch(src0_ptr + ne00, 1, src0_row_size, src0_row_size);
if (src1_row_size == src0_row_size) {
htp_l2fetch(src1_ptr, 1, src1_row_size, src1_row_size);
}
}
if (nr0 > 1) {
if ((1 == is_aligned) && (nr0 == ne00)) {
hvx_bcast_fp32_a(spad_data_th, *(float *) src1_ptr, nr0);
} else {
for (uint32_t r = 0; r < nr0; r++) {
memcpy(spad_data_th + r * nb11, (const uint8_t *) src1_ptr, nb11);
}
}
func_HVX((const uint8_t *) src0_ptr, (const uint8_t *) spad_data_th, (uint8_t *) dst_ptr, ne00);
} else {
func_HVX((const uint8_t *) src0_ptr, (const uint8_t *) src1_ptr, (uint8_t *) dst_ptr, ne00);
}
src0_ptr += src0_row_size;
dst_ptr += dst_row_size;
}
t2 = HAP_perf_get_qtimer_count();
FARF(HIGH, "binary-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path,
ne00, ne01, ne02, ne03, src0_start_row, src0_end_row, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3,
(unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
static void binary_add_id_job_f32_per_thread(const struct htp_tensor * src0,
const struct htp_tensor * src1,
const struct htp_tensor * src2,
struct htp_tensor * dst,
uint8_t * spad_data,
uint32_t nth,
uint32_t ith,
uint32_t src0_nrows_per_thread,
hvx_elemwise_f32_func func_HVX) {
htp_binary_preamble;
const size_t src0_row_size = nb01;
const size_t src1_row_size = nb11;
const size_t dst_row_size = nb1;
const uint32_t ne02_ne01 = ne02 * ne01;
const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows
const uint32_t src0_start_row = src0_nrows_per_thread * ith;
const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
// no work for this thread
if (src0_start_row >= src0_end_row) {
return;
}
uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();
if ((0 == htp_is_aligned((void *) src0->data, VLEN)) || (0 == htp_is_aligned((void *) src1->data, VLEN)) ||
(0 == htp_is_aligned((void *) dst->data, VLEN))) {
FARF(HIGH, "add-id-f32: unaligned addresses, possibly slower execution\n");
}
const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
uint8_t * restrict data_dst = (uint8_t *) dst->data;
for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
// src0 indices
const uint32_t i03 = ir / ne02_ne01;
const uint32_t i02 = (ir - i03 * ne02_ne01) / ne01;
const uint32_t i01 = (ir - i03 * ne02_ne01 - i02 * ne01);
// src1 indices
const int i11 = *(int32_t *) ((char *) src2->data + i01 * src2->nb[0] + i02 * src2->nb[1]);
assert(i11 >= 0 && i11 < ne11);
float * restrict dst_ptr = (float *) (data_dst + i03 * nb3 + i02 * nb2 + i01 * nb1);
const float * restrict src0_ptr = (const float *) (data_src0 + i03 * nb03 + i02 * nb02 + i01 * nb01);
const float * restrict src1_ptr = (const float *) (data_src1 + 0 + 0 + i11 * nb11);
if (ir + 1 < src0_end_row) {
htp_l2fetch(src0_ptr + ne00, 1, src0_row_size, src0_row_size);
if (src1_row_size == src0_row_size) {
htp_l2fetch(src1_ptr + ne10, 1, src1_row_size, src1_row_size);
}
}
const uint32_t nr0 = ne00 / ne10;
if (nr0 > 1) {
for (uint32_t r = 0; r < nr0; r++) {
memcpy(spad_data + r * nb10, (const uint8_t *) src1_ptr, nb10);
}
func_HVX((const uint8_t *) src0_ptr, (const uint8_t *) spad_data, (uint8_t *) dst_ptr, ne00);
} else {
func_HVX((const uint8_t *) src0_ptr, (const uint8_t *) src1_ptr, (uint8_t *) dst_ptr, ne00);
}
}
t2 = HAP_perf_get_qtimer_count();
FARF(HIGH, "add-id-f32 %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", ith, nth,
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], src1->ne[1],
src1->ne[2], src1->ne[3], src2->ne[0], src2->ne[1], src2->ne[2], src2->ne[3], dst->ne[0], dst->ne[1],
dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
static void binary_job_dispatcher_f32(unsigned int n, unsigned int i, void * data) {
struct htp_ops_context * octx = (struct htp_ops_context *) data;
switch (octx->op) {
case HTP_OP_MUL:
case HTP_OP_ADD:
case HTP_OP_SUB:
binary_job_f32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->src1_spad.data, n, i,
octx->src0_nrows_per_thread, octx->op);
break;
case HTP_OP_ADD_ID:
binary_add_id_job_f32_per_thread(&octx->src0, &octx->src1, &octx->src2, &octx->dst, octx->src0_spad.data, n,
i, octx->src0_nrows_per_thread, hvx_add_f32);
break;
default:
FARF(ERROR, "Unknown Binary Op %u", octx->op);
break;
}
}
static int execute_op_binary_f32(struct htp_ops_context * octx) {
int err = HTP_STATUS_OK;
const struct htp_tensor * src0 = &octx->src0;
const struct htp_tensor * src1 = &octx->src1;
struct htp_tensor * dst = &octx->dst;
worker_callback_t binary_op_func;
const char * op_type = NULL;
switch (octx->op) {
case HTP_OP_MUL:
binary_op_func = binary_job_dispatcher_f32;
op_type = "mul-f32";
break;
case HTP_OP_ADD:
binary_op_func = binary_job_dispatcher_f32;
op_type = "add-f32";
break;
case HTP_OP_SUB:
binary_op_func = binary_job_dispatcher_f32;
op_type = "sub-f32";
break;
case HTP_OP_ADD_ID:
binary_op_func = binary_job_dispatcher_f32;
op_type = "add-id-f32";
break;
default:
FARF(ERROR, "Unsupported binary-Op %u\n", octx->op);
return HTP_STATUS_NO_SUPPORT;
}
const int n_threads = octx->n_threads;
const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];
const size_t src0_row_size = src0->nb[1];
const size_t src1_row_size = src1->nb[1];
const size_t dst_row_size = dst->nb[1];
// VTCM scratchpads for all tensors
octx->dst_spad.size = htp_round_up(dst_row_size, 128) * n_threads;
octx->src0_spad.size = htp_round_up(src0_row_size, 128) * n_threads;
octx->src1_spad.size = htp_round_up(src1_row_size, 128) * n_threads;
size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size;
FARF(HIGH,
"%s: (%ux%ux%ux%u) * (%ux%ux%ux%u) -> (%ux%ux%ux%u) : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n",
op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2],
src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size,
octx->dst_spad.size);
// Make sure the reserved vtcm size is sufficient
if (octx->ctx->vtcm_size < spad_size) {
FARF(ERROR, "binary-%s : current VTCM reservation %zu is too small, needed %zu\n", op_type,
octx->ctx->vtcm_size, spad_size);
return HTP_STATUS_VTCM_TOO_SMALL;
}
octx->src0_spad.data = octx->ctx->vtcm_base;
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size;
if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
uint32_t n_jobs = MIN(n_threads, src0_nrows);
octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;
worker_pool_run_func(octx->ctx->worker_pool, binary_op_func, octx, n_jobs);
}
return err;
}
int op_binary(struct htp_ops_context * octx) {
int err = HTP_STATUS_OK;
switch (octx->src0.type) {
case HTP_TYPE_F32:
err = execute_op_binary_f32(octx);
break;
default:
err = HTP_STATUS_NO_SUPPORT;
break;
}
return err;
}

View File

@ -0,0 +1,157 @@
if (HEXAGON_TOOLCHAIN_INCLUDED)
return()
endif()
set(HEXAGON_TOOLCHAIN_INCLUDED true)
#Cross Compiling for Hexagon
set(HEXAGON TRUE)
set(CMAKE_SYSTEM_NAME QURT)
set(CMAKE_SYSTEM_PROCESSOR Hexagon)
set(CMAKE_SYSTEM_VERSION "1") #${HEXAGON_PLATFORM_LEVEL})
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
set(CUSTOM_RUNELF_PATH "")
#To fix backward compatibility with EAI addon.
if (NOT HEXAGON_SDK_ROOT)
set(HEXAGON_SDK_ROOT $ENV{HEXAGON_SDK_ROOT})
endif()
if (NOT HEXAGON_TOOLS_ROOT)
if (DEFINED ENV{HEXAGON_TOOLS_ROOT})
set(HEXAGON_TOOLS_ROOT $ENV{HEXAGON_TOOLS_ROOT})
endif()
if(NOT HEXAGON_TOOLS_ROOT)
set(HEXAGON_TOOLS_ROOT $ENV{DEFAULT_HEXAGON_TOOLS_ROOT})
endif()
endif()
file(TO_CMAKE_PATH "${HEXAGON_TOOLS_ROOT}" HEXAGON_TOOLS_ROOT)
file(TO_CMAKE_PATH "${HEXAGON_SDK_ROOT}" HEXAGON_SDK_ROOT)
#Get the Binary extension of the Hexagon Toolchain
if(CMAKE_HOST_SYSTEM_NAME STREQUAL Windows)
set(HEXAGON_TOOLCHAIN_SUFFIX .exe)
endif()
message(DEBUG "CMAKE_HOST_SYSTEM_NAME:${CMAKE_HOST_SYSTEM_NAME}")
include(${HEXAGON_SDK_ROOT}/build/cmake/hexagon_arch.cmake)
set(HEXAGON_TOOLCHAIN ${HEXAGON_TOOLS_ROOT})
set(HEXAGON_LIB_DIR "${HEXAGON_TOOLCHAIN}/Tools/target/hexagon/lib")
set(HEXAGON_ISS_DIR ${HEXAGON_TOOLCHAIN}/Tools/lib/iss)
set(CMAKE_TRY_COMPILE_PLATFORM_VARIABLES
HEXAGON_SDK_ROOT
HEXAGON_TOOLS_ROOT
)
#QURT Related includes and linker flags
set(V_ARCH ${HEXAGON_ARCH})
set(_QURT_INSTALL_DIR "${HEXAGON_SDK_ROOT}/rtos/qurt/ADSP${V_ARCH}MP${V_ARCH_EXTN}")
set(_QURT_INSTALL_DIR "${HEXAGON_SDK_ROOT}/rtos/qurt/compute${V_ARCH}${V_ARCH_EXTN}")
if( ${TREE} MATCHES PAKMAN )
set(_QURT_INSTALL_DIR "${QURT_IMAGE_DIR}/compute${V_ARCH}${V_ARCH_EXTN}")
endif()
message(DEBUG "_QURT_INSTALL_DIR:${_QURT_INSTALL_DIR}")
set(RTOS_DIR ${_QURT_INSTALL_DIR})
set(QCC_DIR "${HEXAGON_QCC_DIR}/${V_ARCH}/G0")
set(TARGET_DIR "${HEXAGON_LIB_DIR}/${V_ARCH}/G0")
include_directories(
${_QURT_INSTALL_DIR}/include
${_QURT_INSTALL_DIR}/include/qurt
${_QURT_INSTALL_DIR}/include/posix
)
set(QURT_START_LINK_LIBS)
set(QURT_START_LINK_LIBS
"${TARGET_DIR}/init.o"
"${RTOS_DIR}/lib/crt1.o"
"${RTOS_DIR}/lib/debugmon.o"
"${RTOS_DIR}/lib/libqurt.a"
"${TARGET_DIR}/libc.a"
"${TARGET_DIR}/libqcc.a"
"${TARGET_DIR}/libhexagon.a"
"${RTOS_DIR}/lib/libqurtcfs.a"
"${RTOS_DIR}/lib/libtimer_island.a"
"${RTOS_DIR}/lib/libtimer_main.a"
"${RTOS_DIR}/lib/libposix.a"
)
STRING(REPLACE ";" " " QURT_START_LINK_LIBS "${QURT_START_LINK_LIBS}")
set(QURT_END_LINK_LIBS
${TARGET_DIR}/fini.o
)
#Non QURT related includes and linker flags
set(TARGET_DIR_NOOS "${HEXAGON_TOOLCHAIN}/Tools/target/hexagon/lib/${HEXAGON_ARCH}")
if (NOT NO_WRAP_MEM_API)
set(WRAP_MALLOC -Wl,--wrap=malloc)
set(WRAP_CALLOC -Wl,--wrap=calloc)
set(WRAP_FREE -Wl,--wrap=free)
set(WRAP_REALLOC -Wl,--wrap=realloc)
set(WRAP_MEMALIGN -Wl,--wrap=memalign)
endif()
set(PIC_SHARED_LD_FLAGS
-mcpu=${V_ARCH} -m${V_ARCH} -mhvx=${V_ARCH}
-G0
-fpic
-Wl,-Bsymbolic
-Wl,-L${TARGET_DIR_NOOS}/G0/pic
-Wl,-L${HEXAGON_TOOLCHAIN}/Tools/target/hexagon/lib/
-Wl,--no-threads ${WRAP_MALLOC} ${WRAP_CALLOC} ${WRAP_FREE} ${WRAP_REALLOC} ${WRAP_MEMALIGN}
-shared
"-o <TARGET> <SONAME_FLAG><TARGET_SONAME>"
"<LINK_FLAGS>"
-Wl,--start-group
"<OBJECTS>"
"<LINK_LIBRARIES>"
-Wl,--end-group
-lc
)
STRING(REPLACE ";" " " PIC_SHARED_LD_FLAGS "${PIC_SHARED_LD_FLAGS}")
set(HEXAGON_PIC_SHARED_LINK_OPTIONS "${PIC_SHARED_LD_FLAGS}")
#System include paths
include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs)
include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs/stddef)
include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/ipc/fastrpc/incs)
#LLVM toolchain setup
#Compiler paths, options and architecture
set(CMAKE_C_COMPILER ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang${HEXAGON_TOOLCHAIN_SUFFIX})
set(CMAKE_CXX_COMPILER ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang++${HEXAGON_TOOLCHAIN_SUFFIX})
set(CMAKE_AR ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-ar${HEXAGON_TOOLCHAIN_SUFFIX})
set(CMAKE_ASM_COMPILER ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang++${HEXAGON_TOOLCHAIN_SUFFIX})
set(HEXAGON_LINKER ${CMAKE_C_COMPILER})
set(CMAKE_PREFIX_PATH ${HEXAGON_TOOLCHAIN}/Tools/target/hexagon)
set(CMAKE_SHARED_LIBRARY_SONAME_C_FLAG "-Wl,-soname,")
set(CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG "-Wl,-soname,")
#Compiler Options
set(COMMON_FLAGS "-mcpu=hexagon${V_ARCH} -m${V_ARCH} -mhvx=${V_ARCH} -fvectorize -Wall -Werror -fno-zero-initialized-in-bss -G0 -fdata-sections -fpic ${XQF_ARGS}")
set(CMAKE_CXX_FLAGS_DEBUG "${COMMON_FLAGS} -O0 -D_DEBUG -g")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O3 -g")
set(CMAKE_CXX_FLAGS_RELEASE "${COMMON_FLAGS} -O3")
set(CMAKE_C_FLAGS_DEBUG "${COMMON_FLAGS} -O0 -D_DEBUG -g")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O3 -g")
set(CMAKE_C_FLAGS_RELEASE "${COMMON_FLAGS} -O3")
set(CMAKE_ASM_FLAGS_DEBUG "${COMMON_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG}")
set(CMAKE_ASM_FLAGS_RELEASE "${COMMON_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE}")
set(CMAKE_ASM_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}" )
#Linker Options
set(CMAKE_C_CREATE_SHARED_LIBRARY "${HEXAGON_LINKER} ${HEXAGON_PIC_SHARED_LINK_OPTIONS}")
set(CMAKE_CXX_CREATE_SHARED_LIBRARY "${HEXAGON_LINKER} ${HEXAGON_PIC_SHARED_LINK_OPTIONS}")
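# A typical invocation of this toolchain file might look like the sketch below
# (paths are illustrative; the SDK and tools roots may also come from the
# HEXAGON_SDK_ROOT / HEXAGON_TOOLS_ROOT environment variables handled above):
#   cmake -B build -G Ninja \
#         -DCMAKE_TOOLCHAIN_FILE=<path/to/this/file> \
#         -DHEXAGON_SDK_ROOT=<sdk root> \
#         -DHEXAGON_TOOLS_ROOT=<tools root>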

View File

@ -0,0 +1,40 @@
#ifndef HTP_CTX_H
#define HTP_CTX_H
#include "htp-dma.h"
#include "worker-pool.h"
#include <assert.h>
#include <dspqueue.h>
#include <stdatomic.h>
#include <stdint.h>
#define HTP_MAX_NTHREADS 10
// FIXME: move these into matmul-ops
#define HTP_SPAD_SRC0_NROWS 16
#define HTP_SPAD_SRC1_NROWS 16
#define HTP_SPAD_DST_NROWS 2
// Main context for htp DSP backend
struct htp_context {
dspqueue_t queue;
dma_queue * dma[HTP_MAX_NTHREADS];
worker_pool_context_t worker_pool;
uint32_t n_threads;
int thread_id;
int thread_prio;
uint8_t * vtcm_base;
size_t vtcm_size;
uint32_t vtcm_rctx;
atomic_bool vtcm_valid;
atomic_bool vtcm_inuse;
atomic_bool vtcm_needs_release;
uint32_t opmask;
};
#endif /* HTP_CTX_H */
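// Sketch of how the opmask is meant to gate pipeline stages (the bit values
// are defined in htp-msg.h; the surrounding dispatch code is assumed here):
//   if (ctx->opmask & HTP_OPMASK_QUANTIZE) { /* run dynamic quantization */ }
//   if (ctx->opmask & HTP_OPMASK_COMPUTE)  { /* run the op kernel */ }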

View File

@ -0,0 +1,69 @@
#include "htp-dma.h"
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#pragma clang diagnostic ignored "-Wunused-function"
static inline uint32_t pow2_ceil(uint32_t x) {
if (x <= 1) {
return 1;
}
int p = 2;
x--;
while (x >>= 1) {
p <<= 1;
}
return p;
}
dma_queue * dma_queue_create(size_t capacity) {
dma_queue * q = (dma_queue *) memalign(32, sizeof(dma_queue));
if (q == NULL) {
FARF(ERROR, "%s: failed to allocate DMA queue\n", __FUNCTION__);
return NULL;
}
capacity = pow2_ceil(capacity);
memset(q, 0, sizeof(dma_queue));
q->capacity = capacity;
q->idx_mask = capacity - 1;
q->desc = (hexagon_udma_descriptor_type1_t *) memalign(64, capacity * sizeof(hexagon_udma_descriptor_type1_t));
q->dst = (void **) memalign(4, capacity * sizeof(void *));
if (!q->desc || !q->dst) { // either allocation failing is fatal; check before touching the memory
FARF(ERROR, "%s: failed to allocate DMA queue items\n", __FUNCTION__);
free(q->desc);
free(q->dst);
free(q);
return NULL;
}
memset(q->desc, 0, capacity * sizeof(hexagon_udma_descriptor_type1_t));
memset(q->dst, 0, capacity * sizeof(void *));
q->tail = &q->desc[capacity - 1];
FARF(HIGH, "dma-queue: capacity %u\n", capacity);
return q;
}
void dma_queue_delete(dma_queue * q) {
if (!q) {
return;
}
free(q->desc);
free(q->dst);
free(q);
}
void dma_queue_flush(dma_queue * q) {
while (1) {
uint32_t s = dmwait() & 0x3;
if (s == HEXAGON_UDMA_DM0_STATUS_IDLE) {
break;
}
}
q->tail = NULL;
}
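// A minimal usage sketch of the queue lifecycle (have_rows(), consume() and
// the buffer pointers are hypothetical stand-ins for the real matmul callers;
// push/pop are defined in htp-dma.h):
//   dma_queue * q = dma_queue_create(8); // capacity is rounded up to a power of two
//   while (have_rows()) {
//       while (!dma_queue_push(q, vtcm_dst, ddr_src, dst_row_size, src_row_size, nrows)) {
//           consume(dma_queue_pop(q)); // ring full: retire the oldest transfer first
//       }
//   }
//   uint8_t * done;
//   while ((done = dma_queue_pop(q)) != NULL) consume(done);
//   dma_queue_flush(q);
//   dma_queue_delete(q);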

View File

@ -0,0 +1,119 @@
#ifndef HTP_DMA_H
#define HTP_DMA_H
#include <HAP_farf.h>
#include <hexagon_protos.h>
#include <hexagon_types.h>
#include <stdbool.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
typedef struct {
hexagon_udma_descriptor_type1_t * desc; // descriptor pointers
hexagon_udma_descriptor_type1_t * tail; // tail pointer
void ** dst; // dst pointers
uint32_t push_idx;
uint32_t pop_idx;
uint32_t capacity;
uint32_t idx_mask;
} dma_queue;
dma_queue * dma_queue_create(size_t capacity);
void dma_queue_delete(dma_queue * q);
void dma_queue_flush(dma_queue * q);
// TODO: technically we don't need these and could use Q6_dmstart/wait/etc instead
// but those do not always seem to compile properly.
static inline void dmstart(void * next) {
asm volatile(" release(%0):at" : : "r"(next));
asm volatile(" dmstart(%0)" : : "r"(next));
}
static inline void dmlink(void * cur, void * next) {
asm volatile(" release(%0):at" : : "r"(next));
asm volatile(" dmlink(%0, %1)" : : "r"(cur), "r"(next));
}
static inline unsigned int dmpoll(void) {
unsigned int ret = 0;
asm volatile(" %0 = dmpoll" : "=r"(ret) : : "memory");
return ret;
}
static inline unsigned int dmwait(void) {
unsigned int ret = 0;
asm volatile(" %0 = dmwait" : "=r"(ret) : : "memory");
return ret;
}
static inline bool dma_queue_push(dma_queue * q,
void * dst,
const void * src,
size_t dst_row_size,
size_t src_row_size,
size_t nrows) {
if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) {
return false;
}
hexagon_udma_descriptor_type1_t * desc = &q->desc[q->push_idx];
desc->next = NULL;
desc->length = 0;
desc->desctype = HEXAGON_UDMA_DESC_DESCTYPE_TYPE1;
desc->dstbypass = 1;
desc->srcbypass = 1;
desc->order = 0;
desc->dstate = HEXAGON_UDMA_DESC_DSTATE_INCOMPLETE;
desc->src = (void *) src;
desc->dst = (void *) dst;
desc->allocation = 0;
desc->padding = 0;
desc->roiwidth = src_row_size;
desc->roiheight = nrows;
desc->srcstride = src_row_size;
desc->dststride = dst_row_size;
desc->srcwidthoffset = 0;
desc->dstwidthoffset = 0;
q->dst[q->push_idx] = dst;
dmlink(q->tail, desc);
q->tail = desc;
// FARF(ERROR, "dma-push: i %u len %u dst %p src %p\n", q->push_idx, len, dst, src);
q->push_idx = (q->push_idx + 1) & q->idx_mask;
return true;
}
static inline uint8_t * dma_queue_pop(dma_queue * q) {
if (q->push_idx == q->pop_idx) {
return NULL;
}
hexagon_udma_descriptor_type1_t * desc = &q->desc[q->pop_idx];
// Wait for desc to complete
while (1) {
dmpoll();
if (desc->dstate == HEXAGON_UDMA_DESC_DSTATE_COMPLETE) {
break;
}
// FARF(ERROR, "dma-pop: waiting for DMA : %u\n", q->pop_idx);
}
uint8_t * dst = (uint8_t *) q->dst[q->pop_idx];
// FARF(ERROR, "dma-pop: i %u dst %p\n", q->pop_idx, dst);
q->pop_idx = (q->pop_idx + 1) & q->idx_mask;
return dst;
}
#ifdef __cplusplus
} // extern "C"
#endif
#endif /* HTP_DMA_H */
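// Worked example of the power-of-two ring arithmetic used by push/pop, with
// capacity = 8 (idx_mask = 7):
//   empty : push_idx == pop_idx
//   full  : ((push_idx + 1) & 7) == pop_idx, i.e. one slot is kept free so
//           full and empty remain distinguishable (at most 7 transfers in flight)
//   wrap  : index 7 advances to (7 + 1) & 7 == 0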

View File

@ -0,0 +1,156 @@
#ifndef HTP_MSG_H
#define HTP_MSG_H
#include <assert.h>
// ggml-common.h must be included prior to this header
// Mask to enable various stages of the Ops.
// Used for debugging and profiling.
enum {
HTP_OPMASK_QUEUE = (1 << 0), // Enable queueing (i.e. calls into the DSP)
HTP_OPMASK_QUANTIZE = (1 << 1), // Enable Quantize
HTP_OPMASK_COMPUTE = (1 << 2), // Enable Compute
};
// Op flags
enum {
HTP_OPFLAGS_SKIP_QUANTIZE = (1 << 0), // Skip dynamic quantization (reuse quantized tensors)
HTP_OPFLAGS_SKIP_COMPUTE = (1 << 1), // Skip actual computation (used for profiling)
HTP_OPFLAGS_EARLY_WAKEUP = (1 << 2) // Send early wakeup notification
};
enum htp_status {
HTP_STATUS_OK = 1,
HTP_STATUS_INTERNAL_ERR = 2,
HTP_STATUS_NO_SUPPORT = 3,
HTP_STATUS_INVAL_PARAMS = 4,
HTP_STATUS_VTCM_TOO_SMALL = 5,
};
// The values must match the ggml_type.
// Duplicated here because we can't include full ggml.h in the htp build.
// We have some static_asserts in the cpp code to ensure things are in sync.
enum htp_data_type {
HTP_TYPE_F32 = 0,
HTP_TYPE_F16 = 1,
HTP_TYPE_Q4_0 = 2,
HTP_TYPE_Q8_0 = 8,
HTP_TYPE_MXFP4 = 39,
HTP_TYPE_COUNT
};
// These values are manually translated over to HTP
// !!!! DO NOT ALTER THE ORDER OF THE FIRST FOUR ENUMS !!!!
enum htp_op {
HTP_OP_MUL = 0,
HTP_OP_ADD = 1,
HTP_OP_SUB = 2,
HTP_OP_DIV = 3,
HTP_OP_MUL_MAT = 4,
HTP_OP_MUL_MAT_ID = 5,
HTP_OP_RMS_NORM = 6,
HTP_OP_UNARY_SILU = 7,
HTP_OP_GLU_SWIGLU = 8,
HTP_OP_GLU_SWIGLU_OAI = 9,
HTP_OP_SOFTMAX = 10,
HTP_OP_ADD_ID = 11,
HTP_OP_ROPE = 12,
INVALID
};
static inline size_t htp_type_block_size(uint32_t t) {
switch (t) {
case HTP_TYPE_F32:
return 1;
case HTP_TYPE_F16:
return 1;
case HTP_TYPE_Q4_0:
return QK4_0;
case HTP_TYPE_Q8_0:
return QK8_0;
case HTP_TYPE_MXFP4:
return QK_MXFP4;
default:
assert(0 && "unsupported HTP data type");
}
return 0;
}
static inline size_t htp_type_nbytes(uint32_t t) {
switch (t) {
case HTP_TYPE_F32:
return 4;
case HTP_TYPE_F16:
return 2;
case HTP_TYPE_Q4_0:
return sizeof(block_q4_0);
case HTP_TYPE_Q8_0:
return sizeof(block_q8_0);
case HTP_TYPE_MXFP4:
return sizeof(block_mxfp4);
default:
assert(0 && "unsupported HTP data type");
}
return 0;
}
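// Sketch: with the two helpers above, the byte size of one row of ne0 elements
// follows the usual ggml convention (assumes ne0 is a multiple of the block size):
//   size_t row_size = (ne0 / htp_type_block_size(t)) * htp_type_nbytes(t);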
static const char * htp_type_name(uint32_t t) {
switch (t) {
case HTP_TYPE_F32:
return "fp32";
case HTP_TYPE_F16:
return "fp16";
case HTP_TYPE_Q4_0:
return "q4_0";
case HTP_TYPE_Q8_0:
return "q8_0";
case HTP_TYPE_MXFP4:
return "mxfp4";
default:
break;
}
return "unknown"; // avoid handing NULL to %s-style callers
}
// Internal types
#define QK_Q4_0x4x2 256 // 4x Q4_0 blocks packed with next 4x Q4_0 blocks (size in bytes 128)
#define QK_Q8_0x4x2 256 // 4x Q8_0 blocks concat with next 4x Q8_0 blocks
#define QK_MXFP4x4x2 256 // 4x MXFP4 blocks concat with next 4x MXFP4 blocks
#define HTP_MAX_DIMS 4
struct htp_tensor {
uint32_t data; // Buffer offset in the messages, and data pointer on the NSP
uint32_t type; // Data type
uint32_t ne[HTP_MAX_DIMS]; // Number of elements
uint32_t nb[HTP_MAX_DIMS]; // Stride in bytes (see ggml.h ggml_tensor)
};
#define HTP_MAX_OP_PARAMS 64
struct htp_general_req {
uint32_t op; // GGML/HTP Op
int32_t op_params[HTP_MAX_OP_PARAMS / sizeof(int32_t)];
// Params for the op, e.g. epsilon of RMS norm
uint32_t flags; // Request flags
struct htp_tensor src0; // Input0 tensor
struct htp_tensor src1; // Input1 tensor
struct htp_tensor src2; // Input2 tensor
struct htp_tensor dst; // Output tensor
// should be multiple of 64 bytes (cacheline)
};
struct htp_general_rsp {
uint32_t op; // GGML/HTP Op
uint32_t status; // HTP_STATUS_...
uint32_t prof_usecs; // Number of usec per request
uint32_t prof_cycles; // Number of cycles per request
uint32_t prof_pkts; // Number of instruction packets per request
uint8_t unused[44]; // Pad to 64 bytes
};
#define HTP_MAX_MESSAGE_SIZE sizeof(struct htp_general_req)
#define HTP_MAX_PACKET_BUFFERS 4
#endif /* HTP_MSG_H */
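// Sketch of the host-side checks mentioned above that keep these values in
// sync with ggml_type (they live in the cpp code, where ggml.h is available):
//   static_assert((int) HTP_TYPE_F32  == (int) GGML_TYPE_F32,  "type mismatch");
//   static_assert((int) HTP_TYPE_Q4_0 == (int) GGML_TYPE_Q4_0, "type mismatch");
//   static_assert((int) HTP_TYPE_Q8_0 == (int) GGML_TYPE_Q8_0, "type mismatch");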

View File

@ -0,0 +1,53 @@
#ifndef HTP_OPS_H
#define HTP_OPS_H
#include "htp-ctx.h"
#include "htp-msg.h"
#include "worker-pool.h"
#include <assert.h>
#include <stdint.h>
// ggml-common.h must be included prior to this header
struct htp_spad {
uint8_t * data;
size_t size;
size_t size_per_thread;
};
struct htp_ops_context {
struct htp_context * ctx;
enum htp_op op;
int32_t op_params[HTP_MAX_OP_PARAMS / sizeof(int32_t)];
struct htp_tensor src0;
struct htp_tensor src1;
struct htp_tensor src2;
struct htp_tensor dst;
struct htp_spad src0_spad;
struct htp_spad src1_spad;
struct htp_spad src2_spad;
struct htp_spad dst_spad;
worker_pool_context_t * wpool; // worker pool
uint32_t n_threads; // num threads
uint32_t src0_nrows_per_thread;
uint32_t src1_nrows_per_thread;
uint32_t flags;
};
int op_matmul(struct htp_ops_context * octx);
int op_matmul_id(struct htp_ops_context * octx);
int op_binary(struct htp_ops_context * octx);
int op_unary(struct htp_ops_context * octx);
int op_activations(struct htp_ops_context * octx);
int op_softmax(struct htp_ops_context * octx);
int op_add_id(struct htp_ops_context * octx);
int op_rope(struct htp_ops_context * octx);
#endif /* HTP_OPS_H */

View File

@ -0,0 +1,16 @@
// FastRPC IDL interface for GGML HTP
#ifndef HTP_IDL
#define HTP_IDL
#include "AEEStdDef.idl"
#include "remote.idl"
interface htp_iface : remote_handle64 {
AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx);
AEEResult stop();
AEEResult enable_etm();
AEEResult disable_etm();
};
#endif /* HTP_IDL */

View File

@ -0,0 +1,80 @@
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wunused-function"
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
#include <hexagon_protos.h>
#include <hexagon_types.h>
#include <math.h>
#include <string.h>
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-dma.h"
#include "htp-msg.h"
#include "htp-ops.h"
#include "hvx-utils.h"
#include "ops-utils.h"
void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, bool negate) {
int left_over = num_elems & (VLEN_FP32 - 1);
int num_elems_whole = num_elems - left_over;
int unaligned_addr = 0;
int unaligned_loop = 0;
if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
FARF(HIGH, "hvx_exp_f32: unaligned address in hvx op, possibly slower execution\n");
unaligned_addr = 1;
}
// assert((0 == unaligned_addr) || (0 == num_elems_whole));
if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
unaligned_loop = 1;
FARF(HIGH, "hvx_exp_f32: unaligned loop in hvx op, possibly slower execution\n");
}
HVX_Vector vec_out = Q6_V_vzero();
if (0 == unaligned_loop) {
HVX_Vector * p_vec_in1 = (HVX_Vector *) src;
HVX_Vector * p_vec_out = (HVX_Vector *) dst;
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
if (true == negate) {
HVX_Vector neg_vec_in = hvx_vec_neg_fp32(*p_vec_in1++);
*p_vec_out++ = hvx_vec_exp_fp32(neg_vec_in);
} else {
*p_vec_out++ = hvx_vec_exp_fp32(*p_vec_in1++);
}
}
} else {
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
if (true == negate) {
HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in);
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32(neg_vec_in);
} else {
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32(in);
}
}
}
if (left_over > 0) {
const float * srcf = (float *) src + num_elems_whole;
float * dstf = (float *) dst + num_elems_whole;
HVX_Vector in = *(HVX_UVector *) srcf;
if (true == negate) {
HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in);
vec_out = hvx_vec_exp_fp32(neg_vec_in);
} else {
vec_out = hvx_vec_exp_fp32(in);
}
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, vec_out);
}
}
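// Scalar reference for the vector path above, handy for unit checks
// (a sketch; not part of the HTP build):
static void ref_exp_f32(const float * src, float * dst, int n, bool negate) {
    for (int i = 0; i < n; i++) {
        dst[i] = expf(negate ? -src[i] : src[i]);
    }
}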

View File

@ -0,0 +1,60 @@
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wunused-function"
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
#include <hexagon_protos.h>
#include <hexagon_types.h>
#include <math.h>
#include <string.h>
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-dma.h"
#include "htp-msg.h"
#include "htp-ops.h"
#include "hvx-utils.h"
#include "ops-utils.h"
void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
int left_over = num_elems & (VLEN_FP32 - 1);
int num_elems_whole = num_elems - left_over;
int unaligned_addr = 0;
int unaligned_loop = 0;
if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
FARF(HIGH, "hvx_inverse_f32: unaligned address in hvx op, possibly slower execution\n");
unaligned_addr = 1;
}
// assert((0 == unaligned_addr) || (0 == num_elems_whole));
if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
unaligned_loop = 1;
FARF(HIGH, "hvx_inverse_f32: unaligned loop in hvx op, possibly slower execution\n");
}
if (0 == unaligned_loop) {
HVX_Vector * p_vec_in = (HVX_Vector *) src;
HVX_Vector * p_vec_out = (HVX_Vector *) dst;
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
*p_vec_out++ = hvx_vec_inverse_fp32(*p_vec_in++);
}
} else {
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32(in);
}
}
if (left_over > 0) {
const float * srcf = (float *) src + num_elems_whole;
float * dstf = (float *) dst + num_elems_whole;
HVX_Vector in = *(HVX_UVector *) srcf;
HVX_Vector out = hvx_vec_inverse_fp32(in);
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out);
}
}
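// Scalar reference for the vector path above (a sketch for unit checks):
static void ref_inverse_f32(const float * src, float * dst, int n) {
    for (int i = 0; i < n; i++) {
        dst[i] = 1.0f / src[i];
    }
}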

View File

@ -0,0 +1,49 @@
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wunused-function"
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
#include <hexagon_protos.h>
#include <hexagon_types.h>
#include <math.h>
#include <string.h>
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-dma.h"
#include "htp-msg.h"
#include "htp-ops.h"
#include "hvx-utils.h"
#include "ops-utils.h"
#if 0
// Reference algo used in hvx-utils
static void fast_sigmoid_f32(const float* restrict src, float* restrict dst, const int num_elems)
{
const float c1 = 0.03138777;
const float c2 = 0.276281267;
const float c_log2f = 1.442695022;
int32_t store_ints[32];
float store_floats[3][32];
for (int i = 0; i < num_elems; i++)
{
float v = src[i];
v *= c_log2f*0.5;
int intPart = (int)v;
float x = (v - intPart);
float xx = x * x;
float v1 = c_log2f + c2 * xx;
float v2 = x + xx * c1 * x;
float v3 = (v2 + v1);
*((int*)&v3) += intPart << 24;
float v4 = v2 - v1;
float v5 = v3 - v4;
float res = v3 / v5;
dst[i] = res;
}
}
#endif
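// For reference, the function approximated above is the logistic sigmoid;
// a libm-based spot check might look like this (tolerance is illustrative):
//   float ref = 1.0f / (1.0f + expf(-x));
//   assert(fabsf(approx - ref) < 1e-2f);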

View File

@ -0,0 +1,947 @@
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wunused-function"
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
#ifdef HTP_DEBUG
# define FARF_HIGH 1
#endif
#include <HAP_farf.h>
#include <HAP_mem.h>
#include <HAP_perf.h>
#include <HAP_ps.h>
#include <hexagon_protos.h>
#include <hexagon_types.h>
#include <math.h>
#include <string.h>
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "hvx-utils.h"
#define htp_binary_ops_preamble \
int step_of_4 = num_elems >> 7; \
int step_of_2 = (num_elems - step_of_4 * VLEN_FP32 * 4) >> 6; \
int step_of_1 = (num_elems - step_of_4 * VLEN_FP32 * 4 - step_of_2 * VLEN_FP32 * 2) >> 5; \
int remaining = num_elems - step_of_4 * VLEN_FP32 * 4 - step_of_2 * VLEN_FP32 * 2 - step_of_1 * VLEN_FP32; \
\
const uint8_t * restrict src0_curr = src0; \
const uint8_t * restrict src1_curr = src1; \
uint8_t * restrict dst_curr = dst;
void hvx_mul_f32(const uint8_t * restrict src0,
const uint8_t * restrict src1,
uint8_t * restrict dst,
const int num_elems) {
int left_over = num_elems & (VLEN_FP32 - 1);
int num_elems_whole = num_elems - left_over;
int unaligned_addr = 0;
int unaligned_loop = 0;
if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) ||
(0 == htp_is_aligned((void *) dst, VLEN))) {
FARF(HIGH, "hvx_mul_f32: unaligned address in hvx op, possibly slower execution\n");
unaligned_addr = 1;
}
if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
unaligned_loop = 1;
FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n");
}
if (0 == unaligned_loop) {
HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0;
HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1;
HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, *vec_in2++);
*vec_out++ = Q6_Vsf_equals_Vqf32(v);
}
} else {
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32);
HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32);
HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in1, in2);
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
}
}
if (left_over > 0) {
const float * src0f = (const float *) src0 + num_elems_whole;
const float * src1f = (const float *) src1 + num_elems_whole;
float * dstf = (float *) dst + num_elems_whole;
HVX_Vector in1 = *(HVX_UVector *) src0f;
HVX_Vector in2 = *(HVX_UVector *) src1f;
HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in1, in2);
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out));
}
}
void hvx_mul_f32_opt(const uint8_t * restrict src0,
const uint8_t * restrict src1,
uint8_t * restrict dst,
const int num_elems) {
htp_binary_ops_preamble;
for (int i = 0; i < step_of_4; i++) {
HVX_Vector v1a = *(HVX_Vector *) src0_curr;
HVX_Vector v1b = *(HVX_Vector *) src1_curr;
HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN);
HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(v1a, v1b);
HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN);
HVX_Vector v3a = *(HVX_Vector *) (src0_curr + 2 * VLEN);
HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v2a, v2b);
*(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1);
HVX_Vector v3b = *(HVX_Vector *) (src1_curr + 2 * VLEN);
HVX_Vector v4a = *(HVX_Vector *) (src0_curr + 3 * VLEN);
src0_curr += 4 * VLEN;
HVX_Vector v3 = Q6_Vqf32_vmpy_VsfVsf(v3a, v3b);
*(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2);
HVX_Vector v4b = *(HVX_Vector *) (src1_curr + 3 * VLEN);
*(HVX_Vector *) (dst_curr + 2 * VLEN) = Q6_Vsf_equals_Vqf32(v3);
HVX_Vector v4 = Q6_Vqf32_vmpy_VsfVsf(v4a, v4b);
src1_curr += 4 * VLEN;
*(HVX_Vector *) (dst_curr + 3 * VLEN) = Q6_Vsf_equals_Vqf32(v4);
dst_curr += 4 * VLEN;
}
for (int i = 0; i < step_of_2; i++) {
HVX_Vector v1a = *(HVX_Vector *) src0_curr;
HVX_Vector v1b = *(HVX_Vector *) src1_curr;
HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN);
HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(v1a, v1b);
HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN);
*(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1);
src0_curr += 2 * VLEN;
HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v2a, v2b);
src1_curr += 2 * VLEN;
*(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2);
dst_curr += 2 * VLEN;
}
for (int i = 0; i < step_of_1; i++) {
HVX_Vector va = *(HVX_Vector *) src0_curr;
src0_curr += VLEN;
HVX_Vector vb = *(HVX_Vector *) src1_curr;
src1_curr += VLEN;
HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(va, vb);
*(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v);
dst_curr += VLEN;
}
if (remaining > 0) {
HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*(HVX_Vector *) src0_curr, *(HVX_Vector *) src1_curr);
hvx_vec_store_u((void *) dst_curr, remaining * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(v));
}
}
void hvx_mul_mul_f32_opt(const uint8_t * restrict src0,
const uint8_t * restrict src1,
const uint8_t * restrict src2,
uint8_t * restrict dst,
const int num_elems) {
const uint8_t * restrict src0_curr = src0;
const uint8_t * restrict src1_curr = src1;
const uint8_t * restrict src2_curr = src2;
uint8_t * restrict dst_curr = dst;
int step_of_2 = num_elems >> 6;
int step_of_1 = (num_elems - step_of_2 * VLEN_FP32 * 2) >> 5;
int remaining = num_elems - step_of_2 * VLEN_FP32 * 2 - step_of_1 * VLEN_FP32;
for (int i = 0; i < step_of_2; i++) {
HVX_Vector v1a = *(HVX_Vector *) src0_curr;
HVX_Vector v1b = *(HVX_Vector *) src1_curr;
HVX_Vector v1c = *(HVX_Vector *) src2_curr;
HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN);
HVX_Vector v1_ = Q6_Vqf32_vmpy_VsfVsf(v1a, v1b);
HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v1_), v1c);
HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN);
*(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1);
HVX_Vector v2c = *(HVX_Vector *) (src2_curr + VLEN);
src0_curr += 2 * VLEN;
HVX_Vector v2_ = Q6_Vqf32_vmpy_VsfVsf(v2a, v2b);
HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v2_), v2c);
src1_curr += 2 * VLEN;
src2_curr += 2 * VLEN;
*(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2);
dst_curr += 2 * VLEN;
}
for (int i = 0; i < step_of_1; i++) {
HVX_Vector va = *(HVX_Vector *) src0_curr;
src0_curr += VLEN;
HVX_Vector vb = *(HVX_Vector *) src1_curr;
src1_curr += VLEN;
HVX_Vector vc = *(HVX_Vector *) src2_curr;
src2_curr += VLEN;
HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(va, vb);
HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v1), vc);
*(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v2);
dst_curr += VLEN;
}
if (remaining > 0) {
HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(*(HVX_Vector *) src0_curr, *(HVX_Vector *) src1_curr);
HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v1), *(HVX_Vector *) src2_curr);
hvx_vec_store_u((void *) dst_curr, remaining * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(v2));
}
}
void hvx_add_f32(const uint8_t * restrict src0,
const uint8_t * restrict src1,
uint8_t * restrict dst,
const int num_elems) {
int left_over = num_elems & (VLEN_FP32 - 1);
int num_elems_whole = num_elems - left_over;
int unaligned_addr = 0;
int unaligned_loop = 0;
if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) ||
(0 == htp_is_aligned((void *) dst, VLEN))) {
FARF(HIGH, "hvx_add_f32: unaligned address in hvx op, possibly slower execution\n");
unaligned_addr = 1;
}
if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
unaligned_loop = 1;
FARF(HIGH, "hvx_add_f32: unaligned loop in hvx op, possibly slower execution\n");
}
if (0 == unaligned_loop) {
HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0;
HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1;
HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(*vec_in1++, *vec_in2++);
*vec_out++ = Q6_Vsf_equals_Vqf32(v);
}
} else {
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32);
HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32);
HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in1, in2);
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
}
}
if (left_over > 0) {
const float * src0f = (const float *) src0 + num_elems_whole;
const float * src1f = (const float *) src1 + num_elems_whole;
float * dstf = (float *) dst + num_elems_whole;
HVX_Vector in1 = *(HVX_UVector *) src0f;
HVX_Vector in2 = *(HVX_UVector *) src1f;
HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in1, in2);
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out));
}
}
void hvx_add_f32_opt(const uint8_t * restrict src0,
const uint8_t * restrict src1,
uint8_t * restrict dst,
const int num_elems) {
htp_binary_ops_preamble;
for (int i = 0; i < step_of_4; i++) {
HVX_Vector v1a = *(HVX_Vector *) src0_curr;
HVX_Vector v1b = *(HVX_Vector *) src1_curr;
HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN);
HVX_Vector v1 = Q6_Vqf32_vadd_VsfVsf(v1a, v1b);
HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN);
HVX_Vector v3a = *(HVX_Vector *) (src0_curr + 2 * VLEN);
HVX_Vector v2 = Q6_Vqf32_vadd_VsfVsf(v2a, v2b);
*(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1);
HVX_Vector v3b = *(HVX_Vector *) (src1_curr + 2 * VLEN);
HVX_Vector v4a = *(HVX_Vector *) (src0_curr + 3 * VLEN);
src0_curr += 4 * VLEN;
HVX_Vector v3 = Q6_Vqf32_vadd_VsfVsf(v3a, v3b);
*(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2);
HVX_Vector v4b = *(HVX_Vector *) (src1_curr + 3 * VLEN);
*(HVX_Vector *) (dst_curr + 2 * VLEN) = Q6_Vsf_equals_Vqf32(v3);
HVX_Vector v4 = Q6_Vqf32_vadd_VsfVsf(v4a, v4b);
src1_curr += 4 * VLEN;
*(HVX_Vector *) (dst_curr + 3 * VLEN) = Q6_Vsf_equals_Vqf32(v4);
dst_curr += 4 * VLEN;
}
for (int i = 0; i < step_of_2; i++) {
HVX_Vector v1a = *(HVX_Vector *) src0_curr;
HVX_Vector v1b = *(HVX_Vector *) src1_curr;
HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN);
HVX_Vector v1 = Q6_Vqf32_vadd_VsfVsf(v1a, v1b);
HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN);
*(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1);
src0_curr += 2 * VLEN;
HVX_Vector v2 = Q6_Vqf32_vadd_VsfVsf(v2a, v2b);
src1_curr += 2 * VLEN;
*(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2);
dst_curr += 2 * VLEN;
}
for (int i = 0; i < step_of_1; i++) {
HVX_Vector va = *(HVX_Vector *) src0_curr;
src0_curr += VLEN;
HVX_Vector vb = *(HVX_Vector *) src1_curr;
src1_curr += VLEN;
HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(va, vb);
*(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v);
dst_curr += VLEN;
}
if (remaining > 0) {
HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(*(HVX_Vector *) src0_curr, *(HVX_Vector *) src1_curr);
hvx_vec_store_u((void *) dst_curr, remaining * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(v));
}
}
void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems) {
size_t left_over = num_elems & (VLEN_FP32 - 1);
size_t num_elems_whole = num_elems - left_over;
int unaligned_addr = 0;
int unaligned_loop = 0;
if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
FARF(HIGH, "hvx_add_scalar_f32: unaligned address in hvx op, possibly slower execution\n");
unaligned_addr = 1;
}
if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
unaligned_loop = 1;
FARF(HIGH, "hvx_add_scalar_f32: unaligned loop in hvx op, possibly slower execution\n");
}
HVX_Vector val_vec = hvx_vec_splat_fp32(val);
if (0 == unaligned_loop) {
HVX_Vector * restrict vec_in1 = (HVX_Vector *) src;
HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(*vec_in1++, val_vec);
*vec_out++ = Q6_Vsf_equals_Vqf32(v);
}
} else {
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec);
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
}
}
if (left_over > 0) {
const float * srcf = (const float *) src + num_elems_whole;
float * dstf = (float *) dst + num_elems_whole;
HVX_Vector in = *(HVX_UVector *) srcf;
HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec);
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out));
}
}
void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems) {
size_t left_over = num_elems & (VLEN_FP32 - 1);
size_t num_elems_whole = num_elems - left_over;
int unaligned_addr = 0;
int unaligned_loop = 0;
if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
FARF(HIGH, "hvx_mul_scalar_f32: unaligned address in hvx op, possibly slower execution\n");
unaligned_addr = 1;
}
if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
unaligned_loop = 1;
FARF(HIGH, "hvx_mul_scalar_f32: unaligned loop in hvx op, possibly slower execution\n");
}
HVX_Vector val_vec = hvx_vec_splat_fp32(val);
if (0 == unaligned_loop) {
HVX_Vector * restrict vec_in1 = (HVX_Vector *) src;
HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, val_vec);
*vec_out++ = Q6_Vsf_equals_Vqf32(v);
}
} else {
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, val_vec);
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
}
}
if (left_over > 0) {
const float * srcf = (const float *) src + num_elems_whole;
float * dstf = (float *) dst + num_elems_whole;
HVX_Vector in = *(HVX_UVector *) srcf;
HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, val_vec);
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out));
}
}
void hvx_sub_f32(const uint8_t * restrict src0,
const uint8_t * restrict src1,
uint8_t * restrict dst,
const int num_elems) {
size_t left_over = num_elems & (VLEN_FP32 - 1);
size_t num_elems_whole = num_elems - left_over;
int unaligned_addr = 0;
int unaligned_loop = 0;
if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) ||
(0 == htp_is_aligned((void *) dst, VLEN))) {
FARF(HIGH, "hvx_sub_f32: unaligned address in hvx op, possibly slower execution\n");
unaligned_addr = 1;
}
if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
unaligned_loop = 1;
FARF(HIGH, "hvx_sub_f32: unaligned loop in hvx op, possibly slower execution\n");
}
if (0 == unaligned_loop) {
HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0;
HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1;
HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*vec_in1++, *vec_in2++);
*vec_out++ = Q6_Vsf_equals_Vqf32(v);
}
} else {
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32);
HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32);
HVX_Vector out = Q6_Vqf32_vsub_VsfVsf(in1, in2);
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
}
}
if (left_over > 0) {
const float * src0f = (const float *) src0 + num_elems_whole;
const float * src1f = (const float *) src1 + num_elems_whole;
float * dstf = (float *) dst + num_elems_whole;
HVX_Vector in1 = *(HVX_UVector *) src0f;
HVX_Vector in2 = *(HVX_UVector *) src1f;
HVX_Vector out = Q6_Vqf32_vsub_VsfVsf(in1, in2);
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out));
}
}
void hvx_sub_f32_opt(const uint8_t * restrict src0,
const uint8_t * restrict src1,
uint8_t * restrict dst,
const int num_elems) {
htp_binary_ops_preamble;
for (int i = 0; i < step_of_4; i++) {
HVX_Vector v1a = *(HVX_Vector *) src0_curr;
HVX_Vector v1b = *(HVX_Vector *) src1_curr;
HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN);
HVX_Vector v1 = Q6_Vqf32_vsub_VsfVsf(v1a, v1b);
HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN);
HVX_Vector v3a = *(HVX_Vector *) (src0_curr + 2 * VLEN);
HVX_Vector v2 = Q6_Vqf32_vsub_VsfVsf(v2a, v2b);
*(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1);
HVX_Vector v3b = *(HVX_Vector *) (src1_curr + 2 * VLEN);
HVX_Vector v4a = *(HVX_Vector *) (src0_curr + 3 * VLEN);
src0_curr += 4 * VLEN;
HVX_Vector v3 = Q6_Vqf32_vsub_VsfVsf(v3a, v3b);
*(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2);
HVX_Vector v4b = *(HVX_Vector *) (src1_curr + 3 * VLEN);
*(HVX_Vector *) (dst_curr + 2 * VLEN) = Q6_Vsf_equals_Vqf32(v3);
HVX_Vector v4 = Q6_Vqf32_vsub_VsfVsf(v4a, v4b);
src1_curr += 4 * VLEN;
*(HVX_Vector *) (dst_curr + 3 * VLEN) = Q6_Vsf_equals_Vqf32(v4);
dst_curr += 4 * VLEN;
}
for (int i = 0; i < step_of_2; i++) {
HVX_Vector v1a = *(HVX_Vector *) src0_curr;
HVX_Vector v1b = *(HVX_Vector *) src1_curr;
HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN);
HVX_Vector v1 = Q6_Vqf32_vsub_VsfVsf(v1a, v1b);
HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN);
*(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1);
src0_curr += 2 * VLEN;
HVX_Vector v2 = Q6_Vqf32_vsub_VsfVsf(v2a, v2b);
src1_curr += 2 * VLEN;
*(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2);
dst_curr += 2 * VLEN;
}
for (int i = 0; i < step_of_1; i++) {
HVX_Vector va = *(HVX_Vector *) src0_curr;
src0_curr += VLEN;
HVX_Vector vb = *(HVX_Vector *) src1_curr;
src1_curr += VLEN;
HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(va, vb);
*(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v);
dst_curr += VLEN;
}
if (remaining > 0) {
HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*(HVX_Vector *) src0_curr, *(HVX_Vector *) src1_curr);
hvx_vec_store_u((void *) dst_curr, remaining * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(v));
}
}
void hvx_sub_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems) {
size_t left_over = num_elems & (VLEN_FP32 - 1);
size_t num_elems_whole = num_elems - left_over;
int unaligned_addr = 0;
int unaligned_loop = 0;
if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
FARF(HIGH, "hvx_sub_scalar_f32: unaligned address in hvx op, possibly slower execution\n");
unaligned_addr = 1;
}
if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
unaligned_loop = 1;
FARF(HIGH, "hvx_sub_scalar_f32: unaligned loop in hvx op, possibly slower execution\n");
}
HVX_Vector val_vec = hvx_vec_splat_fp32(val);
if (0 == unaligned_loop) {
HVX_Vector * restrict vec_in1 = (HVX_Vector *) src;
HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*vec_in1++, val_vec);
*vec_out++ = Q6_Vsf_equals_Vqf32(v);
}
} else {
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
HVX_Vector out = Q6_Vqf32_vsub_VsfVsf(in, val_vec);
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
}
}
if (left_over > 0) {
const float * srcf = (const float *) src + num_elems_whole;
float * dstf = (float *) dst + num_elems_whole;
HVX_Vector in = *(HVX_UVector *) srcf;
HVX_Vector out = Q6_Vqf32_vsub_VsfVsf(in, val_vec);
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out));
}
}
float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems) {
int left_over = num_elems & (VLEN_FP32 - 1);
int num_elems_whole = num_elems - left_over;
if (0 == htp_is_aligned((void *) src, VLEN)) {
FARF(HIGH, "hvx_sum_of_squares_f32: unaligned address in hvx op, possibly slower execution\n");
}
assert((1 == htp_is_aligned((void *) src, VLEN)) || (0 == num_elems_whole));
HVX_Vector * restrict vec_in1 = (HVX_Vector *) src;
HVX_Vector sum_vec_acc = Q6_V_vsplat_R(0x00000000);
HVX_Vector zero_vec = Q6_V_vsplat_R(0x00000000);
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1, *vec_in1);
sum_vec_acc = Q6_Vqf32_vadd_Vqf32Vqf32(sum_vec_acc, v);
vec_in1++;
}
if (left_over > 0) {
const float * srcf = (const float *) src + num_elems_whole;
HVX_Vector vec_left = *(HVX_UVector *) srcf;
HVX_Vector vec_left_sq = Q6_Vqf32_vmpy_VsfVsf(vec_left, vec_left);
HVX_Vector vec_tmp = Q6_V_valign_VVR(vec_left_sq, zero_vec, left_over * SIZEOF_FP32);
sum_vec_acc = Q6_Vqf32_vadd_Vqf32Vqf32(sum_vec_acc, vec_tmp);
}
HVX_Vector v = hvx_vec_qf32_reduce_sum(sum_vec_acc);
return hvx_vec_get_fp32(Q6_Vsf_equals_Vqf32(v));
}
float hvx_self_sum_f32(const uint8_t * restrict src, const int num_elems) {
int left_over = num_elems & (VLEN_FP32 - 1);
int num_elems_whole = num_elems - left_over;
int unaligned_addr = 0;
int unaligned_loop = 0;
if (0 == htp_is_aligned((void *) src, VLEN)) {
FARF(HIGH, "hvx_self_sum_f32: unaligned address in hvx op, possibly slower execution\n");
unaligned_addr = 1;
}
if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
unaligned_loop = 1;
FARF(HIGH, "hvx_self_sum_f32: unaligned loop in hvx op, possibly slower execution\n");
}
HVX_Vector sum_vec = Q6_V_vsplat_R(0x00000000);
HVX_Vector zero_vec = Q6_V_vsplat_R(0x00000000);
if (0 == unaligned_loop) {
HVX_Vector * vec_in = (HVX_Vector *) src;
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
// sum_vec = Q6_Vqf32_vadd_Vqf32Vsf(sum_vec, *vec_in++);
sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), *vec_in++);
}
} else {
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), in);
}
}
if (left_over > 0) {
const float * srcf = (const float *) src + num_elems_whole;
HVX_Vector vec_left = *(HVX_UVector *) srcf;
HVX_Vector vec_tmp = Q6_V_valign_VVR(vec_left, zero_vec, left_over * SIZEOF_FP32);
// sum_vec = Q6_Vqf32_vadd_Vqf32Vsf(sum_vec, vec_tmp);
sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), vec_tmp);
}
HVX_Vector v = hvx_vec_qf32_reduce_sum(sum_vec);
return hvx_vec_get_fp32(Q6_Vsf_equals_Vqf32(v));
}
void hvx_scale_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, const float scale) {
int left_over = num_elems & (VLEN_FP32 - 1);
int num_elems_whole = num_elems - left_over;
int unaligned_addr = 0;
int unaligned_loop = 0;
if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
FARF(HIGH, "hvx_scale_f32: unaligned address in hvx op, possibly slower execution\n");
unaligned_addr = 1;
}
if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
unaligned_loop = 1;
FARF(HIGH, "hvx_scale_f32: unaligned loop in hvx op, possibly slower execution\n");
}
HVX_Vector scale_vec = hvx_vec_splat_fp32(scale);
if (0 == unaligned_loop) {
HVX_Vector * vec_in1 = (HVX_Vector *) src;
HVX_Vector * vec_out = (HVX_Vector *) dst;
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, scale_vec);
*vec_out++ = Q6_Vsf_equals_Vqf32(v);
}
} else {
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, scale_vec);
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
}
}
if (left_over > 0) {
const float * srcf = (const float *) src + num_elems_whole;
float * dstf = (float *) dst + num_elems_whole;
HVX_Vector in = *(HVX_UVector *) srcf;
HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, scale_vec);
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out));
}
}
float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems) {
int left_over = num_elems & (VLEN_FP32 - 1);
int num_elems_whole = num_elems - left_over;
int unaligned_addr = 0;
int unaligned_loop = 0;
if (0 == htp_is_aligned((void *) src, VLEN)) {
FARF(HIGH, "hvx_self_max_f32: unaligned address in hvx op, possibly slower execution\n");
unaligned_addr = 1;
}
if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
unaligned_loop = 1;
FARF(HIGH, "hvx_self_max_f32: unaligned loop in hvx op, possibly slower execution\n");
}
HVX_Vector vec_max = hvx_vec_splat_fp32(((const float *) src)[0]);
HVX_Vector vec_first = hvx_vec_splat_fp32(((const float *) src)[0]);
if (0 == unaligned_loop) {
HVX_Vector * restrict vec_in = (HVX_Vector *) src;
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
vec_max = Q6_Vsf_vmax_VsfVsf(vec_max, *vec_in++);
}
} else {
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
vec_max = Q6_Vsf_vmax_VsfVsf(vec_max, in);
}
}
if (left_over > 0) {
const float * srcf = (const float *) src + num_elems_whole;
HVX_Vector in = *(HVX_UVector *) srcf;
HVX_Vector temp = Q6_V_valign_VVR(in, vec_first, left_over * SIZEOF_FP32);
vec_max = Q6_Vsf_vmax_VsfVsf(vec_max, temp);
}
HVX_Vector v = hvx_vec_reduce_max_fp32(vec_max);
return hvx_vec_get_fp32(v);
}
void hvx_min_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems) {
size_t left_over = num_elems & (VLEN_FP32 - 1);
size_t num_elems_whole = num_elems - left_over;
if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
FARF(HIGH, "hvx_min_scalar_f32: unaligned address in hvx op, possibly slower execution\n");
}
assert((1 == htp_is_aligned((void *) src, VLEN)) || (0 == num_elems_whole));
HVX_Vector val_vec = hvx_vec_splat_fp32(val); // splat the fp32 bit pattern, not a truncated integer
HVX_Vector * restrict vec_in = (HVX_Vector *) src;
HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
// element-wise min against the scalar; vmin already yields IEEE sf
*vec_out++ = Q6_Vsf_vmin_VsfVsf(val_vec, *vec_in++);
}
if (left_over > 0) {
const float * srcf = (const float *) src + num_elems_whole;
float * dstf = (float *) dst + num_elems_whole;
HVX_Vector in = *(HVX_UVector *) srcf;
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_vmin_VsfVsf(val_vec, in));
}
}
void hvx_clamp_scalar_f32(const uint8_t * restrict src,
const float limit_left,
const float limit_right,
uint8_t * restrict dst,
const int num_elems) {
size_t left_over = num_elems & (VLEN_FP32 - 1);
size_t num_elems_whole = num_elems - left_over;
if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
FARF(HIGH, "hvx_clamp_scalar_f32: unaligned address in hvx op, possibly slower execution\n");
}
assert((1 == htp_is_aligned((void *) src, VLEN)) || (0 == num_elems_whole));
HVX_Vector * restrict vec_in = (HVX_Vector *) src;
HVX_Vector * restrict vec_out = (HVX_Vector *) dst;
HVX_Vector range_left = hvx_vec_splat_fp32(limit_left);
HVX_Vector range_right = hvx_vec_splat_fp32(limit_right);
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector in_vec = *vec_in++;
HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, range_right);
HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(range_left, in_vec);
// chain the muxes: clamp to the right limit first, then clamp that result to
// the left limit; applying both muxes to the original input would discard the
// first clamp
in_vec = Q6_V_vmux_QVV(pred_cap_right, range_right, in_vec);
in_vec = Q6_V_vmux_QVV(pred_cap_left, range_left, in_vec);
*vec_out++ = in_vec; // vmux output is already IEEE sf; no qf32 conversion needed
}
if (left_over > 0) {
const float * srcf = (const float *) src + num_elems_whole;
float * dstf = (float *) dst + num_elems_whole;
HVX_Vector in = *(HVX_UVector *) srcf;
HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in, range_right);
HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(range_left, in);
in = Q6_V_vmux_QVV(pred_cap_right, range_right, in);
in = Q6_V_vmux_QVV(pred_cap_left, range_left, in);
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, in);
}
}
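// Scalar reference for hvx_clamp_scalar_f32 (a sketch for unit checks):
//   dst[i] = fminf(fmaxf(src[i], limit_left), limit_right);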

View File

@ -0,0 +1,998 @@
#ifndef HVX_UTILS_H
#define HVX_UTILS_H
#include "ops-utils.h"
#include <stdbool.h>
#include <stdint.h>
#define SIZEOF_FP32 (4)
#define SIZEOF_FP16 (2)
#define VLEN (128)
#define VLEN_FP32 (VLEN / SIZEOF_FP32)
#define VLEN_FP16 (VLEN / SIZEOF_FP16)
static inline HVX_Vector hvx_vec_splat_fp32(float i) {
union {
float f;
int32_t i;
} fp32 = { .f = i };
return Q6_V_vsplat_R(fp32.i);
}
static inline void hvx_vec_store_u(void * addr, uint32_t n, HVX_Vector v) {
// Rotate as needed.
v = Q6_V_vlalign_VVR(v, v, (size_t) addr);
uint32_t left_off = (size_t) addr & 127;
uint32_t right_off = left_off + n;
HVX_VectorPred ql_not = Q6_Q_vsetq_R((size_t) addr);
HVX_VectorPred qr = Q6_Q_vsetq2_R(right_off);
if (right_off > 128) {
Q6_vmem_QRIV(qr, (HVX_Vector *) addr + 1, v);
// all 1's
qr = Q6_Q_vcmp_eq_VbVb(v, v);
}
ql_not = Q6_Q_or_QQn(ql_not, qr);
Q6_vmem_QnRIV(ql_not, (HVX_Vector *) addr, v);
}
static inline void hvx_vec_store_a(void * ptr, size_t n, HVX_Vector v) {
assert((unsigned long) ptr % 128 == 0);
HVX_VectorPred ql_not = Q6_Q_vsetq_R((size_t) ptr);
HVX_VectorPred qr = Q6_Q_vsetq2_R(n);
ql_not = Q6_Q_or_QQn(ql_not, qr);
Q6_vmem_QnRIV(ql_not, (HVX_Vector *) ptr, v);
}
static inline HVX_Vector hvx_vec_repl4(HVX_Vector v) {
// vdelta control to replicate first 4 bytes across all elements
static const uint8_t __attribute__((aligned(128))) repl[128] = {
0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
0x40, 0x40, 0x40, 0x40, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
};
HVX_Vector ctrl = *(HVX_Vector *) repl;
return Q6_V_vdelta_VV(v, ctrl);
}
// copy n fp16 elements : source and destination are aligned to HVX Vector (128)
static inline void hvx_copy_fp16_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
HVX_Vector * restrict vdst = (HVX_Vector *) dst;
HVX_Vector * restrict vsrc = (HVX_Vector *) src;
assert((unsigned long) dst % 128 == 0);
assert((unsigned long) src % 128 == 0);
uint32_t nvec = n / 64;
uint32_t nloe = n % 64;
uint32_t i = 0;
#pragma unroll(4)
for (; i < nvec; i++) {
HVX_Vector v = vsrc[i];
vdst[i] = v;
}
if (nloe) {
HVX_Vector v = vsrc[i];
hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), v);
}
}
// copy n fp16 elements : source is aligned, destination is potentially unaligned
static inline void hvx_copy_fp16_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
HVX_UVector * restrict vdst = (HVX_UVector *) dst;
HVX_Vector * restrict vsrc = (HVX_Vector *) src;
assert((unsigned long) src % 128 == 0);
uint32_t nvec = n / 64;
uint32_t nloe = n % 64;
uint32_t i = 0;
#pragma unroll(4)
for (; i < nvec; i++) {
HVX_Vector v = vsrc[i];
vdst[i] = v;
}
if (nloe) {
HVX_Vector v = vsrc[i];
hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), v);
}
}
// copy n fp16 elements : source is potentially unaligned, destination is aligned
static inline void hvx_copy_fp16_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
HVX_Vector * restrict vdst = (HVX_Vector *) dst;
HVX_UVector * restrict vsrc = (HVX_UVector *) src;
assert((unsigned long) dst % 128 == 0);
uint32_t nvec = n / 64;
uint32_t nloe = n % 64;
uint32_t i = 0;
#pragma unroll(4)
for (; i < nvec; i++) {
HVX_Vector v = vsrc[i];
vdst[i] = v;
}
if (nloe) {
HVX_Vector v = vsrc[i];
hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), v);
}
}
// copy n fp32 elements : source and destination are aligned to HVX Vector (128)
static inline void hvx_copy_fp32_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
HVX_Vector * restrict vdst = (HVX_Vector *) dst;
HVX_Vector * restrict vsrc = (HVX_Vector *) src;
assert((unsigned long) dst % 128 == 0);
assert((unsigned long) src % 128 == 0);
uint32_t nvec = n / 32;
uint32_t nloe = n % 32;
uint32_t i = 0;
#pragma unroll(4)
for (; i < nvec; i++) {
HVX_Vector v = vsrc[i];
vdst[i] = v;
}
if (nloe) {
HVX_Vector v = vsrc[i];
hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(float), v);
}
}
// copy n fp32 elements : source is aligned, destination is unaligned
static inline void hvx_copy_fp32_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
HVX_UVector * restrict vdst = (HVX_UVector *) dst;
HVX_Vector * restrict vsrc = (HVX_Vector *) src;
assert((unsigned long) src % 128 == 0);
uint32_t nvec = n / 32;
uint32_t nloe = n % 32;
uint32_t i = 0;
#pragma unroll(4)
for (; i < nvec; i++) {
HVX_Vector v = vsrc[i];
vdst[i] = v;
}
if (nloe) {
HVX_Vector v = vsrc[i];
hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(float), v);
}
}
// copy n fp32 elements : source is unaligned, destination is aligned
static inline void hvx_copy_fp32_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
HVX_Vector * restrict vdst = (HVX_Vector *) dst;
HVX_UVector * restrict vsrc = (HVX_UVector *) src;
assert((unsigned long) dst % 128 == 0);
uint32_t nvec = n / 32;
uint32_t nloe = n % 32;
uint32_t i = 0;
#pragma unroll(4)
for (; i < nvec; i++) {
HVX_Vector v = vsrc[i];
vdst[i] = v;
}
if (nloe) {
HVX_Vector v = vsrc[i];
hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(float), v);
}
}
// bcast 1 fp32 element from source to n fp32 elements in destination : destination is aligned
static inline void hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t n) {
HVX_Vector * restrict vdst = (HVX_Vector *) dst;
HVX_Vector velem = hvx_vec_splat_fp32(elem);
assert((unsigned long) dst % 128 == 0);
uint32_t nvec = n / 32;
uint32_t nloe = n % 32;
uint32_t i = 0;
#pragma unroll(4)
for (; i < nvec; i++) {
vdst[i] = velem;
}
if (nloe) {
hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(float), velem);
}
}
static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) {
uint32_t left_off = (size_t) addr & (chunk_size - 1);
uint32_t right_off = left_off + n;
return right_off <= chunk_size;
}
static void hvx_vec_dump_fp16_n(char * pref, HVX_Vector v, uint32_t n) {
union {
HVX_Vector v;
__fp16 d[64];
} u = { .v = v };
const uint32_t n0 = n / 16;
const uint32_t n1 = n % 16;
int i = 0;
for (; i < n0; i++) {
htp_dump_fp16_line(pref, u.d + (16 * i), 16);
}
if (n1) {
htp_dump_fp16_line(pref, u.d + (16 * i), n1);
}
}
static void hvx_vec_dump_fp16(char * pref, HVX_Vector v) {
hvx_vec_dump_fp16_n(pref, v, 64);
}
static void hvx_vec_dump_fp32_n(char * pref, HVX_Vector v, uint32_t n) {
union {
HVX_Vector v;
float d[32];
} u = { .v = v };
const uint32_t n0 = n / 16;
const uint32_t n1 = n % 16;
int i = 0;
for (; i < n0; i++) {
htp_dump_fp32_line(pref, u.d + (16 * i), 16);
}
if (n1) {
htp_dump_fp32_line(pref, u.d + (16 * i), n1);
}
}
static void hvx_vec_dump_fp32_hmt(char * pref, HVX_Vector v) {
union {
HVX_Vector v;
float d[32];
} u = { .v = v };
FARF(HIGH, "%s: %.6f %.6f %.6f %.6f ... %.6f %.6f %.6f %.6f ... %.6f %.6f %.6f %.6f\n", pref, u.d[0], u.d[1],
u.d[2], u.d[3], u.d[12], u.d[13], u.d[14], u.d[15], u.d[28], u.d[29], u.d[30], u.d[31]);
}
static void hvx_vec_dump_fp32(char * pref, HVX_Vector v) {
hvx_vec_dump_fp32_n(pref, v, 32);
}
static void hvx_vec_dump_int32(char * pref, HVX_Vector v) {
union {
HVX_Vector v;
int32_t d[32];
} u = { .v = v };
for (int i = 0; i < 32 / 16; i++) {
htp_dump_int32_line(pref, u.d + (16 * i), 16);
}
}
static void hvx_vec_dump_int32_hmt(char * pref, HVX_Vector v) {
union {
HVX_Vector v;
int32_t d[32];
} u = { .v = v };
FARF(HIGH, "%s: %d %d %d %d ... %d %d %d %d ... %d %d %d %d\n", pref, u.d[0], u.d[1], u.d[2], u.d[3], u.d[12],
u.d[13], u.d[14], u.d[15], u.d[28], u.d[29], u.d[30], u.d[31]);
}
static void hvx_vec_dump_int8_hmt(char * pref, HVX_Vector v) {
union {
HVX_Vector v;
int8_t d[128];
} u = { .v = v };
FARF(HIGH, "%s: %d %d %d %d ... %d %d %d %d ... %d %d %d %d\n", pref, u.d[0], u.d[1], u.d[2], u.d[3], u.d[60],
u.d[61], u.d[62], u.d[63], u.d[124], u.d[125], u.d[126], u.d[127]);
}
static void hvx_vec_dump_int8(char * pref, HVX_Vector v) {
union {
HVX_Vector v;
int8_t d[128];
} u = { .v = v };
for (int i = 0; i < 128 / 16; i++) {
htp_dump_int8_line(pref, u.d + (16 * i), 16);
}
}
static void hvx_vec_dump_uint8(char * pref, HVX_Vector v) {
union {
HVX_Vector v;
uint8_t d[128];
} u = { .v = v };
for (int i = 0; i < 128 / 16; i++) {
htp_dump_uint8_line(pref, u.d + (16 * i), 16);
}
}
static bool hvx_vec_eq(HVX_Vector v0, HVX_Vector v1, size_t n) {
typedef union {
HVX_Vector v;
int8_t d[128];
} U;
U u0 = { .v = v0 };
U u1 = { .v = v1 };
for (int i = 0; i < n; i++) {
if (u0.d[i] != u1.d[i]) {
return false;
}
}
return true;
}
static inline float hvx_vec_get_fp32(HVX_Vector v) {
float __attribute__((aligned(128))) x;
hvx_vec_store_a(&x, 4, v);
return x;
}
static inline HVX_Vector hvx_vec_int32_reduce_sum_n(HVX_Vector in, unsigned int n) {
unsigned int total = n * 4; // total vec nbytes
unsigned int width = 4; // int32
HVX_Vector sum = in, sum_t;
while (width < total) {
sum_t = Q6_V_vror_VR(sum, width); // rotate right
sum = Q6_Vw_vadd_VwVw(sum_t, sum); // elementwise sum
width = width << 1;
}
return sum;
}
static inline HVX_Vector hvx_vec_int32_reduce_sum(HVX_Vector in) {
return hvx_vec_int32_reduce_sum_n(in, 32);
}
static inline HVX_Vector hvx_vec_qf32_reduce_sum_n(HVX_Vector in, unsigned int n) {
unsigned int total = n * 4; // total vec nbytes
unsigned int width = 4; // fp32 nbytes
HVX_Vector sum = in, sum_t;
while (width < total) {
sum_t = Q6_V_vror_VR(Q6_Vsf_equals_Vqf32(sum), width); // rotate right
sum = Q6_Vqf32_vadd_Vqf32Vsf(sum, sum_t); // elementwise sum
width = width << 1;
}
return sum;
}
static inline HVX_Vector hvx_vec_qf32_reduce_sum(HVX_Vector in) {
return hvx_vec_qf32_reduce_sum_n(in, 32);
}
static inline HVX_Vector hvx_vec_fp32_reduce_sum_n(HVX_Vector in, unsigned int n) {
unsigned int total = n * 4; // total vec nbytes
unsigned int width = 4; // fp32 nbytes
HVX_Vector sum = in, sum_t;
while (width < total) {
sum_t = Q6_V_vror_VR(sum, width); // rotate right
sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(sum, sum_t)); // elementwise sum
width = width << 1;
}
return sum;
}
static inline HVX_Vector hvx_vec_fp32_reduce_sum(HVX_Vector in) {
return hvx_vec_fp32_reduce_sum_n(in, 32);
}
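// Usage sketch (illustrative, not part of the build): the rotate-and-add
// reductions above leave the total replicated in every lane, so pairing them
// with hvx_vec_get_fp32() (defined above) yields a scalar sum:
//
//     HVX_Vector v   = *(const HVX_Vector *) src;  // 32 fp32 values
//     HVX_Vector acc = hvx_vec_fp32_reduce_sum(v); // sum broadcast to all lanes
//     float      sum = hvx_vec_get_fp32(acc);      // read lane 0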
static inline HVX_Vector hvx_vec_reduce_max_fp16(HVX_Vector in) {
unsigned total = 128; // total vec nbytes
unsigned width = 2; // fp16 nbytes
HVX_Vector _max = in, _max_t;
while (width < total) {
_max_t = Q6_V_vror_VR(_max, width); // rotate right
_max = Q6_Vhf_vmax_VhfVhf(_max_t, _max); // elementwise max
width = width << 1;
}
return _max;
}
static inline HVX_Vector hvx_vec_reduce_max2_fp16(HVX_Vector in, HVX_Vector _max) {
unsigned total = 128; // total vec nbytes
    unsigned width = 2;   // fp16 nbytes
HVX_Vector _max_t;
_max = Q6_Vhf_vmax_VhfVhf(in, _max);
while (width < total) {
_max_t = Q6_V_vror_VR(_max, width); // rotate right
_max = Q6_Vhf_vmax_VhfVhf(_max_t, _max); // elementwise max
width = width << 1;
}
return _max;
}
static inline HVX_Vector hvx_vec_reduce_max_fp32(HVX_Vector in) {
unsigned total = 128; // total vec nbytes
unsigned width = 4; // fp32 nbytes
HVX_Vector _max = in, _max_t;
while (width < total) {
_max_t = Q6_V_vror_VR(_max, width); // rotate right
_max = Q6_Vsf_vmax_VsfVsf(_max_t, _max); // elementwise max
width = width << 1;
}
return _max;
}
static inline HVX_Vector hvx_vec_reduce_max2_fp32(HVX_Vector in, HVX_Vector _max) {
unsigned total = 128; // total vec nbytes
unsigned width = 4; // fp32 nbytes
HVX_Vector _max_t;
_max = Q6_Vsf_vmax_VsfVsf(in, _max);
while (width < total) {
_max_t = Q6_V_vror_VR(_max, width); // rotate right
_max = Q6_Vsf_vmax_VsfVsf(_max_t, _max); // elementwise max
width = width << 1;
}
return _max;
}
static inline HVX_Vector hvx_vec_abs_fp16(HVX_Vector v) {
// abs by clearing the fp16 sign bit
HVX_Vector mask = Q6_Vh_vsplat_R(0x7fff);
return Q6_V_vand_VV(v, mask);
}
static inline HVX_Vector hvx_vec_neg_fp16(HVX_Vector v) {
    // force the fp16 sign bit on: this yields -|x|, which equals -x only for
    // non-negative inputs (true negation would XOR the sign bit instead)
    HVX_Vector mask = Q6_Vh_vsplat_R(0x8000);
    return Q6_V_vor_VV(v, mask);
}
static inline HVX_Vector hvx_vec_abs_fp32(HVX_Vector v) {
// abs by clearing the fp32 sign bit
HVX_Vector mask = Q6_V_vsplat_R(0x7fffffff);
return Q6_V_vand_VV(v, mask);
}
static inline HVX_Vector hvx_vec_neg_fp32(HVX_Vector v) {
#if __HTP_ARCH__ > 75
return Q6_Vsf_vfneg_Vsf(v);
#else
    // force the fp32 sign bit on: this yields -|x|, which equals -x only for
    // non-negative inputs (unlike the Q6_Vsf_vfneg_Vsf path above, which
    // toggles the sign bit)
    HVX_Vector mask = Q6_V_vsplat_R(0x80000000);
    return Q6_V_vor_VV(v, mask);
#endif // __HTP_ARCH__ > 75
}
// ====================================================
// FUNCTION: 1/(x+1) y(0) = 1, y(0.5) = 0.6667, y(1) = 0.5
// Order:3; continuity: True; Ends forced: True
// Mode: unsigned; Result fractional bits: 14
// Peak Error: 1.1295e-04 Rms Error: 2.8410e-05 Mean Error: 1.1370e-05
// 32769 -32706 31252 -10589
// 32590 -30635 22793 -4493
// 32066 -27505 16481 -2348
// 31205 -24054 11849 -1306
static inline HVX_Vector hvx_vec_recip_xp1_O3_unsigned(HVX_Vector vx) {
// input is 0..0xffff representing 0.0 .. 1.0
HVX_Vector p;
p = Q6_Vh_vlut4_VuhPh(vx, 0xFAE6F6D4EE73D6A3ull);
p = Q6_Vh_vmpa_VhVhVuhPuh_sat(p, vx, 0x2E49406159097A14ull);
p = Q6_Vh_vmps_VhVhVuhPuh_sat(p, vx, 0x5DF66B7177AB7FC2ull);
p = Q6_Vh_vmpa_VhVhVuhPuh_sat(p, vx, 0x79E57D427F4E8001ull);
return p; // signed result, 14 fractional bits
}
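// Worked example (illustrative): an input of 0x8000 represents x = 0.5, so the
// polynomial returns ~1/(1.5) = 0.6667 in Q14, i.e. about 0.6667 * 16384 = 10923.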
// Find reciprocal of fp16.
// (1) first, convert to fp32, multiplying by 1.0; this is done to
// handle denormals. Ignoring sign and zero, result should be at
// least 5.9604645e-08 (32-bit code 0x33800000) and at most 131008 (0x47ffe000)
// (exponent in range [103,143])
// (2) extract the mantissa into 16-bit unsigned; find reciprocal using a fitted poly
// (3) put this, along with '253-exp' (exp from (1)), together to make a qf32
// (4) convert that to fp16
// (5) put sign back in. Also, if the original value (w/o sign) was < 0x101, replace
// the result with the max value.
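// Spot-check values (illustrative, standard fp16 encodings): 0x3C00 (1.0)
// maps to 0x3C00 (1.0), 0x4000 (2.0) to ~0x3800 (0.5), and anything below
// 0x0101 saturates to 0x7BFF per step (5).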
static inline HVX_Vector hvx_vec_inverse_fp16(HVX_Vector vals) {
HVX_Vector em_mask = Q6_Vh_vsplat_R(0x7FFF);
HVX_Vector avals = Q6_V_vand_VV(vals, em_mask);
HVX_VectorPred is_neg = Q6_Q_vcmp_gt_VhVh(avals, vals);
    // too small to take 1/x of? for 'standard' fp16 the threshold is 0x101
HVX_VectorPred is_small = Q6_Q_vcmp_gt_VhVh(Q6_Vh_vsplat_R(0x101), avals);
HVX_VectorPair to_qf32 = Q6_Wqf32_vmpy_VhfVhf(avals, Q6_Vh_vsplat_R(0x3C00)); // *1.0
HVX_Vector to_f32_0 = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(to_qf32));
HVX_Vector to_f32_1 = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(to_qf32));
// bits 22..13 contain the mantissa now (w/o hidden bit); move to bit 14..5 of a 16-bit vector
HVX_Vector mant_u16 = Q6_Vh_vshuffo_VhVh(Q6_Vw_vasl_VwR(to_f32_1, 9), Q6_Vw_vasl_VwR(to_f32_0, 9));
// likewise extract the upper 16 from each, containing the exponents in range 103..142
HVX_Vector exp_u16 = Q6_Vh_vshuffo_VhVh(to_f32_1, to_f32_0);
//Get exponent in IEEE 32-bit representation
exp_u16 = Q6_Vuh_vlsr_VuhR(exp_u16, 7);
// so, mant_u16 contains an unbiased mantissa in upper 10 bits of each u16 lane
// We can consider it to be x-1.0, with 16 fractional bits, where 'x' is in range [1.0,2.0)
// Use poly to transform to 1/x, with 14 fractional bits
//
HVX_Vector rm = hvx_vec_recip_xp1_O3_unsigned(mant_u16);
HVX_Vector vcl0 = Q6_Vuh_vcl0_Vuh(rm); //count leading zeros
    // Get mantissa for 16-bit representation
HVX_Vector mant_recip = Q6_V_vand_VV(Q6_Vh_vasr_VhR(Q6_Vh_vasl_VhVh(rm, vcl0), 5), Q6_Vh_vsplat_R(0x03FF));
//Compute Reciprocal Exponent
HVX_Vector exp_recip =
Q6_Vh_vsub_VhVh(Q6_Vh_vsub_VhVh(Q6_Vh_vsplat_R(254), exp_u16), Q6_Vh_vsub_VhVh(vcl0, Q6_Vh_vsplat_R(1)));
//Convert it for 16-bit representation
exp_recip = Q6_Vh_vadd_VhVh_sat(Q6_Vh_vsub_VhVh(exp_recip, Q6_Vh_vsplat_R(127)), Q6_Vh_vsplat_R(15));
exp_recip = Q6_Vh_vasl_VhR(exp_recip, 10);
//Merge exponent and mantissa for reciprocal
HVX_Vector recip = Q6_V_vor_VV(exp_recip, mant_recip);
// map 'small' inputs to standard largest value 0x7bff
recip = Q6_V_vmux_QVV(is_small, Q6_Vh_vsplat_R(0x7bff), recip);
// add sign back
recip = Q6_V_vandor_VQR(recip, is_neg, 0x80008000);
return recip;
}
#define IEEE_VSF_EXPLEN (8)
#define IEEE_VSF_EXPBIAS (127)
#define IEEE_VSF_EXPMASK (0xFF)
#define IEEE_VSF_MANTLEN (23)
#define IEEE_VSF_MANTMASK (0x7FFFFF)
#define IEEE_VSF_MIMPMASK (0x800000)
static inline HVX_Vector hvx_vec_truncate_fp32(HVX_Vector in_vec) {
HVX_Vector mask_mant_v = Q6_V_vsplat_R(IEEE_VSF_MANTMASK);
HVX_Vector mask_impl_v = Q6_V_vsplat_R(IEEE_VSF_MIMPMASK);
HVX_Vector const_zero_v = Q6_V_vzero();
HVX_VectorPred q_negative = Q6_Q_vcmp_gt_VwVw(const_zero_v, in_vec);
HVX_Vector expval_v = in_vec >> IEEE_VSF_MANTLEN;
expval_v &= IEEE_VSF_EXPMASK;
expval_v -= IEEE_VSF_EXPBIAS;
// negative exp == fractional value
HVX_VectorPred q_negexp = Q6_Q_vcmp_gt_VwVw(const_zero_v, expval_v);
HVX_Vector rshift_v = IEEE_VSF_MANTLEN - expval_v; // fractional bits - exp shift
HVX_Vector mant_v = in_vec & mask_mant_v; // obtain mantissa
HVX_Vector vout = Q6_Vw_vadd_VwVw(mant_v, mask_impl_v); // add implicit 1.0
vout = Q6_Vw_vasr_VwVw(vout, rshift_v); // shift to obtain truncated integer
vout = Q6_V_vmux_QVV(q_negexp, const_zero_v, vout); // expval<0 -> 0
HVX_Vector neg_vout = -vout;
vout = Q6_V_vmux_QVV(q_negative, neg_vout, vout); // handle negatives
return (vout);
}
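// Expected lane behavior (truncation toward zero, illustrative):
//   1.9f -> 1, -1.9f -> -1, 0.5f -> 0, -0.5f -> 0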
static inline HVX_Vector hvx_vec_floor_fp32(HVX_Vector in_vec) {
HVX_Vector mask_mant_v = Q6_V_vsplat_R(IEEE_VSF_MANTMASK);
HVX_Vector mask_impl_v = Q6_V_vsplat_R(IEEE_VSF_MIMPMASK);
HVX_Vector const_mnlen_v = Q6_V_vsplat_R(IEEE_VSF_MANTLEN);
HVX_Vector const_zero_v = Q6_V_vzero();
HVX_Vector const_negone_v = Q6_V_vsplat_R(0xbf800000); // -1 IEEE vsf
HVX_VectorPred q_negative = Q6_Q_vcmp_gt_VwVw(const_zero_v, in_vec);
HVX_Vector expval_v = in_vec >> IEEE_VSF_MANTLEN;
expval_v &= IEEE_VSF_EXPMASK;
expval_v -= IEEE_VSF_EXPBIAS;
HVX_VectorPred q_negexp = Q6_Q_vcmp_gt_VwVw(const_zero_v, expval_v);
HVX_VectorPred q_expltmn = Q6_Q_vcmp_gt_VwVw(const_mnlen_v, expval_v);
HVX_VectorPred q_negexp_pos = Q6_Q_vcmp_gtand_QVwVw(q_negexp, in_vec, const_zero_v);
HVX_VectorPred q_negexp_neg = Q6_Q_vcmp_gtand_QVwVw(q_negexp, const_zero_v, in_vec);
// if expval < 0 (q_negexp) // <0, floor is 0
// if vin > 0
// floor = 0
// if vin < 0
// floor = -1
// if expval < mant_len (q_expltmn) // >0, but fraction may exist
// get sign (q_negative)
// mask >> expval // fraction bits to mask off
// vout = ~(mask) // apply mask to remove fraction
// if (qneg) // negative floor is one less (more, sign bit for neg)
// vout += ((impl_mask) >> expval)
// if (mask && vin)
// vout = vin
// else // already an integer
// ; // no change
// compute floor
mask_mant_v >>= expval_v;
HVX_Vector neg_addin_v = mask_impl_v >> expval_v;
HVX_Vector vout_neg_addin = Q6_Vw_vadd_VwVw(in_vec, neg_addin_v);
HVX_Vector vout = Q6_V_vmux_QVV(q_negative, vout_neg_addin, in_vec);
HVX_Vector mask_chk_v = Q6_V_vand_VV(in_vec, mask_mant_v); // chk if bits set
HVX_VectorPred q_integral = Q6_Q_vcmp_eq_VwVw(const_zero_v, mask_chk_v);
HVX_Vector not_mask_v = Q6_V_vnot_V(mask_mant_v); // frac bits to clear
HVX_Vector vfrfloor_v = Q6_V_vand_VV(vout, not_mask_v); // clear frac bits
vout = in_vec;
vout = Q6_V_vmux_QVV(q_expltmn, vfrfloor_v, vout); // expval<mant
vout = Q6_V_vmux_QVV(q_integral, in_vec, vout); // integral values
vout = Q6_V_vmux_QVV(q_negexp_pos, const_zero_v, vout); // expval<0 x>0 -> 0
vout = Q6_V_vmux_QVV(q_negexp_neg, const_negone_v, vout); // expval<0 x<0 -> -1
return vout;
}
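// Expected lane behavior (floor semantics, illustrative):
//   1.9f -> 1.0f, -1.9f -> -2.0f, -0.25f -> -1.0f, 8.0f -> 8.0f (integral, unchanged)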
static inline HVX_Vector hvx_vec_i16_from_hf_rnd_sat(HVX_Vector vin) {
    // This looks complicated; ideally it would just be Q6_Vh_equals_Vhf(vin),
    // but that instruction does not do proper rounding.
// convert to qf32, multiplying by 1.0 in the process.
HVX_VectorPair v32 = Q6_Wqf32_vmpy_VhfVhf(vin, Q6_Vh_vsplat_R(0x3C00));
    // 'in-range' values are +/-32752.
// add 192K to it, convert to sf
HVX_Vector v192K = Q6_V_vsplat_R(0x48400000);
HVX_Vector vsf_0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_V_lo_W(v32), v192K));
HVX_Vector vsf_1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_V_hi_W(v32), v192K));
// for in-range cases, result is {163858... 229360} so the exponent is always 144.
// if we extract bits 21..0 as a signed quantity, and round 6 bits off, that will be the answer.
// Start by <<10 to get the final 'sign' bit in bit 15...
vsf_0 = Q6_Vw_vasl_VwR(vsf_0, 10);
vsf_1 = Q6_Vw_vasl_VwR(vsf_1, 10);
// now round down to 16
return Q6_Vh_vround_VwVw_sat(vsf_1, vsf_0);
}
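// Worked example (illustrative): for input 1.0hf, 1.0 + 196608.0 = 196609.0f,
// whose IEEE mantissa field is 0x400040; bits 21..0 are 0x40 = 1 << 6, and
// rounding 6 bits off recovers the integer 1.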
static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) {
HVX_Vector inv_aprox_sf = Q6_V_vsplat_R(0x7EEEEBB3);
HVX_Vector two_sf = hvx_vec_splat_fp32(2.0);
// First approximation
HVX_Vector i_sf = Q6_Vw_vsub_VwVw(inv_aprox_sf, v_sf);
HVX_Vector r_qf;
// Refine
r_qf = Q6_Vqf32_vmpy_VsfVsf(
i_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(i_sf, v_sf)))));
r_qf = Q6_Vqf32_vmpy_Vqf32Vqf32(
r_qf, Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(r_qf), v_sf))));
r_qf = Q6_Vqf32_vmpy_Vqf32Vqf32(
r_qf, Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(r_qf), v_sf))));
return Q6_Vsf_equals_Vqf32(r_qf);
}
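// Design note: 0x7EEEEBB3 acts as a magic-constant seed for 1/x (in the same
// spirit as the 0x5f3759df rsqrt trick below), and each refinement above is a
// Newton-Raphson step x' = x * (2 - v*x), which roughly doubles the number of
// correct bits; three steps suffice for fp32 precision.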
#define FAST_SIGMOID_LOG2F (0x3fb8aa3b) // 1.442695022
#define FAST_SIGMOID_C1 (0x3d009076) // 0.03138777
#define FAST_SIGMOID_C2 (0x3e8d74bd) // 0.276281267
#define FAST_SIGMOID_C3 (0x3f000000) // 0.5
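// Method sketch (one reading of the code below, stated as an assumption rather
// than a spec): with t = x * log2(e) / 2, the sigmoid is evaluated as
// sigmoid(x) = 2^t / (2^t + 2^-t); t is split into an integer part (folded into
// the result's exponent) and a fractional part handled by a short polynomial,
// with the final division done via hvx_vec_inverse_fp32() above.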
static inline HVX_Vector hvx_vec_fast_sigmoid_fp32(HVX_Vector v) {
v = Q6_Vqf32_vmpy_VsfVsf(v, Q6_V_vsplat_R(FAST_SIGMOID_LOG2F));
v = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v), Q6_V_vsplat_R(FAST_SIGMOID_C3));
HVX_Vector in_int = hvx_vec_truncate_fp32(Q6_Vsf_equals_Vqf32(v));
HVX_Vector x = Q6_Vqf32_vsub_Vqf32Vsf(v, Q6_Vsf_equals_Vw(in_int));
HVX_Vector xx = Q6_Vqf32_vmpy_Vqf32Vqf32(x, x);
HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(xx), Q6_V_vsplat_R(FAST_SIGMOID_C2));
v1 = Q6_Vqf32_vadd_Vqf32Vsf(v1, Q6_V_vsplat_R(FAST_SIGMOID_LOG2F));
HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(x), Q6_V_vsplat_R(FAST_SIGMOID_C1));
v2 = Q6_Vqf32_vmpy_Vqf32Vqf32(v2, xx);
v2 = Q6_Vqf32_vadd_Vqf32Vqf32(v2, x);
HVX_Vector v3 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vqf32(v2, v1));
HVX_Vector v3_exponent = Q6_Vw_vasl_VwR(v3, 1);
v3_exponent = Q6_Vuw_vlsr_VuwR(v3_exponent, 24);
v3_exponent = Q6_Vw_vadd_VwVw(in_int, v3_exponent);
v3 = Q6_Vw_vaslacc_VwVwR(v3, in_int, 24);
HVX_Vector v4 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_Vqf32Vqf32(v2, v1));
HVX_Vector v5 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(v3, v4));
HVX_Vector res = hvx_vec_inverse_fp32(v5);
res = Q6_Vqf32_vmpy_VsfVsf(v3, res);
return Q6_Vsf_equals_Vqf32(res);
}
#define EXP_COEFF_5 (0x39506967) // 0.000198757 = 1/(7!)
#define EXP_COEFF_4 (0x3AB743CE) // 0.0013982 = 1/(6!)
#define EXP_COEFF_3 (0x3C088908) // 0.00833345 = 1/(5!)
#define EXP_COEFF_2 (0x3D2AA9C1) // 0.0416658 = 1/(4!)
#define EXP_COEFF_1 (0x3E2AAAAA) // 0.16666667 = 1/(3!)
#define EXP_COEFF_0 (0x3F000000) // 0.5 = 1/(2!)
#define EXP_LOGN2 (0x3F317218) // ln(2) = 0.6931471805
#define EXP_LOG2E (0x3FB8AA3B) // log2(e) = 1/ln(2) = 1.4426950408
#define EXP_ONE (0x3f800000) // 1.0
#define EXP_RANGE_R (0x41a00000) // 20.0
#define EXP_RANGE_L (0xc1a00000) // -20.0
static inline HVX_Vector hvx_vec_exp_fp32(HVX_Vector in_vec) {
HVX_Vector z_qf32_v;
HVX_Vector x_v;
HVX_Vector x_qf32_v;
HVX_Vector y_v;
HVX_Vector k_v;
HVX_Vector f_v;
HVX_Vector epsilon_v;
HVX_Vector log2e = Q6_V_vsplat_R(EXP_LOG2E);
HVX_Vector logn2 = Q6_V_vsplat_R(EXP_LOGN2);
HVX_Vector E_const;
HVX_Vector zero_v = Q6_V_vzero();
// exp(x) is approximated as follows:
// f = floor(x/ln(2)) = floor(x*log2(e))
// epsilon = x - f*ln(2)
// exp(x) = exp(epsilon+f*ln(2))
// = exp(epsilon)*exp(f*ln(2))
// = exp(epsilon)*2^f
//
// Since epsilon is close to zero, it can be approximated with its Taylor series:
// exp(x) ~= 1+x+x^2/2!+x^3/3!+...+x^n/n!+...
// Preserving the first eight elements, we get:
// exp(x) ~= 1+x+e0*x^2+e1*x^3+e2*x^4+e3*x^5+e4*x^6+e5*x^7
// = 1+x+(E0+(E1+(E2+(E3+(E4+E5*x)*x)*x)*x)*x)*x^2
HVX_Vector temp_v = in_vec;
// Clamp inputs to (-20.0, 20.0)
HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, Q6_V_vsplat_R(EXP_RANGE_R));
HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(Q6_V_vsplat_R(EXP_RANGE_L), in_vec);
in_vec = Q6_V_vmux_QVV(pred_cap_right, Q6_V_vsplat_R(EXP_RANGE_R), temp_v);
    in_vec = Q6_V_vmux_QVV(pred_cap_left, Q6_V_vsplat_R(EXP_RANGE_L), in_vec); // chain from in_vec so both clamps apply
epsilon_v = Q6_Vqf32_vmpy_VsfVsf(log2e, in_vec);
epsilon_v = Q6_Vsf_equals_Vqf32(epsilon_v);
// f_v is the floating point result and k_v is the integer result
f_v = hvx_vec_floor_fp32(epsilon_v);
k_v = hvx_vec_truncate_fp32(f_v);
x_qf32_v = Q6_Vqf32_vadd_VsfVsf(in_vec, zero_v);
// x = x - f_v * logn2;
epsilon_v = Q6_Vqf32_vmpy_VsfVsf(f_v, logn2);
x_qf32_v = Q6_Vqf32_vsub_Vqf32Vqf32(x_qf32_v, epsilon_v);
// normalize before every QFloat's vmpy
x_qf32_v = Q6_Vqf32_vadd_Vqf32Vsf(x_qf32_v, zero_v);
// z = x * x;
z_qf32_v = Q6_Vqf32_vmpy_Vqf32Vqf32(x_qf32_v, x_qf32_v);
z_qf32_v = Q6_Vqf32_vadd_Vqf32Vsf(z_qf32_v, zero_v);
x_v = Q6_Vsf_equals_Vqf32(x_qf32_v);
// y = E4 + E5 * x;
E_const = Q6_V_vsplat_R(EXP_COEFF_5);
y_v = Q6_Vqf32_vmpy_VsfVsf(E_const, x_v);
E_const = Q6_V_vsplat_R(EXP_COEFF_4);
y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const);
y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v);
// y = E3 + y * x;
E_const = Q6_V_vsplat_R(EXP_COEFF_3);
y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v);
y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const);
y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v);
// y = E2 + y * x;
E_const = Q6_V_vsplat_R(EXP_COEFF_2);
y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v);
y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const);
y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v);
// y = E1 + y * x;
E_const = Q6_V_vsplat_R(EXP_COEFF_1);
y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v);
y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const);
y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v);
// y = E0 + y * x;
E_const = Q6_V_vsplat_R(EXP_COEFF_0);
y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v);
y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const);
y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v);
// y = x + y * z;
y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, z_qf32_v);
y_v = Q6_Vqf32_vadd_Vqf32Vqf32(y_v, x_qf32_v);
y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v);
// y = y + 1.0;
y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, Q6_V_vsplat_R(EXP_ONE));
// insert exponents
// y = ldexpf(y, k);
// y_v += k_v; // qf32
// modify exponent
y_v = Q6_Vsf_equals_Vqf32(y_v);
// add k_v to the exponent of y_v
HVX_Vector y_v_exponent = Q6_Vw_vasl_VwR(y_v, 1);
y_v_exponent = Q6_Vuw_vlsr_VuwR(y_v_exponent, IEEE_VSF_MANTLEN + 1);
y_v_exponent = Q6_Vw_vadd_VwVw(k_v, y_v_exponent);
// exponent cannot be negative; if overflow is detected, result is set to zero
HVX_VectorPred qy_v_negative_exponent = Q6_Q_vcmp_gt_VwVw(zero_v, y_v_exponent);
y_v = Q6_Vw_vaslacc_VwVwR(y_v, k_v, IEEE_VSF_MANTLEN);
y_v = Q6_V_vmux_QVV(qy_v_negative_exponent, zero_v, y_v);
return y_v;
}
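// Worked example (illustrative): x = 1.0 -> f = floor(1.4427) = 1, k = 1,
// epsilon = 1.0 - 1*ln(2) = 0.3069; the polynomial gives exp(0.3069) ~= 1.3591,
// and adding k to the exponent yields 1.3591 * 2^1 = 2.7183 ~= e.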
#define RSQRT_CONST 0x5f3759df // Constant for fast inverse square root calculation
#define RSQRT_ONE_HALF 0x3f000000 // 0.5
#define RSQRT_THREE_HALVES 0x3fc00000 // 1.5
static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) {
//Algorithm :
// x2 = input*0.5
// y = * (long *) &input
    // y = 0x5f3759df - (y>>1)
// y = y*(threehalfs - x2*y*y)
HVX_Vector rsqrtconst = Q6_V_vsplat_R(RSQRT_CONST);
HVX_Vector onehalf = Q6_V_vsplat_R(RSQRT_ONE_HALF);
HVX_Vector threehalfs = Q6_V_vsplat_R(RSQRT_THREE_HALVES);
HVX_Vector x2, y, ypower2, temp;
x2 = Q6_Vqf32_vmpy_VsfVsf(in_vec, onehalf);
x2 = Q6_Vqf32_vadd_Vqf32Vsf(x2, Q6_V_vzero());
y = Q6_Vw_vasr_VwR(in_vec, 1);
y = Q6_Vw_vsub_VwVw(rsqrtconst, y);
// 1st iteration
ypower2 = Q6_Vqf32_vmpy_VsfVsf(y, y);
ypower2 = Q6_Vqf32_vadd_Vqf32Vsf(ypower2, Q6_V_vzero());
temp = Q6_Vqf32_vmpy_Vqf32Vqf32(x2, ypower2);
temp = Q6_Vqf32_vsub_VsfVsf(threehalfs, Q6_Vsf_equals_Vqf32(temp));
temp = Q6_Vqf32_vmpy_VsfVsf(y, Q6_Vsf_equals_Vqf32(temp));
// 2nd iteration
y = Q6_Vqf32_vadd_Vqf32Vsf(temp, Q6_V_vzero());
ypower2 = Q6_Vqf32_vmpy_Vqf32Vqf32(y, y);
ypower2 = Q6_Vqf32_vadd_Vqf32Vsf(ypower2, Q6_V_vzero());
temp = Q6_Vqf32_vmpy_Vqf32Vqf32(x2, ypower2);
temp = Q6_Vqf32_vsub_VsfVsf(threehalfs, Q6_Vsf_equals_Vqf32(temp));
temp = Q6_Vqf32_vmpy_Vqf32Vqf32(y, temp);
// 3rd iteration
y = Q6_Vqf32_vadd_Vqf32Vsf(temp, Q6_V_vzero());
ypower2 = Q6_Vqf32_vmpy_Vqf32Vqf32(y, y);
ypower2 = Q6_Vqf32_vadd_Vqf32Vsf(ypower2, Q6_V_vzero());
temp = Q6_Vqf32_vmpy_Vqf32Vqf32(x2, ypower2);
temp = Q6_Vqf32_vsub_VsfVsf(threehalfs, Q6_Vsf_equals_Vqf32(temp));
temp = Q6_Vqf32_vmpy_Vqf32Vqf32(y, temp);
return Q6_Vsf_equals_Vqf32(temp);
}
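// Scalar reference of the same algorithm (Quake-style fast inverse square
// root with three Newton steps instead of one; a sketch, not part of the build):
//
//     int32_t i = *(int32_t *) &x;
//     i = 0x5f3759df - (i >> 1);
//     float y = *(float *) &i;
//     y = y * (1.5f - 0.5f * x * y * y); // repeat three times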
static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
int step_of_1 = num_elems >> 5;
int remaining = num_elems - step_of_1 * VLEN_FP32;
assert(remaining == 0);
const HVX_Vector * restrict v_src = (HVX_Vector *) src;
HVX_Vector * restrict v_dst = (HVX_Vector *) dst;
#pragma unroll(4)
for (int i = 0; i < step_of_1; i++) {
v_dst[i] = hvx_vec_fast_sigmoid_fp32(v_src[i]);
}
}
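// Usage sketch (illustrative): apply the sigmoid to a 4096-element fp32
// buffer. Per the assert above, num_elems must be a multiple of VLEN_FP32 (32)
// and both pointers must be HVX-vector aligned:
//
//     hvx_fast_sigmoid_f32(src_buf, dst_buf, 4096);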
float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems);
void hvx_mul_f32(const uint8_t * restrict src0,
const uint8_t * restrict src1,
uint8_t * restrict dst,
const int num_elems);
void hvx_mul_f32_opt(const uint8_t * restrict src0,
const uint8_t * restrict src1,
uint8_t * restrict dst,
const int num_elems);
void hvx_mul_mul_f32_opt(const uint8_t * restrict src0,
const uint8_t * restrict src1,
const uint8_t * restrict src2,
uint8_t * restrict dst,
const int num_elems);
void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems);
void hvx_add_f32(const uint8_t * restrict src0,
const uint8_t * restrict src1,
uint8_t * restrict dst,
const int num_elems);
void hvx_add_f32_opt(const uint8_t * restrict src0,
const uint8_t * restrict src1,
uint8_t * restrict dst,
const int num_elems);
void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems);
void hvx_sub_f32(const uint8_t * restrict src0,
const uint8_t * restrict src1,
uint8_t * restrict dst,
const int num_elems);
void hvx_sub_f32_opt(const uint8_t * restrict src0,
const uint8_t * restrict src1,
uint8_t * restrict dst,
const int num_elems);
void hvx_sub_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems);
void hvx_scale_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, const float scale);
void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems);
void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems);
void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, bool negate);
float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems);
float hvx_self_sum_f32(const uint8_t * restrict src, const int num_elems);
void hvx_min_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems);
void hvx_clamp_scalar_f32(const uint8_t * restrict src,
const float limit_left,
const float limit_right,
uint8_t * restrict dst,
const int num_elems);
#endif /* HVX_UTILS_H */

@ -0,0 +1,945 @@
#pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments"
#pragma clang diagnostic ignored "-Wunused-function"
#define FARF_ERROR 1
#define FARF_HIGH 1
#define FARF_MEDIUM 0
#define FARF_LOW 0
#include <AEEStdErr.h>
#include <dspqueue.h>
#include <HAP_compute_res.h>
#include <HAP_etm_config.h>
#include <HAP_farf.h>
#include <HAP_mem.h>
#include <HAP_perf.h>
#include <HAP_power.h>
#include <HAP_ps.h>
#include <qurt.h>
#include <qurt_thread.h>
#include <remote.h>
#include <string.h>
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-dma.h"
#include "htp-msg.h"
#include "htp-ops.h"
#include "ops-utils.h"
#include "worker-pool.h"
AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
struct htp_context * ctx;
int err = 0;
ctx = calloc(1, sizeof(*ctx));
if (ctx == NULL) {
return AEE_ENOMEMORY;
}
// Use the context structure as a handle
*handle = (remote_handle64) ctx;
// Enable FARF logs
HAP_setFARFRuntimeLoggingParams(0xffff, NULL, 0);
// Set client class
{
HAP_power_request_t request;
memset(&request, 0, sizeof(HAP_power_request_t));
request.type = HAP_power_set_apptype;
request.apptype = HAP_POWER_COMPUTE_CLIENT_CLASS;
if ((err = HAP_power_set((void *) ctx, &request)) != 0) {
return err;
}
}
{
HAP_power_request_t request;
memset(&request, 0, sizeof(request));
request.type = HAP_power_set_DCVS_v3;
request.dcvs_v3.set_dcvs_enable = TRUE;
request.dcvs_v3.dcvs_enable = TRUE;
request.dcvs_v3.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE;
request.dcvs_v3.set_bus_params = TRUE;
request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_MAX;
request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_MAX;
request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_MAX;
request.dcvs_v3.set_core_params = TRUE;
request.dcvs_v3.core_params.min_corner = HAP_DCVS_VCORNER_MAX;
request.dcvs_v3.core_params.max_corner = HAP_DCVS_VCORNER_MAX;
request.dcvs_v3.core_params.target_corner = HAP_DCVS_VCORNER_MAX;
request.dcvs_v3.set_sleep_disable = TRUE;
request.dcvs_v3.sleep_disable = TRUE;
if ((err = HAP_power_set((void *) ctx, &request)) != 0) {
return err;
}
memset(&request, 0, sizeof(request));
request.type = HAP_power_set_HVX;
request.hvx.power_up = TRUE;
if ((err = HAP_power_set((void *) ctx, &request)) != 0) {
return err;
}
}
{
// Power on HMX
HAP_power_request_t request;
memset(&request, 0, sizeof(HAP_power_request_t));
request.type = HAP_power_set_HMX;
request.hmx.power_up = TRUE;
FARF(ALWAYS, "Powering HMX on\n");
        err = HAP_power_set((void *) ctx, &request);
if (err != AEE_SUCCESS) {
FARF(ERROR, "Error powering on HMX.");
return err;
}
}
return AEE_SUCCESS;
}
AEEResult htp_iface_close(remote_handle64 handle) {
struct htp_context * ctx = (struct htp_context *) handle;
if (!ctx) {
return AEE_EBADPARM;
}
if (ctx->queue) {
FARF(ERROR, "Closing handle with queue still open");
return AEE_EITEMBUSY;
}
free(ctx);
return AEE_SUCCESS;
}
AEEResult htp_iface_enable_etm(remote_handle64 handle) {
int err = HAP_user_etm_enable();
if (err) {
if (err == AEE_EVERSIONNOTSUPPORT) {
FARF(ERROR, "API HAP_user_etm_enable is not supported\n");
} else {
FARF(ERROR, "Error executing HAP_user_etm_enable with error code : 0x%x\n", err);
}
}
return err;
}
AEEResult htp_iface_disable_etm(remote_handle64 handle) {
int err = HAP_user_etm_disable();
if (err) {
if (err == AEE_EVERSIONNOTSUPPORT) {
FARF(ERROR, "API HAP_user_etm_disable is not supported\n");
} else {
FARF(ERROR, "Error executing HAP_user_etm_disable with error code : 0x%x\n", err);
}
}
return err;
}
static int vtcm_acquire(struct htp_context * ctx) {
if (!ctx->vtcm_valid) {
// Temporarily bump thread priority to make sure it's higher than other sessions.
// This way the resource manager will notify the other thread to release VTCM.
        // Note that we need to re-acquire VTCM at normal priority for this to work next time.
qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio - 10);
HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
HAP_compute_res_release_cached(ctx->vtcm_rctx);
qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio);
HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
ctx->vtcm_valid = true;
}
ctx->vtcm_inuse = true;
return 0;
}
static int vtcm_release(struct htp_context * ctx) {
ctx->vtcm_inuse = false;
if (ctx->vtcm_valid && ctx->vtcm_needs_release) {
ctx->vtcm_valid = false;
ctx->vtcm_needs_release = false;
HAP_compute_res_release_cached(ctx->vtcm_rctx);
}
return 0;
}
static int vtcm_release_callback(unsigned int rctx, void * state) {
struct htp_context * ctx = (struct htp_context *) state;
if (!ctx || ctx->vtcm_rctx != rctx) {
return AEE_EBADPARM;
}
    // If VTCM is not in use (not processing Ops) release it right here,
    // otherwise flag it so vtcm_release() frees it once the current Op is done.
    if (ctx->vtcm_inuse) {
        ctx->vtcm_needs_release = true;
        return 0;
}
ctx->vtcm_valid = false;
HAP_compute_res_release_cached(ctx->vtcm_rctx);
return 0;
}
static int vtcm_alloc(struct htp_context * ctx) {
unsigned int vtcm_size = 8 * 1024 * 1024; // 8MB default
HAP_compute_res_query_VTCM(0, &vtcm_size, NULL, NULL, NULL);
compute_res_attr_t attr;
HAP_compute_res_attr_init(&attr);
HAP_compute_res_attr_set_serialize(&attr, 0);
HAP_compute_res_attr_set_cache_mode(&attr, 1);
HAP_compute_res_attr_set_vtcm_param_v2(&attr, vtcm_size, vtcm_size, vtcm_size);
HAP_compute_res_attr_set_release_callback(&attr, vtcm_release_callback, (void *) ctx);
HAP_compute_res_attr_set_hmx_param(&attr, 1);
// Allocate VTCM for scratch pads
uint32_t rctx = HAP_compute_res_acquire(&attr, 1000000 /* timeout */);
if (!rctx) {
FARF(ERROR, "failed to allocate %zu bytes VTCM\n", ctx->vtcm_size);
return AEE_ENOMEMORY;
}
void * vtcm_ptr;
if (HAP_compute_res_attr_get_vtcm_ptr_v2(&attr, &vtcm_ptr, &vtcm_size) != 0) {
HAP_compute_res_release(rctx);
FARF(ERROR, "failed to allocate %zu bytes VTCM (new)\n", ctx->vtcm_size);
return AEE_ENOMEMORY;
}
ctx->vtcm_base = (uint8_t *) vtcm_ptr;
ctx->vtcm_size = vtcm_size;
ctx->vtcm_rctx = rctx;
ctx->vtcm_valid = false;
ctx->vtcm_inuse = false;
ctx->vtcm_needs_release = false;
return 0;
}
static void vtcm_free(struct htp_context * ctx) {
if (ctx->vtcm_rctx) {
HAP_compute_res_release(ctx->vtcm_rctx);
ctx->vtcm_base = 0;
ctx->vtcm_rctx = 0;
}
}
static void htp_packet_callback(dspqueue_t queue, int error, void * context);
static void htp_error_callback(dspqueue_t queue, int error, void * context);
AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_queue_id, uint32 n_hvx) {
struct htp_context * ctx = (struct htp_context *) handle;
if (!ctx) {
return AEE_EBADPARM;
}
if (ctx->queue) {
FARF(ERROR, "Queue already open");
return AEE_EITEMBUSY;
}
// Import queue created on the CPU
int err = dspqueue_import(dsp_queue_id, // Queue ID from dspqueue_export
htp_packet_callback, // Packet callback
htp_error_callback, // Error callback; no errors expected on the DSP
(void *) ctx, // Callback context
&ctx->queue);
if (err) {
FARF(ERROR, "Queue import failed with 0x%08x", (unsigned) err);
return err;
}
ctx->thread_id = qurt_thread_get_id();
ctx->thread_prio = qurt_thread_get_priority(ctx->thread_id);
// allocate VTCM
err = vtcm_alloc(ctx);
if (err != AEE_SUCCESS) {
FARF(ERROR, "Unable to allocate VTCM");
return AEE_ENOMEMORY;
}
qurt_sysenv_max_hthreads_t hw_threads;
qurt_sysenv_get_max_hw_threads(&hw_threads);
uint32_t hw_nhvx = (qurt_hvx_get_units() >> 8) & 0xFF;
if (n_hvx == 0) {
n_hvx = hw_nhvx;
}
if (n_hvx > hw_threads.max_hthreads) {
n_hvx = hw_threads.max_hthreads;
}
if (n_hvx > HTP_MAX_NTHREADS) {
n_hvx = HTP_MAX_NTHREADS;
}
ctx->n_threads = n_hvx;
for (int i = 0; i < ctx->n_threads; i++) {
ctx->dma[i] = dma_queue_create(HTP_SPAD_SRC0_NROWS * 2);
}
// init worker pool
err = worker_pool_init(&ctx->worker_pool, n_hvx);
if (err != AEE_SUCCESS) {
FARF(ERROR, "Unable to create worker pool");
return err;
}
FARF(HIGH, "session %u started: n-hvx %u vtcm-size %zu vtcm-rctx %u n-threads %u thread-id %d thread-prio %d \n",
sess_id, hw_nhvx, ctx->vtcm_size, ctx->vtcm_rctx, ctx->n_threads, ctx->thread_id, ctx->thread_prio);
return AEE_SUCCESS;
}
AEEResult htp_iface_stop(remote_handle64 handle) {
struct htp_context * ctx = (struct htp_context *) handle;
if (!ctx) {
return AEE_EBADPARM;
}
if (!ctx->queue) {
FARF(ERROR, "Queue not open");
return AEE_EBADSTATE;
}
// Close queue. dspqueue_close() will also wait for callbacks to finish.
int err = dspqueue_close(ctx->queue);
ctx->queue = NULL;
if (err != 0) {
FARF(ERROR, "Queue close failed with 0x%08x", (unsigned) err);
return err;
}
if (ctx->worker_pool) {
// Release worker pool
worker_pool_release(&ctx->worker_pool);
}
for (int i = 0; i < ctx->n_threads; i++) {
dma_queue_delete(ctx->dma[i]);
}
vtcm_free(ctx);
return AEE_SUCCESS;
}
static void htp_error_callback(dspqueue_t queue, int error, void * context) {
// No errors expected on the DSP.
FARF(ERROR, "Error callback: 0x%08x", (unsigned) error);
}
struct profile_data {
uint64_t usecs;
uint64_t cycles;
uint64_t pkts;
};
static inline void profile_start(struct profile_data * d) {
d->usecs = HAP_perf_get_qtimer_count();
d->cycles = htp_get_cycles();
d->pkts = htp_get_pktcnt();
}
static inline void profile_stop(struct profile_data * d) {
d->usecs = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs);
d->cycles = htp_get_cycles() - d->cycles;
d->pkts = htp_get_pktcnt() - d->pkts;
}
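// Typical usage (as in the request handlers below):
//
//     struct profile_data prof;
//     profile_start(&prof);
//     ... run the op ...
//     profile_stop(&prof); // prof now holds elapsed usecs/cycles/packets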
static int send_htp_rsp(struct htp_context * c,
uint32_t op,
uint32_t status,
struct dspqueue_buffer * bufs,
size_t n_bufs,
struct profile_data * prof) {
// Prep response struct
struct htp_general_rsp rsp;
rsp.op = op;
rsp.status = status;
rsp.prof_usecs = prof->usecs;
rsp.prof_cycles = prof->cycles;
rsp.prof_pkts = prof->pkts;
int err = dspqueue_write(c->queue,
0, // Flags
n_bufs,
bufs, // Buffer references
sizeof(rsp),
(const uint8_t *) &rsp, // Message
DSPQUEUE_TIMEOUT_NONE);
if (err != 0) {
FARF(ERROR, "dspqueue_write failed: 0x%08x", (unsigned) err);
}
return err;
}
static void proc_matmul_req(struct htp_context * ctx,
struct htp_general_req * req,
struct dspqueue_buffer * bufs,
size_t n_bufs) {
// Prep response buffer structs (needed for error responses, etc)
struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
memset(rsp_bufs, 0, sizeof(rsp_bufs));
rsp_bufs[0].fd = bufs[0].fd;
rsp_bufs[0].ptr = bufs[0].ptr;
rsp_bufs[0].size = bufs[0].size;
rsp_bufs[0].offset = bufs[0].offset;
rsp_bufs[0].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
rsp_bufs[1].fd = bufs[1].fd;
rsp_bufs[1].ptr = bufs[1].ptr;
rsp_bufs[1].size = bufs[1].size;
rsp_bufs[1].offset = bufs[1].offset;
rsp_bufs[1].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
    // We wrote to the output buffer, so we also need to flush it
rsp_bufs[2].fd = bufs[2].fd;
rsp_bufs[2].ptr = bufs[2].ptr;
rsp_bufs[2].size = bufs[2].size;
rsp_bufs[2].offset = bufs[2].offset;
rsp_bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_DEREF | // Release reference
DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush NSP
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
// Setup Op context
struct htp_ops_context octx = { 0 };
octx.ctx = ctx;
octx.src0 = req->src0;
octx.src1 = req->src1;
octx.dst = req->dst;
octx.flags = req->flags;
octx.op = req->op;
// Update data pointers
octx.src0.data = (uint32_t) bufs[0].ptr;
octx.src1.data = (uint32_t) bufs[1].ptr;
octx.dst.data = (uint32_t) bufs[2].ptr;
octx.n_threads = ctx->n_threads;
struct profile_data prof;
profile_start(&prof);
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
rsp_status = op_matmul(&octx);
vtcm_release(ctx);
}
profile_stop(&prof);
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 3, &prof);
}
static void proc_matmul_id_req(struct htp_context * ctx,
struct htp_general_req * req,
struct dspqueue_buffer * bufs,
size_t n_bufs) {
// Prep response buffer structs (needed for error responses, etc)
struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
memset(rsp_bufs, 0, sizeof(rsp_bufs));
rsp_bufs[0].fd = bufs[0].fd;
rsp_bufs[0].ptr = bufs[0].ptr;
rsp_bufs[0].size = bufs[0].size;
rsp_bufs[0].offset = bufs[0].offset;
rsp_bufs[0].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
rsp_bufs[1].fd = bufs[1].fd;
rsp_bufs[1].ptr = bufs[1].ptr;
rsp_bufs[1].size = bufs[1].size;
rsp_bufs[1].offset = bufs[1].offset;
rsp_bufs[1].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
rsp_bufs[2].fd = bufs[2].fd;
rsp_bufs[2].ptr = bufs[2].ptr;
rsp_bufs[2].size = bufs[2].size;
rsp_bufs[2].offset = bufs[2].offset;
rsp_bufs[2].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
    // We wrote to the output buffer, so we also need to flush it
rsp_bufs[3].fd = bufs[3].fd;
rsp_bufs[3].ptr = bufs[3].ptr;
rsp_bufs[3].size = bufs[3].size;
rsp_bufs[3].offset = bufs[3].offset;
rsp_bufs[3].flags = (DSPQUEUE_BUFFER_FLAG_DEREF | // Release reference
DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush NSP
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
// Setup Op context
struct htp_ops_context octx = { 0 };
octx.ctx = ctx;
octx.src0 = req->src0;
octx.src1 = req->src1;
octx.src2 = req->src2;
octx.dst = req->dst;
octx.flags = req->flags;
octx.op = req->op;
// Update data pointers
octx.src0.data = (uint32_t) bufs[0].ptr;
octx.src1.data = (uint32_t) bufs[1].ptr;
octx.src2.data = (uint32_t) bufs[2].ptr;
octx.dst.data = (uint32_t) bufs[3].ptr;
octx.n_threads = ctx->n_threads;
struct profile_data prof;
profile_start(&prof);
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
rsp_status = op_matmul_id(&octx);
vtcm_release(ctx);
}
profile_stop(&prof);
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 4, &prof);
}
static void proc_binary_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
memset(rsp_bufs, 0, sizeof(rsp_bufs));
rsp_bufs[0].fd = bufs[0].fd;
rsp_bufs[0].ptr = bufs[0].ptr;
rsp_bufs[0].offset = bufs[0].offset;
rsp_bufs[0].size = bufs[0].size;
rsp_bufs[0].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
rsp_bufs[1].fd = bufs[1].fd;
rsp_bufs[1].ptr = bufs[1].ptr;
rsp_bufs[1].offset = bufs[1].offset;
rsp_bufs[1].size = bufs[1].size;
rsp_bufs[1].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
    // We wrote to the output buffer, so we also need to flush it
rsp_bufs[2].fd = bufs[2].fd;
rsp_bufs[2].ptr = bufs[2].ptr;
rsp_bufs[2].offset = bufs[2].offset;
rsp_bufs[2].size = bufs[2].size;
rsp_bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_DEREF | // Release reference
DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush NSP
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
// Setup Op context
struct htp_ops_context octx = { 0 };
octx.ctx = ctx;
octx.src0 = req->src0;
octx.src1 = req->src1;
octx.dst = req->dst;
octx.flags = req->flags;
octx.op = req->op;
// Update data pointers
octx.src0.data = (uint32_t) bufs[0].ptr;
octx.src1.data = (uint32_t) bufs[1].ptr;
octx.dst.data = (uint32_t) bufs[2].ptr;
octx.n_threads = ctx->n_threads;
struct profile_data prof;
profile_start(&prof);
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
rsp_status = op_binary(&octx);
vtcm_release(ctx);
}
profile_stop(&prof);
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 3, &prof);
}
static void proc_add_id_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
memset(rsp_bufs, 0, sizeof(rsp_bufs));
rsp_bufs[0].fd = bufs[0].fd;
rsp_bufs[0].ptr = bufs[0].ptr;
rsp_bufs[0].offset = bufs[0].offset;
rsp_bufs[0].size = bufs[0].size;
rsp_bufs[0].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
rsp_bufs[1].fd = bufs[1].fd;
rsp_bufs[1].ptr = bufs[1].ptr;
rsp_bufs[1].offset = bufs[1].offset;
rsp_bufs[1].size = bufs[1].size;
rsp_bufs[1].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
rsp_bufs[2].fd = bufs[2].fd;
rsp_bufs[2].ptr = bufs[2].ptr;
rsp_bufs[2].offset = bufs[2].offset;
rsp_bufs[2].size = bufs[2].size;
rsp_bufs[2].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
    // We wrote to the output buffer, so we also need to flush it
rsp_bufs[3].fd = bufs[3].fd;
rsp_bufs[3].ptr = bufs[3].ptr;
rsp_bufs[3].offset = bufs[3].offset;
rsp_bufs[3].size = bufs[3].size;
rsp_bufs[3].flags = (DSPQUEUE_BUFFER_FLAG_DEREF | // Release reference
DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush NSP
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
// Setup Op context
struct htp_ops_context octx = { 0 };
octx.ctx = ctx;
octx.src0 = req->src0;
octx.src1 = req->src1;
octx.src2 = req->src2;
octx.dst = req->dst;
octx.flags = req->flags;
octx.op = req->op;
// Update data pointers
octx.src0.data = (uint32_t) bufs[0].ptr;
octx.src1.data = (uint32_t) bufs[1].ptr;
octx.src2.data = (uint32_t) bufs[2].ptr;
octx.dst.data = (uint32_t) bufs[3].ptr;
octx.n_threads = ctx->n_threads;
struct profile_data prof;
profile_start(&prof);
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
rsp_status = op_binary(&octx);
vtcm_release(ctx);
}
profile_stop(&prof);
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 4, &prof);
}
static void proc_unary_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
memset(rsp_bufs, 0, sizeof(rsp_bufs));
rsp_bufs[0].fd = bufs[0].fd;
rsp_bufs[0].ptr = bufs[0].ptr;
rsp_bufs[0].offset = bufs[0].offset;
rsp_bufs[0].size = bufs[0].size;
rsp_bufs[0].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
    // We wrote to the output buffer, so we also need to flush it
rsp_bufs[1].fd = bufs[1].fd;
rsp_bufs[1].ptr = bufs[1].ptr;
rsp_bufs[1].offset = bufs[1].offset;
rsp_bufs[1].size = bufs[1].size;
rsp_bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_DEREF | // Release reference
DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush NSP
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
// Setup Op context
struct htp_ops_context octx = { 0 };
octx.ctx = ctx;
octx.src0 = req->src0;
octx.dst = req->dst;
octx.flags = req->flags;
octx.op = req->op;
memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
// Update data pointers
octx.src0.data = (uint32_t) bufs[0].ptr;
octx.dst.data = (uint32_t) bufs[1].ptr;
octx.n_threads = ctx->n_threads;
struct profile_data prof;
profile_start(&prof);
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
rsp_status = op_unary(&octx);
vtcm_release(ctx);
}
profile_stop(&prof);
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 2, &prof);
}
static void proc_activations_req(struct htp_context * ctx,
struct htp_general_req * req,
struct dspqueue_buffer * bufs,
uint32_t n_bufs) {
struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
memset(rsp_bufs, 0, sizeof(rsp_bufs));
rsp_bufs[0].fd = bufs[0].fd;
rsp_bufs[0].ptr = bufs[0].ptr;
rsp_bufs[0].offset = bufs[0].offset;
rsp_bufs[0].size = bufs[0].size;
rsp_bufs[0].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
int write_idx = 1;
if (3 == n_bufs) {
rsp_bufs[1].fd = bufs[1].fd;
rsp_bufs[1].ptr = bufs[1].ptr;
rsp_bufs[1].offset = bufs[1].offset;
rsp_bufs[1].size = bufs[1].size;
rsp_bufs[1].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
write_idx = 2;
}
    // We wrote to the output buffer, so we also need to flush it
rsp_bufs[write_idx].fd = bufs[write_idx].fd;
rsp_bufs[write_idx].ptr = bufs[write_idx].ptr;
rsp_bufs[write_idx].offset = bufs[write_idx].offset;
rsp_bufs[write_idx].size = bufs[write_idx].size;
rsp_bufs[write_idx].flags = (DSPQUEUE_BUFFER_FLAG_DEREF | // Release reference
DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush NSP
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
// Setup Op context
struct htp_ops_context octx = { 0 };
octx.ctx = ctx;
octx.src0 = req->src0;
if (3 == n_bufs) {
octx.src1 = req->src1;
}
octx.dst = req->dst;
octx.flags = req->flags;
octx.op = req->op;
memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
// Update data pointers
octx.src0.data = (uint32_t) bufs[0].ptr;
if (3 == n_bufs) {
octx.src1.data = (uint32_t) bufs[1].ptr;
octx.dst.data = (uint32_t) bufs[2].ptr;
} else {
octx.dst.data = (uint32_t) bufs[1].ptr;
}
octx.n_threads = ctx->n_threads;
struct profile_data prof;
profile_start(&prof);
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
if (octx.op == HTP_OP_SOFTMAX) {
rsp_status = op_softmax(&octx);
} else {
rsp_status = op_activations(&octx);
}
vtcm_release(ctx);
}
profile_stop(&prof);
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, n_bufs, &prof);
}
static void proc_rope_req(struct htp_context * ctx,
struct htp_general_req * req,
struct dspqueue_buffer * bufs,
uint32_t n_bufs) {
struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
memset(rsp_bufs, 0, sizeof(rsp_bufs));
rsp_bufs[0].fd = bufs[0].fd;
rsp_bufs[0].ptr = bufs[0].ptr;
rsp_bufs[0].offset = bufs[0].offset;
rsp_bufs[0].size = bufs[0].size;
rsp_bufs[0].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
rsp_bufs[1].fd = bufs[1].fd;
rsp_bufs[1].ptr = bufs[1].ptr;
rsp_bufs[1].offset = bufs[1].offset;
rsp_bufs[1].size = bufs[1].size;
rsp_bufs[1].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
int write_idx = 2;
if (4 == n_bufs) {
rsp_bufs[write_idx].fd = bufs[write_idx].fd;
rsp_bufs[write_idx].ptr = bufs[write_idx].ptr;
rsp_bufs[write_idx].offset = bufs[write_idx].offset;
rsp_bufs[write_idx].size = bufs[write_idx].size;
rsp_bufs[write_idx].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
write_idx++;
}
    // We wrote to the output buffer, so we also need to flush it
rsp_bufs[write_idx].fd = bufs[write_idx].fd;
rsp_bufs[write_idx].ptr = bufs[write_idx].ptr;
rsp_bufs[write_idx].offset = bufs[write_idx].offset;
rsp_bufs[write_idx].size = bufs[write_idx].size;
rsp_bufs[write_idx].flags = (DSPQUEUE_BUFFER_FLAG_DEREF | // Release reference
DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush NSP
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
// Setup Op context
struct htp_ops_context octx = { 0 };
octx.ctx = ctx;
octx.src0 = req->src0;
octx.src1 = req->src1;
if (4 == n_bufs) {
octx.src2 = req->src2;
}
octx.dst = req->dst;
octx.flags = req->flags;
octx.op = req->op;
memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));
// Update data pointers
octx.src0.data = (uint32_t) bufs[0].ptr;
octx.src1.data = (uint32_t) bufs[1].ptr;
if (4 == n_bufs) {
octx.src2.data = (uint32_t) bufs[2].ptr;
octx.dst.data = (uint32_t) bufs[3].ptr;
} else {
octx.dst.data = (uint32_t) bufs[2].ptr;
}
octx.n_threads = ctx->n_threads;
struct profile_data prof;
profile_start(&prof);
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
rsp_status = op_rope(&octx);
vtcm_release(ctx);
}
profile_stop(&prof);
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, n_bufs, &prof);
}
static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
struct htp_context * ctx = (struct htp_context *) context;
// Repeatedly read packets from the queue until it's empty. We don't
// necessarily get a separate callback for each packet, and new packets
// may arrive while we're processing the previous one. This ensures we
// keep the DSP busy as much as possible and avoid waiting for the CPU.
while (1) {
struct htp_general_req req;
uint32_t req_size;
struct dspqueue_buffer bufs[HTP_MAX_PACKET_BUFFERS];
uint32_t n_bufs;
uint32_t flags;
// Read packet from queue
int err = dspqueue_read_noblock(queue, &flags,
HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references
&n_bufs, // Number of buffer references
bufs, // Buffer references
sizeof(req), // Max message length
&req_size, // Message length
(uint8_t *) &req); // Message
if (err == AEE_EWOULDBLOCK) {
// Consumed all packets available for now
return;
}
if (err != 0) {
FARF(ERROR, "dspqueue_read_noblock failed: 0x%08x", (unsigned) err);
return;
}
if (req_size != sizeof(req)) {
FARF(ERROR, "Invalid request size");
continue;
}
if (req.flags & HTP_OPFLAGS_EARLY_WAKEUP) {
// Host wants early notification
dspqueue_write_early_wakeup_noblock(ctx->queue, 10, 0);
}
// Process packet based on its message type
switch (req.op) {
case HTP_OP_MUL_MAT:
if (n_bufs != 3) {
FARF(ERROR, "Bad matmul-req buffer list");
continue;
}
proc_matmul_req(ctx, &req, bufs, n_bufs);
break;
case HTP_OP_MUL_MAT_ID:
if (n_bufs != 4) {
FARF(ERROR, "Bad matmul-id-req buffer list");
continue;
}
proc_matmul_id_req(ctx, &req, bufs, n_bufs);
break;
case HTP_OP_MUL:
case HTP_OP_ADD:
case HTP_OP_SUB:
if (n_bufs != 3) {
FARF(ERROR, "Bad binary-req buffer list");
continue;
}
proc_binary_req(ctx, &req, bufs);
break;
case HTP_OP_RMS_NORM:
if (n_bufs != 2) {
FARF(ERROR, "Bad unary-req buffer list");
continue;
}
proc_unary_req(ctx, &req, bufs);
break;
case HTP_OP_UNARY_SILU:
if (n_bufs != 2) {
FARF(ERROR, "Bad act-req buffer list");
continue;
}
proc_activations_req(ctx, &req, bufs, n_bufs);
break;
case HTP_OP_GLU_SWIGLU:
case HTP_OP_SOFTMAX:
if ((n_bufs != 2) && (n_bufs != 3)) {
FARF(ERROR, "Bad act-req buffer list");
continue;
}
proc_activations_req(ctx, &req, bufs, n_bufs);
break;
case HTP_OP_ADD_ID:
if (n_bufs != 4) {
FARF(ERROR, "Bad add-id-req buffer list");
continue;
}
proc_add_id_req(ctx, &req, bufs);
break;
case HTP_OP_ROPE:
if ((n_bufs != 3) && (n_bufs != 4)) {
FARF(ERROR, "Bad rope-req buffer list");
continue;
}
proc_rope_req(ctx, &req, bufs, n_bufs);
break;
default:
FARF(ERROR, "Unknown Op %u", req.op);
break;
}
}
}

File diff suppressed because it is too large

@ -0,0 +1,116 @@
#ifndef OPS_UTILS_H
#define OPS_UTILS_H
#include "htp-msg.h"
#ifndef MAX
# define MAX(a, b) ((a) > (b) ? (a) : (b))
#endif
#ifndef MIN
# define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif
static inline uint64_t htp_get_cycles() {
uint64_t cycles = 0;
asm volatile(" %0 = c15:14\n" : "=r"(cycles));
return cycles;
}
static inline uint64_t htp_get_pktcnt() {
uint64_t pktcnt;
asm volatile(" %0 = c19:18\n" : "=r"(pktcnt));
return pktcnt;
}
static inline int32_t htp_is_aligned(void * addr, uint32_t align) {
return ((size_t) addr & (align - 1)) == 0;
}
static inline uint32_t htp_round_up(uint32_t n, uint32_t m) {
return m * ((n + m - 1) / m);
}
static inline void htp_l2fetch(const void * p, uint32_t height, uint32_t width, uint32_t stride) {
const uint64_t control = Q6_P_combine_RR(stride, Q6_R_combine_RlRl(width, height));
asm volatile(" l2fetch(%0,%1) " : : "r"(p), "r"(control));
}
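// Usage sketch (illustrative, assuming a byte pointer src and a row_size in
// bytes small enough for the 16-bit width field): prefetch the next four rows
// ahead of a row-wise loop. Per the parameter order above, height is the row
// count and width/stride are in bytes:
//
//     htp_l2fetch(src + next_row * row_size, 4, row_size, row_size);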
static inline int32_t htp_is_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) {
uint32_t left_off = (size_t) addr & (chunk_size - 1);
uint32_t right_off = left_off + n;
return right_off <= chunk_size;
}
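// Example (illustrative): with chunk_size = 256 KB, a 4 KB span starting at
// offset 255 KB crosses a chunk boundary (255 KB + 4 KB > 256 KB) and returns 0.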
static inline void htp_dump_int8_line(char * pref, const int8_t * x, int n) {
char str[1024], *p = str;
p += sprintf(p, "%s: ", pref);
for (int i = 0; i < 16; i++) {
p += sprintf(p, "%d, ", x[i]);
}
FARF(HIGH, "%s\n", str);
}
static inline void htp_dump_uint8_line(char * pref, const uint8_t * x, uint32_t n) {
char str[1024], *p = str;
p += sprintf(p, "%s: ", pref);
for (int i = 0; i < n; i++) {
p += sprintf(p, "%d, ", x[i]);
}
FARF(HIGH, "%s\n", str);
}
static inline void htp_dump_int32_line(char * pref, const int32_t * x, uint32_t n) {
char str[1024], *p = str;
p += sprintf(p, "%s: ", pref);
for (int i = 0; i < n; i++) {
p += sprintf(p, "%d, ", (int) x[i]);
}
FARF(HIGH, "%s\n", str);
}
static inline void htp_dump_fp16_line(char * pref, const __fp16 * x, uint32_t n) {
char str[1024], *p = str;
p += sprintf(p, "%s: ", pref);
for (int i = 0; i < n; i++) {
p += sprintf(p, "%.6f, ", (float) x[i]);
}
FARF(HIGH, "%s\n", str);
}
static inline void htp_dump_fp32_line(char * pref, const float * x, uint32_t n) {
char str[1024], *p = str;
p += sprintf(p, "%s: ", pref);
for (int i = 0; i < n; i++) {
p += sprintf(p, "%.6f, ", x[i]);
}
FARF(HIGH, "%s\n", str);
}
static inline void htp_dump_f32(char * pref, const float * x, uint32_t n) {
uint32_t n0 = n / 16;
uint32_t n1 = n % 16;
uint32_t i = 0;
for (; i < n0; i++) {
htp_dump_fp32_line(pref, x + (16 * i), 16);
}
if (n1) {
htp_dump_fp32_line(pref, x + (16 * i), n1);
}
}
static inline void htp_dump_f16(char * pref, const __fp16 * x, uint32_t n) {
uint32_t n0 = n / 16;
uint32_t n1 = n % 16;
uint32_t i = 0;
for (; i < n0; i++) {
htp_dump_fp16_line(pref, x + (16 * i), 16);
}
if (n1) {
htp_dump_fp16_line(pref, x + (16 * i), n1);
}
}
#endif /* OPS_UTILS_H */

@ -0,0 +1,418 @@
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wunused-function"
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
#ifdef HTP_DEBUG
# define FARF_HIGH 1
#endif
#include <HAP_farf.h>
#include <HAP_mem.h>
#include <HAP_perf.h>
#include <HAP_ps.h>
#include <hexagon_protos.h>
#include <hexagon_types.h>
#include <math.h>
#include <qurt_thread.h>
#include <string.h>
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-dma.h"
#include "htp-msg.h"
#include "htp-ops.h"
#include "hvx-utils.h"
#include "ops-utils.h"
#define htp_rope_preamble \
const uint32_t ne00 = src0->ne[0]; \
const uint32_t ne01 = src0->ne[1]; \
const uint32_t ne02 = src0->ne[2]; \
const uint32_t ne03 = src0->ne[3]; \
\
const uint32_t ne0 = dst->ne[0]; \
const uint32_t ne1 = dst->ne[1]; \
const uint32_t ne2 = dst->ne[2]; \
const uint32_t ne3 = dst->ne[3]; \
\
const uint32_t nb00 = src0->nb[0]; \
const uint32_t nb01 = src0->nb[1]; \
const uint32_t nb02 = src0->nb[2]; \
const uint32_t nb03 = src0->nb[3]; \
\
const uint32_t nb0 = dst->nb[0]; \
const uint32_t nb1 = dst->nb[1]; \
const uint32_t nb2 = dst->nb[2]; \
const uint32_t nb3 = dst->nb[3];
struct rope_th_ctx {
int32_t n_dims;
int32_t mode;
int32_t n_ctx_orig;
int32_t sections[4];
float freq_base;
float freq_scale;
float ext_factor;
float attn_factor;
float beta_fast;
float beta_slow;
float theta_scale;
float corr_dims[2];
struct htp_ops_context * octx;
};
static float rope_yarn_ramp(const float low, const float high, const int i0) {
const float y = (i0 / 2 - low) / MAX(0.001f, high - low);
return (1 - MIN(1, MAX(0, y)));
}
static void rope_cache_init(const float theta_base,
float freq_scale,
const float * freq_factors,
float * corr_dims,
uint32_t ne0,
float ext_factor,
float mscale,
float * cache,
float theta_scale) {
// ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
float theta = theta_base;
for (uint32_t i0 = 0; i0 < ne0; i0 += 2) {
const float ff = freq_factors ? freq_factors[i0 / 2] : 1.0f;
float theta_extrap = theta / ff;
// Get n-d rotational scaling corrected for extrapolation
float theta_interp = freq_scale * theta_extrap;
float theta2 = theta_interp;
if (ext_factor != 0.0f) {
float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
theta2 = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
// Get n-d magnitude scaling corrected for interpolation
mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
}
cache[i0 + 0] = cosf(theta2) * mscale;
cache[i0 + 1] = sinf(theta2) * mscale;
theta *= theta_scale;
}
}
#ifndef M_PI
# define M_PI 3.1415926535897932384626433
#endif
static void rope_corr_dims(int n_dims,
int n_ctx_orig,
float freq_base,
float beta_fast,
float beta_slow,
float * dims) {
float start = floorf(n_dims * logf(n_ctx_orig / (beta_fast * 2 * (float) M_PI)) / (2 * logf(freq_base)));
float end = ceilf(n_dims * logf(n_ctx_orig / (beta_slow * 2 * (float) M_PI)) / (2 * logf(freq_base)));
dims[0] = MAX(0, start);
dims[1] = MIN(n_dims - 1, end);
}
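// Note on corr_dims[]: it brackets the rotary lanes where YaRN blends the two
// theta scalings. Per rope_yarn_ramp() above, lanes below corr_dims[0] take the
// extrapolated theta with weight ext_factor, and lanes above corr_dims[1] fall
// back to the purely interpolated theta.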
static void init_rope_ctx(struct rope_th_ctx * rope_ctx, struct htp_ops_context * octx) {
memset(rope_ctx, 0, sizeof(struct rope_th_ctx));
const int32_t * op_params = &octx->op_params[0];
rope_ctx->n_dims = ((const int32_t *) op_params)[1];
rope_ctx->mode = ((const int32_t *) op_params)[2];
rope_ctx->n_ctx_orig = ((const int32_t *) op_params)[4];
memcpy(&rope_ctx->freq_base, (int32_t *) op_params + 5, sizeof(float));
memcpy(&rope_ctx->freq_scale, (int32_t *) op_params + 6, sizeof(float));
memcpy(&rope_ctx->ext_factor, (int32_t *) op_params + 7, sizeof(float));
memcpy(&rope_ctx->attn_factor, (int32_t *) op_params + 8, sizeof(float));
memcpy(&rope_ctx->beta_fast, (int32_t *) op_params + 9, sizeof(float));
memcpy(&rope_ctx->beta_slow, (int32_t *) op_params + 10, sizeof(float));
memcpy(&rope_ctx->sections, (int32_t *) op_params + 11, sizeof(int) * 4);
rope_ctx->theta_scale = powf(rope_ctx->freq_base, -2.0f / rope_ctx->n_dims);
rope_corr_dims(rope_ctx->n_dims, rope_ctx->n_ctx_orig, rope_ctx->freq_base, rope_ctx->beta_fast,
rope_ctx->beta_slow, rope_ctx->corr_dims);
rope_ctx->octx = octx;
FARF(HIGH, "rope-f32 n_dims:%d, ext_factor:%.6f, theta_scale:%.6f, attn_factor:%.6f\n", rope_ctx->n_dims,
rope_ctx->ext_factor, rope_ctx->theta_scale, rope_ctx->attn_factor);
}
static void hvx_calc_rope_f32(const float * restrict src0,
float * restrict dst,
const int num_elems,
const float * restrict theta_cache) {
// for (int i = 0; i < num_elems; i += 2) {
//const float cos_theta = theta_cache[i + 0];
//const float sin_theta = theta_cache[i + 1];
//const float x0 = src[0];
//const float x1 = src[1];
//dst[0] = x0*cos_theta - x1*sin_theta;
//dst[1] = x0*sin_theta + x1*cos_theta;
//src += 2;
//dst += 2;
// }
const uint8_t * restrict src0_curr = (const uint8_t *) src0;
const uint8_t * restrict theta_curr = (const uint8_t *) theta_cache;
uint8_t * restrict dst_curr = (uint8_t *) dst;
int step_of_1 = num_elems >> 6; // 6 because we process two vectors at once
for (int i = 0; i < step_of_1; i++) {
HVX_Vector v0 = *(HVX_Vector *) src0_curr;
HVX_Vector v1 = *(HVX_Vector *) (src0_curr + VLEN);
HVX_Vector v2 = *(HVX_Vector *) theta_curr;
HVX_Vector v3 = *(HVX_Vector *) (theta_curr + VLEN);
HVX_VectorPair vx0_x1 = Q6_W_vdeal_VVR(v1, v0, -4); // vx0_x1[0] = x0, vx0_x1[1] = x1
HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4); // vcos_sin[0] = cos_theta, vcos_sin[1] = sin_theta
HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_lo_W(vcos_sin));
HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_hi_W(vcos_sin));
HVX_Vector vx1_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_hi_W(vx0_x1), Q6_V_lo_W(vcos_sin));
HVX_Vector vx1_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_hi_W(vx0_x1), Q6_V_hi_W(vcos_sin));
HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s);
HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c);
HVX_VectorPair vstore = Q6_W_vshuff_VVR(Q6_Vsf_equals_Vqf32(v5), Q6_Vsf_equals_Vqf32(v4), -4);
*(HVX_Vector *) dst_curr = Q6_V_lo_W(vstore);
*(HVX_Vector *) (dst_curr + VLEN) = Q6_V_hi_W(vstore);
src0_curr += 2 * VLEN;
theta_curr += 2 * VLEN;
dst_curr += 2 * VLEN;
}
}
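// Design note: Q6_W_vdeal_VVR with a 4-byte element control (-4) de-interleaves
// the (x0,x1) and (cos,sin) fp32 pairs into separate vectors so the four
// products can be formed with plain elementwise multiplies; Q6_W_vshuff_VVR
// then re-interleaves the two results back into pair order before the stores.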
static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
const uint32_t ir0,
const uint32_t ir1,
int nth,
int ith,
int opt_path) {
struct htp_ops_context * octx = rope_ctx->octx;
const struct htp_tensor * src0 = &octx->src0;
const struct htp_tensor * src1 = &octx->src1;
const struct htp_tensor * src2 = &octx->src2;
struct htp_tensor * dst = &octx->dst;
htp_rope_preamble;
const int32_t * pos = (const int32_t *) src1->data;
float * wp0 = (float *) (octx->src0_spad.data + (ith * nb01));
const float * freq_factors = NULL;
if (src2 != NULL) {
freq_factors = (const float *) src2->data;
}
int ir = 0;
for (uint32_t i3 = 0; i3 < ne3; i3++) { // batch
for (uint32_t i2 = 0; i2 < ne2; i2++) { // seq-len
const int32_t p = pos[i2];
rope_cache_init(p, rope_ctx->freq_scale, freq_factors, rope_ctx->corr_dims, ne0, rope_ctx->ext_factor,
rope_ctx->attn_factor, wp0, rope_ctx->theta_scale);
for (uint32_t i1 = 0; i1 < ne1; i1++) { // attn-heads
if (ir++ < ir0) {
continue;
}
if (ir > ir1) {
break;
}
const float * src = (float *) ((char *) src0->data + i3 * nb03 + i2 * nb02 + i1 * nb01);
float * dst_data = (float *) ((char *) dst->data + i3 * nb3 + i2 * nb2 + i1 * nb1);
const float * src_loc = src;
float * dst_data_loc = dst_data;
if (1 == opt_path) {
hvx_calc_rope_f32(src_loc, dst_data_loc, rope_ctx->n_dims, wp0);
} else {
for (uint32_t i0 = 0; i0 < rope_ctx->n_dims; i0 += 2) {
const float cos_theta = wp0[i0 + 0];
const float sin_theta = wp0[i0 + 1];
const float x0 = src_loc[0];
const float x1 = src_loc[1];
dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta;
dst_data_loc[1] = x0 * sin_theta + x1 * cos_theta;
src_loc += 2;
dst_data_loc += 2;
}
}
for (uint32_t i0 = rope_ctx->n_dims; i0 < ne0; i0 += 2) {
dst_data_loc[0] = src_loc[0];
dst_data_loc[1] = src_loc[1];
src_loc += 2;
dst_data_loc += 2;
}
}
}
}
}
static void rope_job_f32_per_thread(struct rope_th_ctx * rope_ctx, int nth, int ith) {
struct htp_ops_context * octx = rope_ctx->octx;
const struct htp_tensor * src0 = &octx->src0;
const struct htp_tensor * src1 = &octx->src1;
struct htp_tensor * dst = &octx->dst;
htp_rope_preamble;
const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows
const uint32_t src0_nrows_per_thread = octx->src0_nrows_per_thread;
const uint32_t src0_start_row = src0_nrows_per_thread * ith;
const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
// no work for this thread
if (src0_start_row >= src0_end_row) {
return;
}
uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();
int is_aligned = 1;
int opt_path = 0;
if ((0 == htp_is_aligned((void *) src0->data, VLEN)) || (0 == htp_is_aligned((void *) src1->data, VLEN)) ||
(0 == htp_is_aligned((void *) dst->data, VLEN))) {
FARF(HIGH, "rope-f32: unaligned addresses in rope op, possibly slower execution\n");
is_aligned = 0;
}
if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {
opt_path = 1;
}
rope_hex_f32(rope_ctx, src0_start_row, src0_end_row, nth, ith, opt_path);
t2 = HAP_perf_get_qtimer_count();
FARF(HIGH, "rope-f32: %d/%d/%d: (%u:%u) usec %u\n", ith, nth, opt_path, src0_start_row, src0_end_row,
(unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
static void rope_job_dispatcher_f32(unsigned int n, unsigned int i, void * data) {
struct rope_th_ctx * rope_ctx = (struct rope_th_ctx *) data;
rope_job_f32_per_thread(rope_ctx, n, i);
}
static int execute_op_rope_f32(struct htp_ops_context * octx) {
int err = HTP_STATUS_OK;
const struct htp_tensor * src0 = &octx->src0;
const struct htp_tensor * src1 = &octx->src1;
const struct htp_tensor * src2 = &octx->src2;
struct htp_tensor * dst = &octx->dst;
worker_callback_t op_func;
const char * op_type = NULL;
struct rope_th_ctx rope_ctx;
switch (octx->op) {
case HTP_OP_ROPE:
op_func = rope_job_dispatcher_f32;
op_type = "rope-f32";
init_rope_ctx(&rope_ctx, octx);
break;
default:
FARF(ERROR, "Unsupported Op %u\n", octx->op);
return HTP_STATUS_NO_SUPPORT;
}
const uint32_t n_threads = octx->n_threads;
const size_t src0_row_size = src0->nb[1];
const size_t src1_row_size = src0_row_size;
const size_t dst_row_size = dst->nb[1];
// VTCM scratchpads for all tensors
// N rows per thread, padded to HVX vector size
octx->dst_spad.size = htp_round_up(dst_row_size, 128) * n_threads;
octx->src0_spad.size = htp_round_up(src0_row_size, 128) * n_threads;
octx->src1_spad.size = htp_round_up(src1_row_size, 128) * n_threads;
size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size;
if (src2->ne[0]) {
FARF(HIGH,
"%s: %ux%ux%ux%u (x %ux%ux%ux%u x %ux%ux%ux%u) -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u "
"dst-spad-size %u\n",
op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2],
src1->ne[3], src2->ne[0], src2->ne[1], src2->ne[2], src2->ne[3], dst->ne[0], dst->ne[1], dst->ne[2],
dst->ne[3], octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size);
} else {
FARF(HIGH,
"%s: %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n",
op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2],
src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size,
octx->dst_spad.size);
}
// Make sure the reserved vtcm size is sufficient
if (octx->ctx->vtcm_size < spad_size) {
FARF(ERROR, "%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size,
spad_size);
return HTP_STATUS_VTCM_TOO_SMALL;
}
octx->src0_spad.data = octx->ctx->vtcm_base;
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size;
uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];
if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
uint32_t n_jobs = MIN(n_threads, src0_nrows);
octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;
worker_pool_run_func(octx->ctx->worker_pool, op_func, &rope_ctx, n_jobs);
}
return err;
}
int op_rope(struct htp_ops_context * octx) {
int err = HTP_STATUS_OK;
switch (octx->src0.type) {
case HTP_TYPE_F32:
err = execute_op_rope_f32(octx);
break;
default:
err = HTP_STATUS_NO_SUPPORT;
break;
}
return err;
}

View File

@ -0,0 +1,402 @@
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wunused-function"
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
#ifdef HTP_DEBUG
# define FARF_HIGH 1
#endif
#include <HAP_farf.h>
#include <HAP_mem.h>
#include <HAP_perf.h>
#include <HAP_ps.h>
#include <hexagon_protos.h>
#include <hexagon_types.h>
#include <math.h>
#include <qurt_thread.h>
#include <string.h>
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-dma.h"
#include "htp-msg.h"
#include "htp-ops.h"
#include "hvx-utils.h"
#include "ops-utils.h"
#define htp_softmax_preamble3 \
const uint32_t ne00 = src0->ne[0]; \
const uint32_t ne01 = src0->ne[1]; \
const uint32_t ne02 = src0->ne[2]; \
const uint32_t ne03 = src0->ne[3]; \
\
const uint32_t nb00 = src0->nb[0]; \
const uint32_t nb01 = src0->nb[1]; \
const uint32_t nb02 = src0->nb[2]; \
const uint32_t nb03 = src0->nb[3]; \
\
const uint32_t ne10 = (src1->ne[0]) ? src1->ne[0] : 1; \
const uint32_t ne11 = (src1->ne[0]) ? src1->ne[1] : 1; \
const uint32_t ne12 = (src1->ne[0]) ? src1->ne[2] : 1; \
const uint32_t ne13 = (src1->ne[0]) ? src1->ne[3] : 1; \
\
const uint32_t nb10 = (src1->ne[0]) ? src1->nb[0] : 1; \
const uint32_t nb11 = (src1->ne[0]) ? src1->nb[1] : 1; \
const uint32_t nb12 = (src1->ne[0]) ? src1->nb[2] : 1; \
const uint32_t nb13 = (src1->ne[0]) ? src1->nb[3] : 1; \
\
const uint32_t ne0 = dst->ne[0]; \
const uint32_t ne1 = dst->ne[1]; \
const uint32_t ne2 = dst->ne[2]; \
const uint32_t ne3 = dst->ne[3]; \
\
const uint32_t nb0 = dst->nb[0]; \
const uint32_t nb1 = dst->nb[1]; \
const uint32_t nb2 = dst->nb[2]; \
const uint32_t nb3 = dst->nb[3];
struct softmax_th_ctx {
bool use_f16;
bool use_src1;
uint32_t n_head;
uint32_t n_head_log2;
float scale;
float max_bias;
float m0;
float m1;
struct htp_ops_context * octx;
};
static void init_softmax_ctx(struct softmax_th_ctx * softmax_ctx, struct htp_ops_context * octx) {
const struct htp_tensor * src0 = &octx->src0;
const struct htp_tensor * src1 = &octx->src1;
memset(softmax_ctx, 0, sizeof(struct softmax_th_ctx));
memcpy(&softmax_ctx->scale, (float *) octx->op_params, sizeof(float));
memcpy(&softmax_ctx->max_bias, (float *) octx->op_params + 1, sizeof(float));
softmax_ctx->n_head = src0->ne[2];
softmax_ctx->n_head_log2 = 1u << (uint32_t) floor(log2(softmax_ctx->n_head));
softmax_ctx->m0 = powf(2.0f, -(softmax_ctx->max_bias) / softmax_ctx->n_head_log2);
softmax_ctx->m1 = powf(2.0f, -(softmax_ctx->max_bias / 2.0f) / softmax_ctx->n_head_log2);
softmax_ctx->use_src1 = (src1->ne[0] != 0);
softmax_ctx->use_f16 = (src1->ne[0] != 0) && (src1->type == HTP_TYPE_F16);
softmax_ctx->octx = octx;
}
static void hvx_fast_softmax_prep_f32(const uint8_t * restrict src,
uint8_t * restrict dst,
const int num_elems,
float scale,
const uint8_t * restrict mask,
float slope) {
const uint8_t * restrict src_curr = src;
uint8_t * restrict dst_curr = dst;
const uint8_t * restrict mask_curr = mask;
HVX_Vector scale_vec = hvx_vec_splat_fp32(scale);
HVX_Vector slope_vec = hvx_vec_splat_fp32(slope);
int step_of_1 = num_elems >> 5;
#pragma unroll(4)
for (int i = 0; i < step_of_1; i++) {
HVX_Vector v1 = *(HVX_Vector *) src_curr;
HVX_Vector v3 = *(HVX_Vector *) mask_curr;
HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, scale_vec);
HVX_Vector v4 = Q6_Vqf32_vmpy_VsfVsf(v3, slope_vec);
HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(v2, v4);
*(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v5);
src_curr += VLEN;
dst_curr += VLEN;
mask_curr += VLEN;
}
}
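// Numerically stable softmax over one row, in three HVX passes:
//   1) reduce to max(x); 2) y = exp(x - max) into the scratch pad while
//   accumulating sum(y) in qf32; 3) dst = y * (1 / sum), falling back to a
//   scale of 1.0 when the sum is not positive to avoid dividing by zero.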
static void hvx_fast_softmax_f32(const uint8_t * restrict src,
uint8_t * restrict dst,
uint8_t * restrict pad,
const int num_elems) {
const HVX_Vector * restrict v_src = (HVX_Vector *) src;
HVX_Vector * restrict v_pad = (HVX_Vector *) pad;
HVX_Vector * restrict v_dst = (HVX_Vector *) dst;
HVX_Vector sum_vec = Q6_V_vsplat_R(0x00000000);
HVX_Vector max_vec = hvx_vec_splat_fp32(((const float *) src)[0]);
HVX_Vector zero_v = Q6_V_vzero();
HVX_Vector one_v = hvx_vec_splat_fp32(1.0);
int step_of_1 = num_elems >> 5;
#pragma unroll(4)
for (int i = 0; i < step_of_1; i++) {
HVX_Vector v1 = v_src[i];
max_vec = Q6_Vsf_vmax_VsfVsf(max_vec, v1);
}
HVX_Vector v = hvx_vec_reduce_max_fp32(max_vec);
max_vec = hvx_vec_repl4(v);
#pragma unroll(4)
for (int i = 0; i < step_of_1; i++) {
HVX_Vector v1 = v_src[i];
HVX_Vector v2 = Q6_Vqf32_vsub_VsfVsf(v1, max_vec);
HVX_Vector v3 = hvx_vec_exp_fp32(Q6_Vsf_equals_Vqf32(v2));
sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), v3);
v_pad[i] = v3;
}
v = hvx_vec_qf32_reduce_sum(sum_vec);
sum_vec = hvx_vec_repl4(Q6_Vsf_equals_Vqf32(v));
HVX_VectorPred pos_sum = Q6_Q_vcmp_gt_VwVw(sum_vec, zero_v);
HVX_Vector v4 = hvx_vec_inverse_fp32(sum_vec);
HVX_Vector scale_vec = Q6_V_vmux_QVV(pos_sum, v4, one_v);
#pragma unroll(4)
for (int i = 0; i < step_of_1; i++) {
HVX_Vector v1 = v_pad[i];
HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, scale_vec);
v_dst[i] = Q6_Vsf_equals_Vqf32(v2);
}
}
static float hvx_softmax_f32(const uint8_t * restrict src,
uint8_t * restrict dst,
uint8_t * restrict spad,
const int num_elems,
const float max) {
hvx_sub_scalar_f32(src, max, spad, num_elems);
hvx_exp_f32(spad, dst, num_elems, false);
float sum = hvx_self_sum_f32(dst, num_elems);
return sum;
}
static void softmax_htp_f32(int nth, int ith, struct softmax_th_ctx * softmax_ctx, int opt_path) {
struct htp_ops_context * octx = softmax_ctx->octx;
const struct htp_tensor * src0 = &octx->src0;
const struct htp_tensor * src1 = &octx->src1;
const struct htp_tensor * dst = &octx->dst;
htp_softmax_preamble3;
uint8_t * src0_spad_data = octx->src0_spad.data + (ith * nb01);
uint8_t * src1_spad_data = octx->src1_spad.data + (ith * nb01);
uint8_t * dst_spad_data = octx->dst_spad.data + (ith * nb1);
float * wp0 = (float *) src0_spad_data;
float * wp1 = (float *) src1_spad_data;
float * wp2 = (float *) dst_spad_data;
for (uint32_t i03 = 0; i03 < ne03; i03++) {
for (uint32_t i02 = 0; i02 < ne02; i02++) {
for (uint32_t i01 = ith; i01 < ne01; i01 += nth) {
const uint32_t i11 = i01;
const uint32_t i12 = i02 % ne12;
const uint32_t i13 = i03 % ne13;
// ALiBi
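// Per-head slope (Press et al., ALiBi): heads below n_head_log2 use
// m0^(h+1); the rest use m1^(2*(h - n_head_log2) + 1). max_bias == 0
// disables the bias entirely (slope stays 1.0).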
const uint32_t h = i02; // head
const float slope = (softmax_ctx->max_bias > 0.0f) ?
h < softmax_ctx->n_head_log2 ?
powf(softmax_ctx->m0, h + 1) :
powf(softmax_ctx->m1, 2 * (h - softmax_ctx->n_head_log2) + 1) :
1.0f;
float * sp = (float *) ((char *) octx->src0.data + i01 * nb01 + i02 * nb02 + i03 * nb03);
float * dp = (float *) ((char *) octx->dst.data + i01 * nb1 + i02 * nb2 + i03 * nb3);
// broadcast the mask across rows
__fp16 * mp_f16 = (softmax_ctx->use_src1) ?
(__fp16 *) ((char *) octx->src1.data + i11 * nb11 + i12 * nb12 + i13 * nb13) :
NULL;
float * mp_f32 = (softmax_ctx->use_src1) ?
(float *) ((char *) octx->src1.data + i11 * nb11 + i12 * nb12 + i13 * nb13) :
NULL;
if ((1 == opt_path) && (mp_f32) && !(softmax_ctx->use_f16)) {
hvx_fast_softmax_prep_f32((const uint8_t *) sp, (uint8_t *) wp0, ne00, softmax_ctx->scale,
(const uint8_t *) mp_f32, slope);
} else {
hvx_scale_f32((const uint8_t *) sp, (uint8_t *) wp0, ne00, softmax_ctx->scale);
if (mp_f32) {
if (softmax_ctx->use_f16) {
for (int i = 0; i < ne00; ++i) {
wp0[i] += slope * (float) mp_f16[i];
}
} else {
for (int i = 0; i < ne00; ++i) {
wp0[i] += slope * mp_f32[i];
}
}
}
}
if (1 == opt_path) {
hvx_fast_softmax_f32((const uint8_t *) wp0, (uint8_t *) dp, (uint8_t *) wp1, ne00);
} else {
float max = hvx_self_max_f32((const uint8_t *) wp0, ne00);
float sum = hvx_softmax_f32((const uint8_t *) wp0, (uint8_t *) wp2, (uint8_t *) wp1, ne00, max);
sum = sum > 0.0f ? (1.0f / sum) : 1.0f;
hvx_scale_f32((const uint8_t *) wp2, (uint8_t *) dp, ne00, sum);
}
}
}
}
}
static void softmax_job_f32_per_thread(struct softmax_th_ctx * softmax_ctx, int nth, int ith) {
struct htp_ops_context * octx = softmax_ctx->octx;
const struct htp_tensor * src0 = &octx->src0;
const struct htp_tensor * src1 = &octx->src1;
struct htp_tensor * dst = &octx->dst;
htp_softmax_preamble3;
const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows
const uint32_t src0_nrows_per_thread = octx->src0_nrows_per_thread;
const uint32_t src0_start_row = src0_nrows_per_thread * ith;
const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
// no work for this thread
if (src0_start_row >= src0_end_row) {
return;
}
uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();
int is_aligned = 1;
int opt_path = 0;
if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) {
is_aligned = 0;
FARF(HIGH, "softmax-f32: unaligned addresses in elementwise op, possibly slower execution\n");
}
if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {
opt_path = 1;
}
softmax_htp_f32(nth, ith, softmax_ctx, opt_path);
t2 = HAP_perf_get_qtimer_count();
FARF(HIGH, "softmax-f32 %d/%d/%d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
softmax_ctx->use_f16, opt_path, ne00, ne01, ne02, ne03, src0_start_row, src0_end_row, ne10, ne11, ne12, ne13,
ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
static void softmax_job_dispatcher_f32(unsigned int n, unsigned int i, void * p_data) {
struct softmax_th_ctx * p_softmax_ctx = (struct softmax_th_ctx *) p_data;
softmax_job_f32_per_thread(p_softmax_ctx, n, i);
}
static int execute_op_softmax_f32(struct htp_ops_context * octx) {
int err = HTP_STATUS_OK;
const struct htp_tensor * src0 = &octx->src0;
const struct htp_tensor * src1 = &octx->src1;
struct htp_tensor * dst = &octx->dst;
worker_callback_t op_func;
const char * op_type = NULL;
struct softmax_th_ctx softmax_ctx;
switch (octx->op) {
case HTP_OP_SOFTMAX:
op_func = softmax_job_dispatcher_f32;
op_type = "softmax-f32";
init_softmax_ctx(&softmax_ctx, octx);
break;
default:
FARF(ERROR, "Unsupported Op %u\n", octx->op);
return HTP_STATUS_NO_SUPPORT;
}
const uint32_t n_threads = octx->n_threads;
const size_t src0_row_size = src0->nb[1];
const size_t src1_row_size = src0_row_size;
const size_t dst_row_size = dst->nb[1];
// VTCM scratchpads for all tensors
// N rows per thread, padded to HVX vector size
octx->dst_spad.size = htp_round_up(dst_row_size, 128) * n_threads;
octx->src0_spad.size = htp_round_up(src0_row_size, 128) * n_threads;
octx->src1_spad.size = htp_round_up(src1_row_size, 128) * n_threads;
size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size;
if (src1->ne[0]) {
FARF(HIGH,
"%s: %ux%ux%ux%u x %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n",
op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2],
src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size,
octx->dst_spad.size);
} else {
FARF(HIGH, "%s: %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", op_type,
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size);
}
// Make sure the reserved vtcm size is sufficient
if (octx->ctx->vtcm_size < spad_size) {
FARF(ERROR, "%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size,
spad_size);
return HTP_STATUS_VTCM_TOO_SMALL;
}
octx->src0_spad.data = octx->ctx->vtcm_base;
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size;
uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];
if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
uint32_t n_jobs = MIN(n_threads, src0_nrows);
octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;
worker_pool_run_func(octx->ctx->worker_pool, op_func, &softmax_ctx, n_jobs);
}
return err;
}
int op_softmax(struct htp_ops_context * octx) {
int err = HTP_STATUS_OK;
switch (octx->src0.type) {
case HTP_TYPE_F32:
err = execute_op_softmax_f32(octx);
break;
default:
err = HTP_STATUS_NO_SUPPORT;
break;
}
return err;
}

View File

@ -0,0 +1,255 @@
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wunused-function"
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
#ifdef HTP_DEBUG
# define FARF_HIGH 1
#endif
#include <HAP_farf.h>
#include <HAP_mem.h>
#include <HAP_perf.h>
#include <HAP_ps.h>
#include <hexagon_protos.h>
#include <hexagon_types.h>
#include <math.h>
#include <qurt_thread.h>
#include <string.h>
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-dma.h"
#include "htp-msg.h"
#include "htp-ops.h"
#include "hvx-utils.h"
#include "ops-utils.h"
#define htp_unary_preamble \
const uint32_t ne00 = src->ne[0]; \
const uint32_t ne01 = src->ne[1]; \
const uint32_t ne02 = src->ne[2]; \
const uint32_t ne03 = src->ne[3]; \
\
const uint32_t ne0 = dst->ne[0]; \
const uint32_t ne1 = dst->ne[1]; \
const uint32_t ne2 = dst->ne[2]; \
const uint32_t ne3 = dst->ne[3]; \
\
const uint32_t nb00 = src->nb[0]; \
const uint32_t nb01 = src->nb[1]; \
const uint32_t nb02 = src->nb[2]; \
const uint32_t nb03 = src->nb[3]; \
\
const uint32_t nb0 = dst->nb[0]; \
const uint32_t nb1 = dst->nb[1]; \
const uint32_t nb2 = dst->nb[2]; \
const uint32_t nb3 = dst->nb[3];
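// RMS norm: y = x * 1/sqrt(mean(x^2) + epsilon). The fast path below does a
// qf32 sum-of-squares reduction, then one broadcast multiply by the
// reciprocal square root; no mean subtraction is involved.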
static void hvx_fast_rms_norm_f32(const uint8_t * restrict src,
uint8_t * restrict dst,
uint8_t * restrict pad,
const int num_elems,
float epsilon) {
const HVX_Vector * restrict v_src = (HVX_Vector *) src;
HVX_Vector * restrict v_dst = (HVX_Vector *) dst;
HVX_Vector sum_v = Q6_V_vsplat_R(0x00000000);
HVX_Vector epsilon_v = hvx_vec_splat_fp32(epsilon);
int step_of_1 = num_elems >> 5;
#pragma unroll(4)
for (int i = 0; i < step_of_1; i++) {
HVX_Vector v1 = v_src[i];
HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, v1);
sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, v2);
}
HVX_Vector reduced_sum = hvx_vec_qf32_reduce_sum(sum_v);
sum_v = hvx_vec_repl4(Q6_Vsf_equals_Vqf32(reduced_sum));
HVX_Vector t_v = hvx_vec_splat_fp32((float) num_elems);
HVX_Vector denom_v = hvx_vec_inverse_fp32(t_v);
HVX_Vector mean_v = Q6_Vqf32_vmpy_VsfVsf(sum_v, denom_v);
HVX_Vector mean_epsilon_v = Q6_Vqf32_vadd_Vqf32Vsf(mean_v, epsilon_v);
HVX_Vector scale_v = hvx_vec_rsqrt_fp32(Q6_Vsf_equals_Vqf32(mean_epsilon_v));
#pragma unroll(4)
for (int i = 0; i < step_of_1; i++) {
HVX_Vector v1 = v_src[i];
HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, scale_v);
v_dst[i] = Q6_Vsf_equals_Vqf32(v2);
}
}
static void rms_norm_htp_f32(const float * restrict src,
float * restrict dst,
uint8_t * restrict spad,
const uint32_t num_rows,
const uint32_t row_elems,
const size_t row_size,
int32_t * op_params,
int opt_path) {
float epsilon = 0.f;
memcpy(&epsilon, op_params, sizeof(float));
for (uint32_t ir = 0; ir < num_rows; ir++) {
const float * restrict src_local = src + (ir * row_elems);
float * restrict dst_local = dst + (ir * row_elems);
if (ir + 1 < num_rows) {
htp_l2fetch(src_local + row_elems, 1, row_size, row_size);
}
if (1 == opt_path) {
hvx_fast_rms_norm_f32((const uint8_t *) src_local, (uint8_t *) dst_local, spad, row_elems, epsilon);
} else {
float sum = hvx_sum_of_squares_f32((const uint8_t *) src_local, row_elems);
const float mean = sum / row_elems;
const float scale = 1.0f / sqrtf(mean + epsilon);
hvx_scale_f32((const uint8_t *) src_local, (uint8_t *) dst_local, row_elems, scale);
}
}
}
static void unary_job_f32_per_thread(const struct htp_tensor * src,
struct htp_tensor * dst,
uint8_t * spad,
int htp_op,
int32_t * op_params,
uint32_t nth,
uint32_t ith,
uint32_t src0_nrows_per_thread) {
htp_unary_preamble;
const size_t src0_row_size = nb01;
const size_t dst_row_size = nb1;
const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows
const uint32_t src0_start_row = src0_nrows_per_thread * ith;
const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);
// no work for this thread
if (src0_start_row >= src0_end_row) {
return;
}
uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();
int is_aligned = 1;
int opt_path = 0;
if ((0 == htp_is_aligned((void *) src->data, VLEN)) || (0 == htp_is_aligned((void *) dst->data, VLEN))) {
is_aligned = 0;
FARF(HIGH, "unary-f32: unaligned addresses in unary op, possibly slower execution\n");
}
if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {
opt_path = 1;
}
const uint8_t * restrict data_src = (const uint8_t *) src->data;
uint8_t * restrict data_dst = (uint8_t *) dst->data;
const float * restrict src_th = (float *) (data_src + (src0_start_row * src0_row_size));
float * restrict dst_th = (float *) (data_dst + (src0_start_row * dst_row_size));
uint8_t * restrict spad_th = (uint8_t *) spad + (ith * nb01);
switch (htp_op) {
case HTP_OP_RMS_NORM:
rms_norm_htp_f32(src_th, dst_th, spad_th, src0_end_row - src0_start_row, ne0, nb1, op_params, opt_path);
break;
default:
break;
}
t2 = HAP_perf_get_qtimer_count();
FARF(HIGH, "unary-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path, src->ne[0],
src->ne[1], src->ne[2], src->ne[3], src0_start_row, src0_end_row, dst->ne[0], dst->ne[1], dst->ne[2],
dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
static void unary_job_dispatcher_f32(unsigned int n, unsigned int i, void * data) {
struct htp_ops_context * octx = (struct htp_ops_context *) data;
unary_job_f32_per_thread(&octx->src0, &octx->dst, octx->src0_spad.data, octx->op, octx->op_params, n, i,
octx->src0_nrows_per_thread);
}
static int execute_op_unary_f32(struct htp_ops_context * octx) {
int err = HTP_STATUS_OK;
const struct htp_tensor * src0 = &octx->src0;
struct htp_tensor * dst = &octx->dst;
worker_callback_t unary_op_func;
const char * op_type = NULL;
switch (octx->op) {
case HTP_OP_RMS_NORM:
unary_op_func = unary_job_dispatcher_f32;
op_type = "rmsnorm-f32";
break;
default:
FARF(ERROR, "Unsupported unary Op %u\n", octx->op);
return HTP_STATUS_NO_SUPPORT;
}
const int n_threads = octx->n_threads;
const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];
const size_t src0_row_size = src0->nb[1];
const size_t dst_row_size = dst->nb[1];
// VTCM scratchpads for all tensors
octx->dst_spad.size = htp_round_up(dst_row_size, 128) * n_threads;
octx->src0_spad.size = htp_round_up(src0_row_size, 128) * n_threads;
size_t spad_size = octx->src0_spad.size + octx->dst_spad.size;
FARF(HIGH, "%s: (%ux%ux%ux%u) -> (%ux%ux%ux%u) : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", op_type,
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size);
// Make sure the reserved vtcm size is sufficient
if (octx->ctx->vtcm_size < spad_size) {
FARF(ERROR, "unary-%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size,
spad_size);
return HTP_STATUS_VTCM_TOO_SMALL;
}
octx->src0_spad.data = octx->ctx->vtcm_base;
octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size;
if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
uint32_t n_jobs = MIN(n_threads, src0_nrows);
octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;
worker_pool_run_func(octx->ctx->worker_pool, unary_op_func, octx, n_jobs);
}
return err;
}
int op_unary(struct htp_ops_context * octx) {
int err = HTP_STATUS_OK;
switch (octx->src0.type) {
case HTP_TYPE_F32:
err = execute_op_unary_f32(octx);
break;
default:
err = HTP_STATUS_NO_SUPPORT;
break;
}
return err;
}

View File

@ -0,0 +1,297 @@
#include "worker-pool.h"
#include <qurt.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef HTP_DEBUG
# define FARF_HIGH 1
#endif
#include "HAP_farf.h"
#define WORKER_THREAD_STACK_SZ (2 * 16384)
#define LOWEST_USABLE_QURT_PRIO (254)
struct worker_pool_s;
// per-worker context handed to each thread at launch
typedef struct {
struct worker_pool_s * pool;
unsigned int id;
} worker_context_t;
// internal state for one instance of the worker pool, shared by its threads
typedef struct worker_pool_s {
worker_pool_job_t job[MAX_NUM_WORKERS]; // list of job descriptors
qurt_thread_t thread[MAX_NUM_WORKERS]; // thread IDs of the workers
worker_context_t context[MAX_NUM_WORKERS]; // worker contexts
void * stack[MAX_NUM_WORKERS]; // thread stack pointers
unsigned int n_threads; // number of workers in this pool
atomic_uint seqn; // seqno used to detect new jobs
atomic_uint next_job; // next job index
atomic_uint n_pending; // number of pending jobs
atomic_uint n_jobs; // number of current jobs
atomic_bool killed; // threads need to exit
} worker_pool_t;
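// Dispatch protocol: the submitting thread publishes the job list, sets
// next_job/n_jobs/n_pending, then bumps seqn and wakes the futex. Each woken
// worker claims one job index with atomic_fetch_add(next_job), runs it, and
// decrements n_pending; the submitter spin-waits until n_pending hits zero.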
static void worker_pool_main(void * context) {
worker_context_t * me = (worker_context_t *) context;
worker_pool_t * pool = me->pool;
FARF(HIGH, "worker-pool: thread %u started", me->id);
unsigned int prev_seqn = 0;
while (!atomic_load(&pool->killed)) {
unsigned int seqn = atomic_load(&pool->seqn);
if (seqn == prev_seqn) {
// Nothing to do
qurt_futex_wait(&pool->seqn, prev_seqn);
continue;
}
// New job
prev_seqn = seqn;
unsigned int n = atomic_load(&pool->n_jobs);
unsigned int i = atomic_fetch_add(&pool->next_job, 1);
if (i >= n) {
// Spurious wakeup
continue;
}
pool->job[i].func(n, i, pool->job[i].data);
atomic_fetch_sub(&pool->n_pending, 1);
}
FARF(HIGH, "worker-pool: thread %u stopped", me->id);
}
AEEResult worker_pool_init_with_stack_size(worker_pool_context_t * context, uint32_t n_threads, uint32_t stack_size) {
int err = 0;
if (NULL == context) {
FARF(ERROR, "NULL context passed to worker_pool_init().");
return AEE_EBADPARM;
}
// Allocations
int size = (stack_size * n_threads) + (sizeof(worker_pool_t));
unsigned char * mem_blob = (unsigned char *) malloc(size);
if (!mem_blob) {
FARF(ERROR, "Could not allocate memory for worker pool!!");
return AEE_ENOMEMORY;
}
worker_pool_t * me = (worker_pool_t *) (mem_blob + stack_size * n_threads);
// name for the first worker, useful in debugging threads
char name[19];
snprintf(name, 12, "0x%8x:", (int) me);
strcat(name, "worker0");
me->n_threads = n_threads;
// initializations
for (unsigned int i = 0; i < me->n_threads; i++) {
me->stack[i] = NULL;
me->thread[i] = 0;
me->context[i].id = i;
me->context[i].pool = me;
}
// initialize job queue
me->n_pending = 0;
me->n_jobs = 0;
me->next_job = 0;
me->seqn = 0;
me->killed = 0;
// launch the workers
qurt_thread_attr_t attr;
qurt_thread_attr_init(&attr);
for (unsigned int i = 0; i < me->n_threads; i++) {
// set up stack
me->stack[i] = mem_blob;
mem_blob += stack_size;
qurt_thread_attr_set_stack_addr(&attr, me->stack[i]);
qurt_thread_attr_set_stack_size(&attr, stack_size);
// set up name
qurt_thread_attr_set_name(&attr, name);
name[17] = (name[17] + 1);
// names the threads context:worker0, context:worker1, ... (the digit wraps past '9', but the thread count should stay below 10 anyway)
if (name[17] > '9') {
name[17] = '0';
}
// set up priority - by default, match the creating thread's prio
int prio = qurt_thread_get_priority(qurt_thread_get_id());
if (prio < 1) {
prio = 1;
}
if (prio > LOWEST_USABLE_QURT_PRIO) {
prio = LOWEST_USABLE_QURT_PRIO;
}
qurt_thread_attr_set_priority(&attr, prio);
// launch
err = qurt_thread_create(&me->thread[i], &attr, worker_pool_main, (void *) &me->context[i]);
if (err) {
FARF(ERROR, "Could not launch worker threads!");
worker_pool_release((worker_pool_context_t *) &me);
return AEE_EQURTTHREADCREATE;
}
}
*context = (worker_pool_context_t) me;
return AEE_SUCCESS;
}
AEEResult worker_pool_init(worker_pool_context_t * context, uint32_t n_threads) {
return worker_pool_init_with_stack_size(context, n_threads, WORKER_THREAD_STACK_SZ);
}
// clean up worker pool
void worker_pool_release(worker_pool_context_t * context) {
worker_pool_t * me = (worker_pool_t *) *context;
// if no worker pool exists, there is nothing to release.
if (NULL == me) {
return;
}
atomic_store(&me->killed, 1);
atomic_fetch_add(&me->seqn, 1);
qurt_futex_wake(&me->seqn, me->n_threads);
// de-initializations
for (unsigned int i = 0; i < me->n_threads; i++) {
if (me->thread[i]) {
int status;
(void) qurt_thread_join(me->thread[i], &status);
}
}
// free allocated memory (were allocated as a single buffer starting at stack[0])
if (me->stack[0]) {
free(me->stack[0]);
}
*context = NULL;
}
// run jobs
AEEResult worker_pool_run_jobs(worker_pool_context_t context, worker_pool_job_t * job, unsigned int n) {
worker_pool_t * me = (worker_pool_t *) context;
if (NULL == me) {
FARF(ERROR, "worker-pool: invalid context");
return AEE_EBADPARM;
}
if (n > me->n_threads) {
FARF(ERROR, "worker-pool: invalid number of jobs %u for n-threads %u", n, me->n_threads);
return AEE_EBADPARM;
}
memcpy(me->job, job, sizeof(worker_pool_job_t) * n);
if (n > 1) {
atomic_store(&me->next_job, 1);
atomic_store(&me->n_jobs, n);
atomic_store(&me->n_pending, n - 1);
// wake up workers
atomic_fetch_add(&me->seqn, 1);
qurt_futex_wake(&me->seqn, n - 1);
}
// main thread runs job #0
me->job[0].func(n, 0, me->job[0].data);
if (n > 1) {
while (atomic_load(&me->n_pending))
;
}
return 0;
}
// run func
AEEResult worker_pool_run_func(worker_pool_context_t context, worker_callback_t func, void * data, unsigned int n) {
worker_pool_job_t job[n];
for (unsigned int i = 0; i < n; i++) {
job[i].func = func;
job[i].data = data;
}
return worker_pool_run_jobs(context, job, n);
}
AEEResult worker_pool_set_thread_priority(worker_pool_context_t context, unsigned int prio) {
worker_pool_t * me = (worker_pool_t *) context;
// if no worker pool exists, return error.
if (!me) {
return AEE_ENOMORE;
}
int result = AEE_SUCCESS;
if (prio < 1) {
prio = 1;
}
if (prio > LOWEST_USABLE_QURT_PRIO) {
prio = LOWEST_USABLE_QURT_PRIO;
}
for (unsigned int i = 0; i < me->n_threads; i++) {
int res = qurt_thread_set_priority(me->thread[i], (unsigned short) prio);
if (0 != res) {
result = AEE_EBADPARM;
FARF(ERROR, "QURT failed to set priority of thread %d, ERROR = %d", me->thread[i], res);
}
}
return result;
}
AEEResult worker_pool_retrieve_thread_id(worker_pool_context_t context, unsigned int * tids) {
worker_pool_t * me = (worker_pool_t *) context;
if (!me) {
FARF(ERROR, "worker-pool: invalid context");
return AEE_EBADPARM;
}
for (unsigned int i = 0; i < me->n_threads; i++) {
tids[i] = me->thread[i];
}
return AEE_SUCCESS;
}
AEEResult worker_pool_get_thread_priority(worker_pool_context_t context, unsigned int * prio) {
worker_pool_t * me = (worker_pool_t *) context;
if (!me) {
FARF(ERROR, "worker-pool: invalid context");
return AEE_EBADPARM;
}
int priority = qurt_thread_get_priority(me->thread[0]);
if (priority > 0) {
*prio = priority;
return 0;
} else {
*prio = 0;
return AEE_EBADSTATE;
}
}

View File

@ -0,0 +1,57 @@
#ifndef HTP_WORKER_POOL_H
#define HTP_WORKER_POOL_H
// Macro that keeps these functions visible when built into a shared library.
#define WORKERPOOL_API __attribute__((visibility("default")))
#include <AEEStdDef.h>
#include <AEEStdErr.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
/// signature of callbacks to be invoked by worker threads
typedef void (*worker_callback_t)(unsigned int n, unsigned int i, void *);
/// Typedef of worker_pool context
typedef void * worker_pool_context_t;
/// descriptor for requested callback
typedef struct {
worker_callback_t func;
void * data;
} worker_pool_job_t;
/// Maximum supported number of worker threads.
#define MAX_NUM_WORKERS 10
// Initialize worker pool.
WORKERPOOL_API AEEResult worker_pool_init(worker_pool_context_t * context, uint32_t n_threads);
// Initialize worker pool with custom stack size
WORKERPOOL_API AEEResult worker_pool_init_with_stack_size(worker_pool_context_t * context,
uint32_t n_threads,
uint32_t stack_size);
// Kill worker threads and release worker pool resources
WORKERPOOL_API void worker_pool_release(worker_pool_context_t * context);
// Run jobs with the worker pool.
WORKERPOOL_API AEEResult worker_pool_run_jobs(worker_pool_context_t context, worker_pool_job_t * job, unsigned int n);
WORKERPOOL_API AEEResult worker_pool_run_func(worker_pool_context_t context,
worker_callback_t func,
void * data,
unsigned int n);
WORKERPOOL_API AEEResult worker_pool_set_thread_priority(worker_pool_context_t context, unsigned int prio);
WORKERPOOL_API AEEResult worker_pool_get_thread_priority(worker_pool_context_t context, unsigned int * prio);
WORKERPOOL_API AEEResult worker_pool_retrieve_thread_id(worker_pool_context_t context, unsigned int * tids);
#ifdef __cplusplus
}
#endif
#endif // #ifndef HTP_WORKER_POOL_H
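// Hypothetical usage sketch (illustrative only; my_job and my_data are not
// part of this header): run one callback across four worker threads and
// block until all of them finish.
//
//   static void my_job(unsigned int n, unsigned int i, void * data) {
//       // thread i of n processes its slice of `data` here
//   }
//
//   worker_pool_context_t pool = NULL;
//   if (worker_pool_init(&pool, 4) == AEE_SUCCESS) {
//       worker_pool_run_func(pool, my_job, my_data, 4); // job 0 runs on the caller
//       worker_pool_release(&pool);
//   }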

View File

@ -647,6 +647,36 @@ static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx
return ggml_can_fuse_ext(cgraph, idxs, ops, num_ops);
}
GGML_API bool ggml_can_fuse_subgraph_ext(const struct ggml_cgraph * cgraph,
const int * node_idxs,
int count,
const enum ggml_op * ops,
const int * outputs,
int num_outputs);
// Returns true if the subgraph formed by {node_idxs} can be fused.
// Checks whether all nodes which are not part of `outputs` can be elided
// by verifying that their use counts are confined to the subgraph.
static inline bool ggml_can_fuse_subgraph(const struct ggml_cgraph * cgraph,
int node_idx,
int count,
const enum ggml_op * ops,
const int * outputs,
int num_outputs) {
GGML_ASSERT(count < 32);
if (node_idx + count > cgraph->n_nodes) {
return false;
}
int idxs[32];
for (int i = 0; i < count; ++i) {
idxs[i] = node_idx + i;
}
return ggml_can_fuse_subgraph_ext(cgraph, idxs, count, ops, outputs, num_outputs);
}
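// Hypothetical usage sketch (the op chain is illustrative): check whether a
// three-node RMS_NORM -> MUL -> ADD chain starting at node i can be fused,
// with only the final ADD visible outside the fused kernel:
//
//   const enum ggml_op ops[3]  = { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ADD };
//   const int       outputs[1] = { i + 2 }; // graph index of the chain's last node
//   if (ggml_can_fuse_subgraph(cgraph, i, 3, ops, outputs, 1)) {
//       // dispatch the fused kernel
//   }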
#ifdef __cplusplus
}
#endif
@ -660,6 +690,13 @@ inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::
return ggml_can_fuse(cgraph, node_idx, ops.begin(), (int)ops.size());
}
inline bool ggml_can_fuse_subgraph(const struct ggml_cgraph * cgraph,
int start_idx,
std::initializer_list<enum ggml_op> ops,
std::initializer_list<int> outputs = {}) {
return ggml_can_fuse_subgraph(cgraph, start_idx, (int) ops.size(), ops.begin(), outputs.begin(), (int) outputs.size());
}
// expose GGUF internals for test code
GGML_API size_t gguf_type_size(enum gguf_type type);
GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);

View File

@ -15,13 +15,12 @@
#include <CL/cl.h>
#include <inttypes.h>
#include <string.h>
#include <cstddef>
#include <cstdint>
#include <atomic>
#include <fstream>
#include <limits>
#include <vector>
#include <string>
#include <cmath>
@ -533,25 +532,17 @@ struct ggml_backend_opencl_context {
}
// Dump a csv
float total_kernel_time = 0;
fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
fprintf(fperf, "op name, kernel name, exec duration (ms), global size, local size, output size\n");
for (const ProfilingInfo & info : profiling_info) {
total_kernel_time += info.cmd_duration_ns/1.e6f;
fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
fprintf(fperf, "%s,%s,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
info.op_name.c_str(), info.kernel_name.c_str(),
info.cmd_queued_duration_ns/1.e6f,
info.cmd_submit_duration_ns/1.e6f,
info.cmd_duration_ns/1.e6f,
info.cmd_complete_duration_ns/1.e6f,
info.cmd_total_duration_ns/1.e6f,
info.global_size[0], info.global_size[1], info.global_size[2],
info.local_size[0], info.local_size[1], info.local_size[2],
info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
}
fclose(fperf);
GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
// Dump a simple chrome trace
FILE* ftrace = fopen("cl_trace.json", "w");
if (!ftrace) {
@ -561,14 +552,14 @@ struct ggml_backend_opencl_context {
fprintf(ftrace, "[\n");
for (const ProfilingInfo & info : profiling_info) {
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %llu, \"pid\": \"\", \"tid\": \"Host\"},\n",
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Host\"},\n",
info.kernel_name.c_str(), info.cmd_queued/1000);
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %llu, \"pid\": \"\", \"tid\": \"Host\"},\n",
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Host\"},\n",
info.kernel_name.c_str(), info.cmd_submit/1000);
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %llu, \"pid\": \"\", \"tid\": \"Device\"},\n",
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Device\"},\n",
info.kernel_name.c_str(), info.cmd_start/1000);
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %llu, \"pid\": \"\", \"tid\": \"Device\"},\n",
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Device\"},\n",
info.kernel_name.c_str(), info.cmd_end/1000);
}
fclose(ftrace);
@ -7652,6 +7643,8 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
const cl_ulong nb21 = src2->nb[1];
const cl_ulong nb20 = src2->nb[0];
UNUSED(nb20);
const int ne0 = dst->ne[0];
const int ne1 = dst->ne[1];

View File

@ -37,5 +37,7 @@
#include "softmax.hpp"
#include "tsembd.hpp"
#include "wkv.hpp"
#include "pad_reflect_1d.hpp"
#endif // GGML_SYCL_BACKEND_HPP

View File

@ -30,6 +30,9 @@
#include <regex>
#include <sycl/sycl.hpp>
#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
# include <sycl/ext/oneapi/experimental/async_alloc/async_alloc.hpp>
#endif
#include <sycl/half_type.hpp>
#include "ggml-sycl.h"
@ -54,6 +57,7 @@ int g_ggml_sycl_disable_optimize = 0;
int g_ggml_sycl_disable_graph = 0;
int g_ggml_sycl_disable_dnn = 0;
int g_ggml_sycl_prioritize_dmmv = 0;
int g_ggml_sycl_use_async_mem_op = 0;
static ggml_sycl_device_info ggml_sycl_init() {
ggml_sycl_device_info info = {};
@ -237,7 +241,20 @@ static void ggml_check_sycl() try {
fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
#endif
*/
// Currently, we only use async malloc / free when graphs are enabled, as it is required for the calls to be
// properly recorded. As this SYCL extension matures it may become beneficial to enable it as the default
// path and to use it in other places.
#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
g_ggml_sycl_use_async_mem_op = !g_ggml_sycl_disable_graph;
if (g_ggml_sycl_use_async_mem_op) {
for (unsigned int i = 0; i < dpct::dev_mgr::instance().device_count(); ++i) {
if (!dpct::dev_mgr::instance().get_device(i).has(sycl::aspect::ext_oneapi_async_memory_alloc)) {
g_ggml_sycl_use_async_mem_op = 0;
break;
}
}
}
#endif
if (CHECK_TRY_ERROR(g_all_sycl_device_count =
dpct::dev_mgr::instance().device_count()) != 0) {
initialized = true;
@ -3031,19 +3048,51 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
}
}
// Helper functions to unify device memory allocation for both async and sync paths
static inline void * sycl_ext_malloc_device(dpct::queue_ptr stream, size_t size) {
bool use_async = g_ggml_sycl_use_async_mem_op;
#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
if (use_async) {
return syclex::async_malloc(*stream, sycl::usm::alloc::device, size);
}
#else
// If async allocation extension is not available, use_async should always be false.
GGML_ASSERT(!use_async);
#endif
return sycl::malloc(size, *stream, sycl::usm::alloc::device);
}
static inline void sycl_ext_free(dpct::queue_ptr stream, void * ptr) {
bool use_async = g_ggml_sycl_use_async_mem_op;
#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
if (use_async) {
syclex::async_free(*stream, ptr);
return;
}
#else
// If async allocation extension is not available, use_async should always be false.
GGML_ASSERT(!use_async);
#endif
sycl::free(ptr, *stream);
}
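// Both helpers return without waiting: when the async extension is
// unavailable they fall back to the synchronous USM sycl::malloc / sycl::free
// path, and callers additionally wait on the copy / kernel events themselves
// (see the !g_ggml_sycl_use_async_mem_op checks in the reorder functions).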
static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nrows, size_t size, size_t offset,
dpct::queue_ptr stream) {
auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
SYCL_CHECK(
CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
.wait()));
uint8_t * tmp_buf = static_cast<uint8_t *>(sycl_ext_malloc_device(stream, size));
sycl::event copy_event;
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
if (!g_ggml_sycl_use_async_mem_op) {
copy_event.wait();
}
GGML_ASSERT((size % sizeof(block_q4_0) == 0));
GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
int offset_blks = offset / sizeof(block_q4_0);
auto qs_ptr = data_device + offset_blks * QK4_0 / 2;
auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;
stream->parallel_for(
auto reorder_event = stream->parallel_for(
size / sizeof(block_q4_0),
[=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
const block_q4_0* x = (const block_q4_0*)tmp_buf;
@ -3054,9 +3103,11 @@ static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nr
*(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
}
*(d_ptr + ib) = x[ib].d;
}).wait_and_throw();
sycl::free(tmp_buf, *stream);
});
if (!g_ggml_sycl_use_async_mem_op) {
reorder_event.wait_and_throw();
}
sycl_ext_free(stream, tmp_buf);
}
static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
@ -3065,14 +3116,19 @@ static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d
const int nblocks = size / sizeof(block_q4_K);
auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait()));
uint8_t * tmp_buf = static_cast<uint8_t *>(sycl_ext_malloc_device(stream, size));
sycl::event copy_event;
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
if (!g_ggml_sycl_use_async_mem_op) {
copy_event.wait();
}
auto * qs_ptr = data_device;
auto * scales_ptr = qs_ptr + QK_K / 2 * nblocks;
auto * dm_ptr = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * nblocks);
stream->parallel_for(nblocks, [=](auto i) {
auto reorder_event = stream->parallel_for(nblocks, [=](auto i) {
const block_q4_K * x = (const block_q4_K *) tmp_buf;
const int ib = i;
@ -3085,9 +3141,11 @@ static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d
}
dm_ptr[ib] = x[ib].dm;
}).wait_and_throw();
sycl::free(tmp_buf, *stream);
});
if (!g_ggml_sycl_use_async_mem_op) {
reorder_event.wait_and_throw();
}
sycl_ext_free(stream, tmp_buf);
}
static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
@ -3096,17 +3154,20 @@ static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, d
const int nblocks = size / sizeof(block_q6_K);
auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait()));
uint8_t * tmp_buf = static_cast<uint8_t *>(sycl_ext_malloc_device(stream, size));
sycl::event copy_event;
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
if (!g_ggml_sycl_use_async_mem_op) {
copy_event.wait();
}
auto * ql_ptr = data_device;
auto * qh_ptr = ql_ptr + (QK_K / 2) * nblocks;
auto * scales_ptr = qh_ptr + (QK_K / 4) * nblocks;
sycl::half * dm_ptr = (sycl::half *) (scales_ptr + (QK_K / 16) * nblocks);
stream
->parallel_for(nblocks,
[=](auto i) {
auto reorder_event = stream->parallel_for(nblocks, [=](auto i) {
const block_q6_K * x = (const block_q6_K *) tmp_buf;
const int ib = i;
@ -3128,10 +3189,11 @@ static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, d
}
dm_ptr[ib] = x[ib].d;
})
.wait_and_throw();
sycl::free(tmp_buf, *stream);
});
if (!g_ggml_sycl_use_async_mem_op) {
reorder_event.wait_and_throw();
}
sycl_ext_free(stream, tmp_buf);
}
static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
@ -3744,6 +3806,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
case GGML_OP_CONCAT:
ggml_sycl_op_concat(ctx, dst);
break;
case GGML_OP_PAD_REFLECT_1D:
ggml_sycl_op_pad_reflect_1d(ctx, dst);
break;
case GGML_OP_UPSCALE:
ggml_sycl_upscale(ctx, dst);
break;
@ -4053,6 +4118,18 @@ static bool check_graph_compatibility(ggml_cgraph * cgraph) {
GGML_LOG_INFO("%s: disabling SYCL graphs due to unsupported node type %s\n", __func__,
ggml_op_name(node_op));
return false;
case GGML_OP_MUL_MAT:
// We cannot use graphs with ggml_sycl_mul_mat() when the SYCL async memory allocation extension is not
// available: the reorder path performs SYCL malloc / free and host-side waits, none of which are
// supported while recording to a graph.
if (!g_ggml_sycl_use_async_mem_op) {
GGML_LOG_INFO(
"%s: disabling SYCL graphs due to unsupported node type when using a compiler without the "
"oneAPI async memory allocation extension "
"%s\n",
__func__, ggml_op_name(node_op));
return false;
}
}
}
return true;
@ -4455,6 +4532,8 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_DIV:
case GGML_OP_REPEAT:
return true;
case GGML_OP_PAD_REFLECT_1D:
return ggml_is_contiguous(op->src[0]) && op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_SQR:
case GGML_OP_SQRT:
case GGML_OP_SIN:

View File

@ -0,0 +1,72 @@
#include "pad_reflect_1d.hpp"
void pad_reflect_1d_f32(const float * src, float * dst,
const int64_t ne0, const int64_t ne02, const int p0, const int p1,
const int64_t nb0, const int64_t nb1, const int64_t nb2, const int64_t nb3,
const int64_t nb00, const int64_t nb01, const int64_t nb02, const int64_t nb03,
const sycl::nd_item<3> & item_ct1) {
const int i0 = item_ct1.get_group(0) * SYCL_CONCAT_BLOCK_SIZE + item_ct1.get_local_id(0);
const int i1 = item_ct1.get_group(1);
const int g2 = item_ct1.get_group(2);
const int i2 = g2 % ne02;
const int i3 = g2 / ne02;
if (i0 >= p0 + ne0 + p1) return;
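// Reflect (no edge repeat): fold the padded index into [0, ne0-1].
// Example for ne0 = 4, p0 = 2: dst columns -2..5 read src 2,1,0,1,2,3,2,1.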
int t = i0 - p0;
int period = 2 * ne0 - 2;
int m = t % period;
m += (m < 0) * period;
int center = ne0 - 1;
int srci0 = center - abs(center - m);
int offset_src = i3*nb3 + i2*nb2 + i1*nb1 + srci0*nb0;
int offset_dst = i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00;
dst[offset_dst] = src[offset_src];
}
void ggml_sycl_op_pad_reflect_1d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
queue_ptr stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
const int32_t * opts = (const int32_t *) dst->op_params;
const int p0 = opts[0];
const int p1 = opts[1];
const int64_t ne0 = src0->ne[0];
const int64_t ne00 = dst->ne[0];
const int64_t ne01 = dst->ne[1];
const int64_t ne02 = dst->ne[2];
const int64_t ne03 = dst->ne[3];
const int64_t nb00 = dst->nb[0];
const int64_t nb01 = dst->nb[1];
const int64_t nb02 = dst->nb[2];
const int64_t nb03 = dst->nb[3];
const int64_t nb0 = src0->nb[0];
const int64_t nb1 = src0->nb[1];
const int64_t nb2 = src0->nb[2];
const int64_t nb3 = src0->nb[3];
int num_blocks = (ne00 + SYCL_CONCAT_BLOCK_SIZE - 1) / SYCL_CONCAT_BLOCK_SIZE;
sycl::range<3> global(num_blocks * SYCL_CONCAT_BLOCK_SIZE, ne01, ne02*ne03);
sycl::range<3> local(SYCL_CONCAT_BLOCK_SIZE, 1, 1);
stream->parallel_for(
sycl::nd_range<3>(global, local),
[=](sycl::nd_item<3> item_ct1) {
pad_reflect_1d_f32(
(const float *) src0->data, (float *) dst->data,
ne0, ne02, p0, p1,
nb0, nb1, nb2, nb3,
nb00, nb01, nb02, nb03,
item_ct1);
});
}

View File

@ -0,0 +1,8 @@
#ifndef GGML_SYCL_PAD_REFLECT_1D_HPP
#define GGML_SYCL_PAD_REFLECT_1D_HPP
#include "common.hpp"
void ggml_sycl_op_pad_reflect_1d(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
#endif // GGML_SYCL_PAD_REFLECT_1D_HPP

View File

@ -96,8 +96,6 @@ static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; }
#define GGML_VK_MAX_NODES 8192
#define MAX_VK_BUFFERS 256
#define VK_CHECK(err, msg) \
do { \
vk::Result err_ = (err); \
@ -1311,7 +1309,6 @@ struct ggml_vk_garbage_collector {
std::vector<vk_semaphore> tl_semaphores;
std::vector<vk_semaphore> semaphores;
std::vector<vk::Event> events;
std::vector<vk_buffer> temp_buffers;
std::vector<vk_context> contexts;
};
@ -1482,8 +1479,6 @@ struct ggml_backend_vk_context {
// and set to true after the buffer contents are consumed.
bool prealloc_x_need_sync, prealloc_y_need_sync, prealloc_split_k_need_sync;
vk_buffer buffer_pool[MAX_VK_BUFFERS];
vk_context_ref compute_ctx;
vk_context_ref transfer_ctx;
@ -3623,8 +3618,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv7_f32, "rwkv_wkv7_f32", rwkv_wkv7_f32_len, rwkv_wkv7_f32_data, "main", 8, sizeof(vk_op_rwkv_wkv7_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d128, "ssm_scan_f32", ssm_scan_f32_len, ssm_scan_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {128, device->subgroup_size, 16}, 1);
ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d256, "ssm_scan_f32", ssm_scan_f32_len, ssm_scan_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {256, device->subgroup_size, 16}, 1);
if (device->subgroup_arithmetic && device->subgroup_require_full_support) {
ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d128, "ssm_scan_128_f32", ssm_scan_subgroup_f32_len, ssm_scan_subgroup_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {128, device->subgroup_size, 16}, 1, true, true);
ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d256, "ssm_scan_256_f32", ssm_scan_subgroup_f32_len, ssm_scan_subgroup_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {256, device->subgroup_size, 16}, 1, true, true);
} else {
ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d128, "ssm_scan_128_f32", ssm_scan_f32_len, ssm_scan_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {128, device->subgroup_size, 16}, 1, true, true);
ggml_vk_create_pipeline(device, device->pipeline_ssm_scan_f32_d256, "ssm_scan_256_f32", ssm_scan_f32_len, ssm_scan_f32_data, "main", 8, sizeof(vk_op_ssm_scan_push_constants), {1, 1, 1}, {256, device->subgroup_size, 16}, 1, true, true);
}
ggml_vk_create_pipeline(device, device->pipeline_ssm_conv_f32, "ssm_conv_f32", ssm_conv_f32_len, ssm_conv_f32_data, "main", 3, sizeof(vk_op_ssm_conv_push_constants), {32, 1, 1}, {32}, 1);
@ -5144,71 +5144,6 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context
return ctx->device->pipeline_dequant_mul_mat_vec_id_f32[a_type];
}
static vk_buffer ggml_vk_pool_malloc(ggml_backend_vk_context * ctx, size_t size) {
VK_LOG_DEBUG("ggml_vk_pool_malloc(" << size << ")");
VK_LOG_MEMORY("ggml_vk_pool_malloc");
int best_i = -1;
size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs
int worst_i = -1;
size_t worst_size = 0; //largest unused buffer seen so far
for (int i = 0; i < MAX_VK_BUFFERS; ++i) {
vk_buffer &b = ctx->buffer_pool[i];
if (b != nullptr && b->size >= size && b->size < best_size) {
best_i = i;
best_size = b->size;
}
if (b != nullptr && b->size > worst_size) {
worst_i = i;
worst_size = b->size;
}
}
if(best_i != -1) {
//found the smallest buffer that fits our needs
vk_buffer b = ctx->buffer_pool[best_i];
ctx->buffer_pool[best_i].reset();
return b;
}
if(worst_i != -1) {
//no buffer that fits our needs, resize largest one to save memory
vk_buffer& b = ctx->buffer_pool[worst_i];
ggml_vk_destroy_buffer(b);
}
return ggml_vk_create_buffer_device(ctx->device, size);
}
static void ggml_vk_pool_free(ggml_backend_vk_context * ctx, vk_buffer& buffer) {
VK_LOG_DEBUG("ggml_vk_pool_free(" << buffer->size << ")");
for (int i = 0; i < MAX_VK_BUFFERS; ++i) {
vk_buffer& b = ctx->buffer_pool[i];
if (b == nullptr) {
b = buffer;
return;
}
}
std::cerr << "ggml_vulkan: WARNING: vk buffer pool full, increase MAX_VK_BUFFERS" << std::endl;
ggml_vk_destroy_buffer(buffer);
}
// Returns an available temporary buffer that may only be used temporarily, it will be reused
static vk_buffer ggml_vk_create_buffer_temp(ggml_backend_vk_context * ctx, size_t size) {
// Try to find existing temp buffer with enough capacity
for (auto& buffer : ctx->gc.temp_buffers) {
if (buffer->size >= size) {
return buffer;
}
}
VK_LOG_MEMORY("ggml_vk_create_buffer_temp(" << size << ")");
// Otherwise create new buffer
vk_buffer buf = ggml_vk_pool_malloc(ctx, size);
ctx->gc.temp_buffers.push_back(buf);
return buf;
}
static void * ggml_vk_host_malloc(vk_device& device, size_t size) {
VK_LOG_MEMORY("ggml_vk_host_malloc(" << size << ")");
vk_buffer buf = ggml_vk_create_buffer(device, size,
@ -11789,10 +11724,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
// Clean up after graph processing is done
static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
VK_LOG_DEBUG("ggml_vk_graph_cleanup()");
for (auto& buffer : ctx->gc.temp_buffers) {
ggml_vk_pool_free(ctx, buffer);
}
ctx->gc.temp_buffers.clear();
ctx->prealloc_y_last_pipeline_used = {};
ctx->unsynced_nodes_written.clear();
@ -11835,10 +11766,6 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
ggml_vk_destroy_buffer(ctx->prealloc_split_k);
ctx->prealloc_y_last_pipeline_used = nullptr;
for (auto& buffer : ctx->buffer_pool) {
ggml_vk_destroy_buffer(buffer);
}
ctx->prealloc_size_x = 0;
ctx->prealloc_size_y = 0;
ctx->prealloc_size_split_k = 0;

View File

@ -345,7 +345,7 @@ void main() {
float Lfrcp[Br];
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
Lfrcp[r] = 1.0 / Lf[r];
Lfrcp[r] = (Lf[r] == 0.0) ? 0.0 : (1.0 / Lf[r]);
}
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {

View File

@ -380,7 +380,7 @@ void main() {
float Lfrcp[rows_per_thread];
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
Lfrcp[r] = 1.0 / Lf[r];
Lfrcp[r] = (Lf[r] == 0.0) ? 0.0 : (1.0 / Lf[r]);
}
[[unroll]] for (uint32_t d = 0; d < HSV_per_thread / 4; ++d) {

View File

@ -121,7 +121,11 @@ void main() {
const float NEG_FLT_MAX_OVER_2 = uintBitsToFloat(0xFEFFFFFF);
L = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0);
#if defined(ACC_TYPE_MAX)
M = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(-ACC_TYPE_MAX / ACC_TYPE(2));
#else
M = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(NEG_FLT_MAX_OVER_2);
#endif
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> slopeMat = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(1.0);
@ -294,7 +298,7 @@ void main() {
[[unroll]]
for (int k = 0; k < Ldiag.length(); ++k) {
Ldiag[k] = ACC_TYPE(1.0) / Ldiag[k];
Ldiag[k] = (Ldiag[k] == 0.0) ? ACC_TYPE(0.0) : (ACC_TYPE(1.0) / Ldiag[k]);
}
O = Ldiag*O;

View File

@ -91,7 +91,7 @@ void main() {
L = L*ms + vs;
}
L = 1.0 / L;
L = (L == 0.0) ? 0.0 : 1.0 / L;
// D dimension is split across workgroups in the y dimension
uint d = tid + gl_WorkGroupID.y * BLOCK_SIZE;

View File

@ -1,6 +1,9 @@
#version 450
#extension GL_EXT_control_flow_attributes : require
#if USE_SUBGROUP_ADD
#extension GL_KHR_shader_subgroup_arithmetic : enable
#endif
#include "types.glsl"
@ -84,35 +87,47 @@ void main() {
}
barrier();
for (uint w = D_STATE; w > SUBGROUP_SIZE; w >>= 1) {
[[unroll]] for (uint j = 0; j < ((w >> 1) * SPLIT_H + D_STATE - 1) / D_STATE; j++) {
const uint k = (tid % (w >> 1)) +
(D_STATE * (tid / (w >> 1))) +
j * D_STATE * (D_STATE / (w >> 1));
if (k < SPLIT_H * D_STATE && (k + (w >> 1)) < SPLIT_H * D_STATE) {
stateC[k] += stateC[k + (w >> 1)];
[[unroll]]
for (uint w = D_STATE / 2; w >= SUBGROUP_SIZE; w >>= 1) {
[[unroll]] for (uint j = 0; j < (w * SPLIT_H + D_STATE - 1) / D_STATE; j++) {
const uint k = (tid % w) + (D_STATE * (tid / w)) + j * D_STATE * (D_STATE / w);
if (k < SPLIT_H * D_STATE && (k + w) < SPLIT_H * D_STATE) {
stateC[k] += stateC[k + w];
}
}
barrier();
}
[[unroll]] for (uint j = 0; j <= SPLIT_H / (D_STATE / SUBGROUP_SIZE); j++) {
[[unroll]] for (uint j = 0; j < max(1, SPLIT_H / (D_STATE / SUBGROUP_SIZE)); j++) {
const uint idx = (tid % SUBGROUP_SIZE) +
D_STATE * (tid / SUBGROUP_SIZE) +
j * D_STATE * (D_STATE / SUBGROUP_SIZE);
const uint max_idx = SUBGROUP_SIZE - 1 +
D_STATE * ((D_STATE - 1) / SUBGROUP_SIZE) +
j * D_STATE * (D_STATE / SUBGROUP_SIZE);
uint lane = tid % SUBGROUP_SIZE;
if (idx < SPLIT_H * D_STATE ||
max_idx < SPLIT_H * D_STATE) {
float sc;
#if USE_SUBGROUP_ADD
sc = stateC[idx];
sc = subgroupAdd(sc);
#else
[[unroll]] for (uint offset = SUBGROUP_SIZE / 2; offset > 0; offset >>= 1) {
if (idx + offset < SPLIT_H * D_STATE) {
stateC[idx] += stateC[idx + offset];
}
barrier();
}
if (tid % SUBGROUP_SIZE == 0) {
sc = stateC[idx];
}
#endif
if (idx < SPLIT_H * D_STATE && tid % SUBGROUP_SIZE == 0) {
if (tid % SUBGROUP_SIZE == 0) {
const uint k = tid / SUBGROUP_SIZE + j * (D_STATE / SUBGROUP_SIZE);
d[y_base_idx + i * stride_y + k] = stateC[idx];
d[y_base_idx + i * stride_y + k] = sc;
}
}
}

View File

@ -917,6 +917,7 @@ void process_shaders() {
string_to_spv("multi_add_rms_f32", "multi_add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"RTE16", "1"}, {"ADD_RMS" , "1"}});
string_to_spv("ssm_scan_f32", "ssm_scan.comp", {{"A_TYPE", "float"}});
string_to_spv("ssm_scan_subgroup_f32", "ssm_scan.comp", {{"A_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}});
string_to_spv("ssm_conv_f32", "ssm_conv.comp", {{"A_TYPE", "float"}});

View File

@ -6964,6 +6964,78 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
GGML_LOG_INFO("========================================\n");
}
static int ggml_node_list_find_tensor(const struct ggml_cgraph * cgraph,
const int * idxs,
int count,
const struct ggml_tensor * tensor) {
GGML_ASSERT(cgraph && idxs);
for (int i = 0; i < count; ++i) {
const int node_idx = idxs[i];
if (node_idx >= cgraph->n_nodes) {
return -1;
}
if (cgraph->nodes[node_idx] == tensor) {
return i;
}
}
return -1;
}
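// Check whether the nodes at node_idxs (with the given ops, in that order)
// form a fusible subgraph: every intermediate must be consumed only inside the
// subgraph (unless listed in outputs), must not be flagged GGML_TENSOR_FLAG_OUTPUT,
// and any view chain must resolve to tensors within the subgraph.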
bool ggml_can_fuse_subgraph_ext(const struct ggml_cgraph * cgraph,
const int * node_idxs,
int count,
const enum ggml_op * ops,
const int * outputs,
int num_outputs) {
GGML_ASSERT(outputs && num_outputs > 0);
for (int i = 0; i < count; ++i) {
if (node_idxs[i] >= cgraph->n_nodes) {
return false;
}
const struct ggml_tensor * node = cgraph->nodes[node_idxs[i]];
if (node->op != ops[i]) {
return false;
}
if (ggml_node_list_find_tensor(cgraph, outputs, num_outputs, node) != -1) {
continue;
}
if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
return false;
}
int subgraph_uses = 0;
for (int j = i + 1; j < count; ++j) {
const struct ggml_tensor * other_node = cgraph->nodes[node_idxs[j]];
for (int src_idx = 0; src_idx < GGML_MAX_SRC; src_idx++) {
if (other_node->src[src_idx] == node) {
subgraph_uses++;
}
}
}
if (subgraph_uses != ggml_node_get_use_count(cgraph, node_idxs[i])) {
return false;
}
// if node is a view, check if the view_src and all its parent view_srcs are within the subgraph
struct ggml_tensor * view_src = node->view_src;
while (view_src) {
if (ggml_node_list_find_tensor(cgraph, node_idxs, count, view_src) == -1) {
return false;
}
view_src = view_src->view_src;
}
}
return true;
}
// check if node is part of the graph
static bool ggml_graph_find(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
if (cgraph == NULL) {

View File

@ -14,12 +14,12 @@ except ImportError:
SentencePieceProcessor = None
try:
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.tokens.tokenizers.tekken import Tekkenizer
from mistral_common.tokens.tokenizers.utils import (
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer # pyright: ignore[reportMissingImports]
from mistral_common.tokens.tokenizers.tekken import Tekkenizer # pyright: ignore[reportMissingImports]
from mistral_common.tokens.tokenizers.utils import ( # pyright: ignore[reportMissingImports]
_filter_valid_tokenizer_files,
)
from mistral_common.tokens.tokenizers.sentencepiece import (
from mistral_common.tokens.tokenizers.sentencepiece import ( # pyright: ignore[reportMissingImports]
SentencePieceTokenizer,
)
except ImportError:

View File

@ -1,5 +1,3 @@
mistral-common>=1.8.3
-r ./requirements-convert_legacy_llama.txt
--extra-index-url https://download.pytorch.org/whl/cpu

View File

@ -0,0 +1 @@
0xffff

View File

@ -0,0 +1,39 @@
#!/bin/sh
#
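# Runs llama-bench on an adb-attached device.
# Optional env overrides: B=branch subdir, S=adb serial, M=model file, D=device;
# OPMASK/NHVX/NDEV map to the corresponding GGML_HEXAGON_* variables.
# Hypothetical example: S=1A2B3C4D M=my-model.gguf D=HTP0 sh <this-script> -p 128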
# Basedir on device
basedir=/data/local/tmp/llama.cpp
branch=.
[ "$B" != "" ] && branch=$B
adbserial=
[ "$S" != "" ] && adbserial="-s $S"
model="Llama-3.2-3B-Instruct-Q4_0.gguf"
[ "$M" != "" ] && model="$M"
device="HTP0"
[ "$D" != "" ] && device="$D"
verbose=""
[ "$V" != "" ] && verbose="$V"
opmask=
[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
nhvx=
[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"
ndev=
[ "$NDEV" != "" ] && ndev="GGML_HEXAGON_NDEV=$NDEV"
set -x
adb $adbserial shell " \
cd $basedir; \
LD_LIBRARY_PATH=$basedir/$branch/lib \
ADSP_LIBRARY_PATH=$basedir/$branch/lib \
$ndev $nhvx $opmask ./$branch/bin/llama-bench --device $device --mmap 0 -m $basedir/../gguf/$model \
-t 4 --batch-size 128 -ngl 99 $@ \
"

View File

@ -0,0 +1,52 @@
#!/bin/sh
#
# Basedir on device
basedir=/data/local/tmp/llama.cpp
cli_opts=
branch=.
[ "$B" != "" ] && branch=$B
adbserial=
[ "$S" != "" ] && adbserial="-s $S"
model="Llama-3.2-3B-Instruct-Q4_0.gguf"
[ "$M" != "" ] && model="$M"
device="HTP0"
[ "$D" != "" ] && device="$D"
verbose=
[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V"
experimental=
[ "$E" != "" ] && experimental="GGML_HEXAGON_EXPERIMENTAL=$E"
sched=
[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"
profile=
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1"
opmask=
[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
nhvx=
[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"
ndev=
[ "$NDEV" != "" ] && ndev="GGML_HEXAGON_NDEV=$NDEV"
set -x
adb $adbserial shell " \
cd $basedir; ulimit -c unlimited; \
LD_LIBRARY_PATH=$basedir/$branch/lib \
ADSP_LIBRARY_PATH=$basedir/$branch/lib \
$verbose $experimental $sched $opmask $profile $nhvx $ndev \
./$branch/bin/llama-cli --no-mmap -m $basedir/../gguf/$model \
-t 4 --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on \
-ngl 99 --device $device $cli_opts $@ \
"

View File

@ -0,0 +1,51 @@
#!/bin/sh
#
# Basedir on device
basedir=/data/local/tmp/llama.cpp
cli_opts=
branch=.
[ "$B" != "" ] && branch=$B
adbserial=
[ "$S" != "" ] && adbserial="-s $S"
device="HTP0"
[ "$D" != "" ] && device="$D"
verbose=
[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V"
experimental=
[ "$E" != "" ] && experimental="GGML_HEXAGON_EXPERIMENTAL=$V"
sched=
[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"
profile=
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1"
opmask=
[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
nhvx=
[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"
ndev=
[ "$NDEV" != "" ] && ndev="GGML_HEXAGON_NDEV=$NDEV"
hb=
[ "$HB" != "" ] && hb="GGML_HEXAGON_HOSTBUF=$HB"
set -x
tool=$1; shift
adb $adbserial shell " \
cd $basedir; ulimit -c unlimited; \
LD_LIBRARY_PATH=$basedir/$branch/lib \
ADSP_LIBRARY_PATH=$basedir/$branch/lib \
$verbose $experimental $sched $opmask $profile $nhvx $ndev $hb ./$branch/bin/$tool $@ \
"

View File

@ -0,0 +1 @@
This directory contains pytest-based scripts for running CI jobs on Qualcomm Device Cloud (QDC).
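A typical local invocation (hypothetical; assumes an Android device reachable via adb and the llama.cpp package pushed to the device next to a ../gguf model directory): pip install -r requirements.txt && pytest -v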

View File

@ -0,0 +1,25 @@
Appium-Python-Client==5.2.4
attrs==25.4.0
certifi==2025.10.5
exceptiongroup==1.3.0
h11==0.16.0
idna==3.11
iniconfig==2.1.0
outcome==1.3.0.post0
packaging==25.0
pluggy==1.6.0
Pygments==2.19.2
PySocks==1.7.1
pytest==8.4.2
pytest-dependency==0.6.0
selenium==4.36.0
setuptools==80.9.0
sniffio==1.3.1
sortedcontainers==2.4.0
tomli==2.3.0
trio==0.31.0
trio-websocket==0.12.2
typing_extensions==4.15.0
urllib3==2.5.0
websocket-client==1.9.0
wsproto==1.2.0

View File

@ -0,0 +1,63 @@
import pytest
import subprocess
import sys
tmp_path='/data/local/tmp'
pkg_path=f'{tmp_path}/llama.cpp'
lib_path=f'{pkg_path}/lib'
bin_path=f'{pkg_path}/bin'
model='../gguf/Llama-3.2-1B-Instruct-Q4_0.gguf'
cli_pref=f'cd {pkg_path} && LD_LIBRARY_PATH={lib_path} ADSP_LIBRARY_PATH={lib_path} {bin_path}'
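# Run a host-side command, echo its combined stdout/stderr, and fail the test
# on a non-zero exit code.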
def run_cmd(cmd):
p = subprocess.run(cmd, text = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT)
sys.stdout.write(p.stdout)
assert(p.returncode == 0)
@pytest.mark.dependency()
def test_install():
run_cmd(['adb', 'push', 'llama.cpp', f'{tmp_path}'])
run_cmd(['adb', 'shell', f'chmod 755 {bin_path}/*'])
## Basic cli tests
def run_llama_cli(dev, opts):
prompt='what is the most popular cookie in the world?\nPlease provide a very brief bullet point summary.\nBegin your answer with **BEGIN**.'
opts = '--batch-size 128 -n 128 -no-cnv --seed 42 ' + opts
run_cmd(['adb', 'shell', f'{cli_pref}/llama-cli -m {model} --device {dev} -ngl 99 -t 4 {opts} -p "{prompt}"'])
@pytest.mark.dependency(depends=['test_install'])
def test_llama_cli_cpu():
run_llama_cli('none', '-ctk q8_0 -ctv q8_0 -fa on')
@pytest.mark.dependency(depends=['test_install'])
def test_llama_cli_gpu():
run_llama_cli('GPUOpenCL', '-fa on')
@pytest.mark.dependency(depends=['test_install'])
def test_llama_cli_npu():
run_llama_cli('HTP0', '-ctk q8_0 -ctv q8_0 -fa on')
## Basic bench tests
def run_llama_bench(dev):
run_cmd(['adb', 'shell', f'{cli_pref}/llama-bench -m {model} --device {dev} -ngl 99 --batch-size 128 -t 4 -p 128 -n 32'])
@pytest.mark.dependency(depends=['test_install'])
def test_llama_bench_cpu():
run_llama_bench('none')
def test_llama_bench_gpu():
run_llama_bench('GPUOpenCL')
def test_llama_bench_npu():
run_llama_bench('HTP0')

View File

@ -404,6 +404,19 @@ static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode s
// add the device default buffer type
buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
// add the device extra buffer type (if any)
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts");
if (ggml_backend_dev_get_extra_bufts_fn) {
ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(dev);
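// extra_bufts is a NULL-terminated array of backend-specific buffer types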
while (extra_bufts && *extra_bufts) {
buft_list.emplace_back(dev, *extra_bufts);
++extra_bufts;
}
}
return buft_list;
}
@ -17952,6 +17965,8 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cb(cur, "result_output", -1);

View File

@ -4669,14 +4669,21 @@ struct test_topk_moe: public test_case {
const std::array<int64_t, 4> ne;
const int n_expert_used;
const bool with_norm;
test_topk_moe(std::array<int64_t, 4> ne = {10, 5, 1, 1}, int n_expert_used = 1, bool with_norm = false)
: ne(ne), n_expert_used(n_expert_used), with_norm(with_norm) {
const bool delayed_softmax;
test_topk_moe(std::array<int64_t, 4> ne = { 10, 5, 1, 1 },
int n_expert_used = 1,
bool with_norm = false,
bool delayed_softmax = false) :
ne(ne),
n_expert_used(n_expert_used),
with_norm(with_norm),
delayed_softmax(delayed_softmax) {
GGML_ASSERT(n_expert_used <= ne[0]);
GGML_ASSERT(!(with_norm && delayed_softmax));
}
std::string vars() override {
return VARS_TO_STR3(ne, n_expert_used, with_norm);
}
std::string vars() override { return VARS_TO_STR4(ne, n_expert_used, with_norm, delayed_softmax); }
std::string op_desc(ggml_tensor * t) override {
GGML_UNUSED(t);
@ -4690,11 +4697,17 @@ struct test_topk_moe: public test_case {
const int n_tokens = ne[1];
ggml_tensor * logits = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne.data());
ggml_tensor * probs = ggml_soft_max(ctx, logits);
ggml_tensor * probs = delayed_softmax ? logits : ggml_soft_max(ctx, logits);
ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens]
ggml_tensor * out = ggml_get_rows(ctx, ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
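// delayed softmax: top-k selects on the raw logits and softmax is applied
// afterwards, restricted to the selected experts' scores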
if (delayed_softmax) {
out = ggml_reshape_2d(ctx, out, n_expert_used, n_tokens);
out = ggml_soft_max(ctx, out); // [n_expert_used, n_tokens]
out = ggml_reshape_3d(ctx, out, 1, n_expert_used, n_tokens);
}
if (with_norm) {
out = ggml_reshape_2d(ctx, out, n_expert_used, n_tokens);
ggml_tensor * weights_sum = ggml_sum_rows(ctx, out); // [1, n_tokens]
@ -6394,6 +6407,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
add_test_bin_bcast(type, {1, 1, 640, 1}, {32, 32, 1, 1});
add_test_bin_bcast(type, {5120, 1, 1, 1}, {1, 256, 1, 1});
add_test_bin_bcast(type, {640, 1, 1, 1}, {1, 1, 1, 1});
add_test_bin_bcast(type, {64, 262144, 1, 1}, {1, 1, 1, 1});
//add_test_bin_bcast(type, {3, 3, 2560, 1280}, {1, 1, 1, 1});
//add_test_bin_bcast(type, {3, 3, 2560, 1280}, {2, 1, 1, 1});
}
@ -6975,6 +6989,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_topk_moe({128, 1, 1, 1}, 128, with_norm));
}
test_cases.emplace_back(new test_topk_moe({ 8, 22, 1, 1 }, 4, /*with_norm*/ false, /*delayed_softmax*/ true));
test_cases.emplace_back(new test_topk_moe({ 32, 22, 1, 1 }, 8, /*with_norm*/ false, /*delayed_softmax*/ true));
#if 0
// these tests are disabled to save execution time, but they can be handy for debugging
test_cases.emplace_back(new test_llama(2, true));

View File

@ -3,6 +3,7 @@
// - Creates n_parallel (--parallel) contexts per model
// - Runs inference in parallel on each context
#include <array>
#include <thread>
#include <vector>
#include <atomic>
@ -38,13 +39,14 @@ int main(int argc, char ** argv) {
cparams.n_seq_max = 1;
int dev_count = ggml_backend_dev_count();
int gpu_dev_count = 0;
std::vector<std::array<ggml_backend_dev_t, 2>> gpus;
for (int i = 0; i < dev_count; ++i) {
auto * dev = ggml_backend_dev_get(i);
if (dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
gpu_dev_count++;
gpus.push_back({dev, nullptr});
}
}
const int gpu_dev_count = (int)gpus.size();
const int num_models = gpu_dev_count + 1 + 1; // GPUs + 1 CPU model + 1 layer split
//const int num_models = std::max(1, gpu_dev_count);
const int num_contexts = std::max(1, params.n_parallel);
@ -58,12 +60,12 @@ int main(int argc, char ** argv) {
if (m < gpu_dev_count) {
mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
mparams.main_gpu = m;
mparams.devices = gpus[m].data();
} else if (m == gpu_dev_count) {
mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
mparams.main_gpu = -1; // CPU model
} else {
mparams.split_mode = LLAMA_SPLIT_MODE_LAYER;;
mparams.split_mode = LLAMA_SPLIT_MODE_LAYER;
}
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);

View File

@ -6,3 +6,8 @@ target_compile_features(${TARGET} PRIVATE cxx_std_17)
if(LLAMA_TOOLS_INSTALL)
install(TARGETS ${TARGET} RUNTIME)
endif()
if (CMAKE_SYSTEM_NAME MATCHES "AIX")
# AIX's flock() function comes from libbsd.a
target_link_libraries(${TARGET} PRIVATE -lbsd)
endif()

View File

@ -76,9 +76,11 @@ struct mtmd_cli_context {
mtmd::bitmaps bitmaps;
// note: we know that the gemma3 template is "linear", meaning each turn is completely separated from the others
// so here we don't need to keep track of chat history
// chat template
common_chat_templates_ptr tmpls;
std::vector<common_chat_msg> chat_history;
bool use_jinja = false;
// TODO: support for --system-prompt with /clear command
// support for legacy templates (models not having EOT token)
llama_tokens antiprompt_tokens;
@ -108,6 +110,8 @@ struct mtmd_cli_context {
}
tmpls = common_chat_templates_init(model, params.chat_template);
use_jinja = params.use_jinja;
chat_history.clear();
LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(tmpls.get(), params.use_jinja, params.default_template_kwargs).c_str());
init_vision_context(params);
@ -193,19 +197,33 @@ static int generate_response(mtmd_cli_context & ctx, int n_predict) {
return 1;
}
}
std::string generated_text = common_detokenize(ctx.lctx, generated_tokens);
common_chat_msg msg;
msg.role = "assistant";
msg.content = generated_text;
ctx.chat_history.push_back(std::move(msg));
return 0;
}
static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, bool add_bos = false) {
common_chat_templates_inputs tmpl_inputs;
tmpl_inputs.messages = {msg};
tmpl_inputs.add_generation_prompt = true;
tmpl_inputs.use_jinja = false; // jinja is buggy here
auto formatted_chat = common_chat_templates_apply(ctx.tmpls.get(), tmpl_inputs);
LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.prompt.c_str());
static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg & new_msg) {
LOG_DBG("chat_add_and_format: new_msg.role='%s', new_msg.content='%s'\n",
new_msg.role.c_str(), new_msg.content.c_str());
auto formatted = common_chat_format_single(ctx.tmpls.get(), ctx.chat_history,
new_msg, new_msg.role == "user",
ctx.use_jinja);
ctx.chat_history.push_back(new_msg);
return formatted;
}
static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
bool add_bos = ctx.chat_history.empty();
auto formatted_chat = chat_add_and_format(ctx, msg);
LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str());
mtmd_input_text text;
text.text = formatted_chat.prompt.c_str();
text.text = formatted_chat.c_str();
text.add_special = add_bos;
text.parse_special = true;
@ -303,7 +321,7 @@ int main(int argc, char ** argv) {
return 1; // error is already printed by libmtmd
}
}
if (eval_message(ctx, msg, true)) {
if (eval_message(ctx, msg)) {
return 1;
}
if (!g_is_interrupted && generate_response(ctx, n_predict)) {
@ -322,7 +340,6 @@ int main(int argc, char ** argv) {
LOG("\n /quit or /exit exit the program");
LOG("\n");
bool is_first_msg = true;
std::string content;
while (!g_is_interrupted) {
@ -342,7 +359,8 @@ int main(int argc, char ** argv) {
}
if (line == "/clear") {
ctx.n_past = 0;
llama_memory_seq_rm(llama_get_memory(ctx.lctx), 0, 1, -1); // keep BOS
ctx.chat_history.clear();
llama_memory_clear(llama_get_memory(ctx.lctx), true);
LOG("Chat history cleared\n\n");
continue;
}
@ -367,7 +385,7 @@ int main(int argc, char ** argv) {
common_chat_msg msg;
msg.role = "user";
msg.content = content;
int ret = eval_message(ctx, msg, is_first_msg);
int ret = eval_message(ctx, msg);
if (ret) {
return 1;
}
@ -376,7 +394,6 @@ int main(int argc, char ** argv) {
return 1;
}
content.clear();
is_first_msg = false;
}
}
if (g_is_interrupted) LOG("\nInterrupted by user\n");

View File

@ -13,5 +13,11 @@ endif ()
if(LLAMA_TOOLS_INSTALL)
install(TARGETS ${TARGET} RUNTIME)
endif()
if (CMAKE_SYSTEM_NAME MATCHES "AIX")
# AIX's flock() function comes from libbsd.a
target_link_libraries(${TARGET} PRIVATE -lbsd)
endif()
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_RUN_EXTRA_LIBS})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

Binary file not shown.

View File

@ -2839,7 +2839,7 @@ struct server_context {
slot.generated_text.begin() + pos + stop_pos,
slot.generated_text.end());
pos = std::min(slot.n_sent_text, slot.generated_text.size());
} else if (slot.has_next_token) {
} else if (slot.has_next_token && !llama_vocab_is_eog(vocab, result.tok)) {
stop_pos = slot.find_stopping_strings(str_test, token_str.size(), false);
send_text = stop_pos == std::string::npos;
}
@ -5714,6 +5714,7 @@ int main(int argc, char ** argv) {
clean_up();
t.join();
llama_memory_breakdown_print(ctx_server.ctx);
return 0;
}

View File

@ -3,6 +3,8 @@
import { Button } from '$lib/components/ui/button';
import ChatFormActionFileAttachments from './ChatFormActionFileAttachments.svelte';
import ChatFormActionRecord from './ChatFormActionRecord.svelte';
import ChatFormModelSelector from './ChatFormModelSelector.svelte';
import { config } from '$lib/stores/settings.svelte';
import type { FileTypeCategory } from '$lib/enums/files';
interface Props {
@ -26,12 +28,17 @@
onMicClick,
onStop
}: Props = $props();
let currentConfig = $derived(config());
</script>
<div class="flex items-center justify-between gap-1 {className}">
<ChatFormActionFileAttachments {disabled} {onFileUpload} />
<div class="flex w-full items-center gap-2 {className}">
<ChatFormActionFileAttachments class="mr-auto" {disabled} {onFileUpload} />
{#if currentConfig.modelSelectorEnabled}
<ChatFormModelSelector class="shrink-0" />
{/if}
<div class="flex gap-2">
{#if isLoading}
<Button
type="button"
@ -54,4 +61,3 @@
</Button>
{/if}
</div>
</div>

View File

@ -0,0 +1,358 @@
<script lang="ts">
import { onMount, tick } from 'svelte';
import { ChevronDown, Loader2 } from '@lucide/svelte';
import { cn } from '$lib/components/ui/utils';
import { portalToBody } from '$lib/utils/portal-to-body';
import {
fetchModels,
modelOptions,
modelsError,
modelsLoading,
modelsUpdating,
selectModel,
selectedModelId
} from '$lib/stores/models.svelte';
import type { ModelOption } from '$lib/types/models';
interface Props {
class?: string;
}
let { class: className = '' }: Props = $props();
let options = $derived(modelOptions());
let loading = $derived(modelsLoading());
let updating = $derived(modelsUpdating());
let error = $derived(modelsError());
let activeId = $derived(selectedModelId());
let isMounted = $state(false);
let isOpen = $state(false);
let container: HTMLDivElement | null = null;
let triggerButton = $state<HTMLButtonElement | null>(null);
let menuRef = $state<HTMLDivElement | null>(null);
let menuPosition = $state<{
top: number;
left: number;
width: number;
placement: 'top' | 'bottom';
maxHeight: number;
} | null>(null);
let lockedWidth: number | null = null;
onMount(async () => {
try {
await fetchModels();
} catch (error) {
console.error('Unable to load models:', error);
} finally {
isMounted = true;
}
});
function handlePointerDown(event: PointerEvent) {
if (!container) return;
const target = event.target as Node | null;
if (target && !container.contains(target) && !(menuRef && menuRef.contains(target))) {
closeMenu();
}
}
function handleKeydown(event: KeyboardEvent) {
if (event.key === 'Escape') {
closeMenu();
}
}
function handleResize() {
if (isOpen) {
updateMenuPosition();
}
}
function handleScroll() {
if (isOpen) {
updateMenuPosition();
}
}
async function handleSelect(value: string | undefined) {
if (!value) return;
const option = options.find((item) => item.id === value);
if (!option) {
console.error('Model is no longer available');
return;
}
try {
await selectModel(option.id);
} catch (error) {
console.error('Failed to switch model:', error);
}
}
const VIEWPORT_GUTTER = 8;
const MENU_OFFSET = 6;
const MENU_MAX_WIDTH = 320;
async function openMenu() {
if (loading || updating) return;
isOpen = true;
await tick();
updateMenuPosition();
requestAnimationFrame(() => updateMenuPosition());
}
function toggleOpen() {
if (loading || updating) return;
if (isOpen) {
closeMenu();
} else {
void openMenu();
}
}
function closeMenu() {
if (!isOpen) return;
isOpen = false;
menuPosition = null;
lockedWidth = null;
}
async function handleOptionSelect(optionId: string) {
try {
await handleSelect(optionId);
} finally {
closeMenu();
}
}
$effect(() => {
if (loading || updating) {
closeMenu();
}
});
$effect(() => {
const optionCount = options.length;
if (!isOpen || optionCount <= 0) return;
queueMicrotask(() => updateMenuPosition());
});
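// Positioning strategy: measure the trigger rect, lock the menu width on first
// open (so it does not jitter on scroll/resize), then choose top or bottom
// placement based on which side fits more of the menu inside the viewport gutter.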
function updateMenuPosition() {
if (!isOpen || !triggerButton || !menuRef) return;
const triggerRect = triggerButton.getBoundingClientRect();
const viewportWidth = window.innerWidth;
const viewportHeight = window.innerHeight;
if (viewportWidth === 0 || viewportHeight === 0) return;
const scrollWidth = menuRef.scrollWidth;
const scrollHeight = menuRef.scrollHeight;
const availableWidth = Math.max(0, viewportWidth - VIEWPORT_GUTTER * 2);
const constrainedMaxWidth = Math.min(MENU_MAX_WIDTH, availableWidth || MENU_MAX_WIDTH);
const safeMaxWidth =
constrainedMaxWidth > 0 ? constrainedMaxWidth : Math.min(MENU_MAX_WIDTH, viewportWidth);
const desiredMinWidth = Math.min(160, safeMaxWidth || 160);
let width = lockedWidth;
if (width === null) {
const naturalWidth = Math.min(scrollWidth, safeMaxWidth);
const baseWidth = Math.max(triggerRect.width, naturalWidth, desiredMinWidth);
width = Math.min(baseWidth, safeMaxWidth || baseWidth);
lockedWidth = width;
} else {
width = Math.min(Math.max(width, desiredMinWidth), safeMaxWidth || width);
}
if (width > 0) {
menuRef.style.width = `${width}px`;
}
const availableBelow = Math.max(
0,
viewportHeight - VIEWPORT_GUTTER - triggerRect.bottom - MENU_OFFSET
);
const availableAbove = Math.max(0, triggerRect.top - VIEWPORT_GUTTER - MENU_OFFSET);
const viewportAllowance = Math.max(0, viewportHeight - VIEWPORT_GUTTER * 2);
const fallbackAllowance = Math.max(1, viewportAllowance > 0 ? viewportAllowance : scrollHeight);
function computePlacement(placement: 'top' | 'bottom') {
const available = placement === 'bottom' ? availableBelow : availableAbove;
const allowedHeight =
available > 0 ? Math.min(available, fallbackAllowance) : fallbackAllowance;
const maxHeight = Math.min(scrollHeight, allowedHeight);
const height = Math.max(0, maxHeight);
let top: number;
if (placement === 'bottom') {
const rawTop = triggerRect.bottom + MENU_OFFSET;
const minTop = VIEWPORT_GUTTER;
const maxTop = viewportHeight - VIEWPORT_GUTTER - height;
if (maxTop < minTop) {
top = minTop;
} else {
top = Math.min(Math.max(rawTop, minTop), maxTop);
}
} else {
const rawTop = triggerRect.top - MENU_OFFSET - height;
const minTop = VIEWPORT_GUTTER;
const maxTop = viewportHeight - VIEWPORT_GUTTER - height;
if (maxTop < minTop) {
top = minTop;
} else {
top = Math.max(Math.min(rawTop, maxTop), minTop);
}
}
return { placement, top, height, maxHeight };
}
const belowMetrics = computePlacement('bottom');
const aboveMetrics = computePlacement('top');
let metrics = belowMetrics;
if (scrollHeight > belowMetrics.maxHeight && aboveMetrics.maxHeight > belowMetrics.maxHeight) {
metrics = aboveMetrics;
}
menuRef.style.maxHeight = metrics.maxHeight > 0 ? `${Math.round(metrics.maxHeight)}px` : '';
let left = triggerRect.right - width;
const maxLeft = viewportWidth - VIEWPORT_GUTTER - width;
if (maxLeft < VIEWPORT_GUTTER) {
left = VIEWPORT_GUTTER;
} else {
if (left > maxLeft) {
left = maxLeft;
}
if (left < VIEWPORT_GUTTER) {
left = VIEWPORT_GUTTER;
}
}
menuPosition = {
top: Math.round(metrics.top),
left: Math.round(left),
width: Math.round(width),
placement: metrics.placement,
maxHeight: Math.round(metrics.maxHeight)
};
}
function getDisplayOption(): ModelOption | undefined {
if (activeId) {
return options.find((option) => option.id === activeId);
}
return options[0];
}
</script>
<svelte:window onresize={handleResize} onscroll={handleScroll} />
<svelte:document onpointerdown={handlePointerDown} onkeydown={handleKeydown} />
<div
class={cn('relative z-10 flex max-w-[200px] min-w-[120px] flex-col items-end gap-1', className)}
bind:this={container}
>
{#if loading && options.length === 0 && !isMounted}
<div class="flex items-center gap-2 text-xs text-muted-foreground">
<Loader2 class="h-4 w-4 animate-spin" />
Loading models…
</div>
{:else if options.length === 0}
<p class="text-xs text-muted-foreground">No models available.</p>
{:else}
{@const selectedOption = getDisplayOption()}
<div class="relative w-full">
<button
type="button"
class={cn(
'flex w-full items-center justify-end gap-2 rounded-md px-2 py-1 text-sm text-muted-foreground transition hover:text-foreground focus:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:cursor-not-allowed disabled:opacity-60',
isOpen ? 'text-foreground' : ''
)}
aria-haspopup="listbox"
aria-expanded={isOpen}
onclick={toggleOpen}
bind:this={triggerButton}
disabled={loading || updating}
>
<span class="max-w-[160px] truncate text-right font-medium">
{selectedOption?.name || 'Select model'}
</span>
{#if updating}
<Loader2 class="h-3.5 w-3.5 animate-spin text-muted-foreground" />
{:else}
<ChevronDown
class={cn(
'h-4 w-4 text-muted-foreground transition-transform',
isOpen ? 'rotate-180 text-foreground' : ''
)}
/>
{/if}
</button>
{#if isOpen}
<div
bind:this={menuRef}
use:portalToBody
class={cn(
'fixed z-[1000] overflow-hidden rounded-md border bg-popover shadow-lg transition-opacity',
menuPosition ? 'opacity-100' : 'pointer-events-none opacity-0'
)}
role="listbox"
style:top={menuPosition ? `${menuPosition.top}px` : undefined}
style:left={menuPosition ? `${menuPosition.left}px` : undefined}
style:width={menuPosition ? `${menuPosition.width}px` : undefined}
data-placement={menuPosition?.placement ?? 'bottom'}
>
<div
class="overflow-y-auto py-1"
style:max-height={menuPosition && menuPosition.maxHeight > 0
? `${menuPosition.maxHeight}px`
: undefined}
>
{#each options as option (option.id)}
<button
type="button"
class={cn(
'flex w-full flex-col items-start gap-0.5 px-3 py-2 text-left text-sm transition hover:bg-muted focus:bg-muted focus:outline-none',
option.id === selectedOption?.id ? 'bg-accent text-accent-foreground' : ''
)}
role="option"
aria-selected={option.id === selectedOption?.id}
onclick={() => handleOptionSelect(option.id)}
>
<span class="block w-full truncate font-medium" title={option.name}>
{option.name}
</span>
{#if option.description}
<span class="text-xs text-muted-foreground">{option.description}</span>
{/if}
</button>
{/each}
</div>
</div>
{/if}
</div>
{/if}
{#if error}
<p class="text-xs text-destructive">{error}</p>
{/if}
</div>

View File

@ -10,6 +10,7 @@
import ChatMessageActions from './ChatMessageActions.svelte';
import Label from '$lib/components/ui/label/label.svelte';
import { config } from '$lib/stores/settings.svelte';
import { modelName as serverModelName } from '$lib/stores/server.svelte';
import { copyToClipboard } from '$lib/utils/copy';
interface Props {
@ -70,6 +71,23 @@
}: Props = $props();
const processingState = useProcessingState();
let currentConfig = $derived(config());
let serverModel = $derived(serverModelName());
let displayedModel = $derived((): string | null => {
if (!currentConfig.showModelInfo) return null;
if (currentConfig.modelSelectorEnabled) {
return message.model ?? null;
}
return serverModel;
});
function handleCopyModel() {
const model = displayedModel();
void copyToClipboard(model ?? '');
}
</script>
<div
@ -142,7 +160,7 @@
</div>
{/if}
{#if config().showModelInfo && message.model}
{#if displayedModel()}
<span class="mt-6 mb-4 inline-flex items-center gap-1 text-xs text-muted-foreground">
<Package class="h-3.5 w-3.5" />
@ -150,9 +168,9 @@
<button
class="inline-flex cursor-pointer items-center gap-1 rounded-sm bg-muted-foreground/15 px-1.5 py-0.75"
onclick={() => copyToClipboard(message.model)}
onclick={handleCopyModel}
>
{message.model}
{displayedModel()}
<Copy class="ml-1 h-3 w-3 " />
</button>

View File

@ -216,6 +216,11 @@
title: 'Developer',
icon: Code,
fields: [
{
key: 'modelSelectorEnabled',
label: 'Enable model selector',
type: 'checkbox'
},
{
key: 'disableReasoningFormat',
label: 'Show raw LLM output',

View File

@ -8,6 +8,7 @@ export { default as ChatFormTextarea } from './chat/ChatForm/ChatFormTextarea.sv
export { default as ChatFormActions } from './chat/ChatForm/ChatFormActions.svelte';
export { default as ChatFormActionFileAttachments } from './chat/ChatForm/ChatFormActionFileAttachments.svelte';
export { default as ChatFormActionRecord } from './chat/ChatForm/ChatFormActionRecord.svelte';
export { default as ChatFormModelSelector } from './chat/ChatForm/ChatFormModelSelector.svelte';
export { default as ChatFormHelperText } from './chat/ChatForm/ChatFormHelperText.svelte';
export { default as ChatFormFileInputInvisible } from './chat/ChatForm/ChatFormFileInputInvisible.svelte';
@ -32,7 +33,6 @@ export { default as ParameterSourceIndicator } from './chat/ChatSettings/Paramet
export { default as ChatSidebar } from './chat/ChatSidebar/ChatSidebar.svelte';
export { default as ChatSidebarConversationItem } from './chat/ChatSidebar/ChatSidebarConversationItem.svelte';
export { default as ChatSidebarSearch } from './chat/ChatSidebar/ChatSidebarSearch.svelte';
export { default as ChatErrorDialog } from './dialogs/ChatErrorDialog.svelte';
export { default as EmptyFileAlertDialog } from './dialogs/EmptyFileAlertDialog.svelte';

View File

@ -8,22 +8,33 @@
class: className,
children,
size = 'default',
variant = 'default',
...restProps
}: WithoutChild<SelectPrimitive.TriggerProps> & {
size?: 'sm' | 'default';
variant?: 'default' | 'plain';
} = $props();
const baseClasses = $derived(
variant === 'plain'
? "group inline-flex w-full items-center justify-end gap-2 whitespace-nowrap px-0 py-0 text-sm font-medium text-muted-foreground transition-colors focus-visible:outline-none focus-visible:ring-0 focus-visible:ring-offset-0 disabled:cursor-not-allowed disabled:opacity-50 data-[placeholder]:text-muted-foreground data-[size=default]:h-9 data-[size=sm]:h-8 [&_svg]:pointer-events-none [&_svg]:shrink-0 [&_svg:not([class*='size-'])]:size-3 [&_svg:not([class*='text-'])]:text-muted-foreground"
: "flex w-fit items-center justify-between gap-2 rounded-md border border-input bg-transparent px-3 py-2 text-sm whitespace-nowrap shadow-xs transition-[color,box-shadow] outline-none select-none focus-visible:border-ring focus-visible:ring-[3px] focus-visible:ring-ring/50 disabled:cursor-not-allowed disabled:opacity-50 aria-invalid:border-destructive aria-invalid:ring-destructive/20 data-[placeholder]:text-muted-foreground data-[size=default]:h-9 data-[size=sm]:h-8 *:data-[slot=select-value]:line-clamp-1 *:data-[slot=select-value]:flex *:data-[slot=select-value]:items-center *:data-[slot=select-value]:gap-2 dark:bg-input/30 dark:hover:bg-input/50 dark:aria-invalid:ring-destructive/40 [&_svg]:pointer-events-none [&_svg]:shrink-0 [&_svg:not([class*='size-'])]:size-4 [&_svg:not([class*='text-'])]:text-muted-foreground"
);
const chevronClasses = $derived(
variant === 'plain'
? 'size-3 opacity-60 transition-transform group-data-[state=open]:-rotate-180'
: 'size-4 opacity-50'
);
</script>
<SelectPrimitive.Trigger
bind:ref
data-slot="select-trigger"
data-size={size}
class={cn(
"flex w-fit items-center justify-between gap-2 rounded-md border border-input bg-transparent px-3 py-2 text-sm whitespace-nowrap shadow-xs transition-[color,box-shadow] outline-none select-none focus-visible:border-ring focus-visible:ring-[3px] focus-visible:ring-ring/50 disabled:cursor-not-allowed disabled:opacity-50 aria-invalid:border-destructive aria-invalid:ring-destructive/20 data-[placeholder]:text-muted-foreground data-[size=default]:h-9 data-[size=sm]:h-8 *:data-[slot=select-value]:line-clamp-1 *:data-[slot=select-value]:flex *:data-[slot=select-value]:items-center *:data-[slot=select-value]:gap-2 dark:bg-input/30 dark:hover:bg-input/50 dark:aria-invalid:ring-destructive/40 [&_svg]:pointer-events-none [&_svg]:shrink-0 [&_svg:not([class*='size-'])]:size-4 [&_svg:not([class*='text-'])]:text-muted-foreground",
className
)}
class={cn(baseClasses, className)}
{...restProps}
>
{@render children?.()}
<ChevronDownIcon class="size-4 opacity-50" />
<ChevronDownIcon class={chevronClasses} />
</SelectPrimitive.Trigger>

View File

@ -1 +1,2 @@
export const SERVER_PROPS_LOCALSTORAGE_KEY = 'LlamaCppWebui.serverProps';
export const SELECTED_MODEL_LOCALSTORAGE_KEY = 'LlamaCppWebui.selectedModel';

View File

@ -13,6 +13,7 @@ export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean> =
pdfAsImage: false,
showModelInfo: false,
renderUserContentAsMarkdown: false,
modelSelectorEnabled: false,
// make sure these default values are in sync with `common.h`
samplers: 'top_k;typ_p;top_p;min_p;temperature',
temperature: 0.8,
@ -86,6 +87,8 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
pdfAsImage: 'Parse PDF as image instead of text (requires vision-capable model).',
showModelInfo: 'Display the model name used to generate each message below the message content.',
renderUserContentAsMarkdown: 'Render user messages using markdown formatting in the chat.',
modelSelectorEnabled:
'Enable the model selector in the chat input to choose the inference model. Sends the associated model field in API requests.',
pyInterpreterEnabled:
'Enable Python interpreter using Pyodide. Allows running Python code in markdown code blocks.'
};

View File

@ -1,4 +1,5 @@
import { config } from '$lib/stores/settings.svelte';
import { selectedModelName } from '$lib/stores/models.svelte';
import { slotsService } from './slots';
/**
* ChatService - Low-level API communication layer for llama.cpp server interactions
@ -51,6 +52,8 @@ export class ChatService {
onChunk,
onComplete,
onError,
onReasoningChunk,
onModel,
// Generation parameters
temperature,
max_tokens,
@ -118,6 +121,13 @@ export class ChatService {
stream
};
const modelSelectorEnabled = Boolean(currentConfig.modelSelectorEnabled);
const activeModel = modelSelectorEnabled ? selectedModelName() : null;
if (modelSelectorEnabled && activeModel) {
requestBody.model = activeModel;
}
requestBody.reasoning_format = currentConfig.disableReasoningFormat ? 'none' : 'auto';
if (temperature !== undefined) requestBody.temperature = temperature;
@ -189,13 +199,14 @@ export class ChatService {
onChunk,
onComplete,
onError,
options.onReasoningChunk,
onReasoningChunk,
onModel,
conversationId,
abortController.signal
);
return;
} else {
return this.handleNonStreamResponse(response, onComplete, onError);
return this.handleNonStreamResponse(response, onComplete, onError, onModel);
}
} catch (error) {
if (error instanceof Error && error.name === 'AbortError') {
@ -255,6 +266,7 @@ export class ChatService {
) => void,
onError?: (error: Error) => void,
onReasoningChunk?: (chunk: string) => void,
onModel?: (model: string) => void,
conversationId?: string,
abortSignal?: AbortSignal
): Promise<void> {
@ -270,6 +282,7 @@ export class ChatService {
let hasReceivedData = false;
let lastTimings: ChatMessageTimings | undefined;
let streamFinished = false;
let modelEmitted = false;
try {
let chunk = '';
@ -298,6 +311,12 @@ export class ChatService {
try {
const parsed: ApiChatCompletionStreamChunk = JSON.parse(data);
const chunkModel = this.extractModelName(parsed);
if (chunkModel && !modelEmitted) {
modelEmitted = true;
onModel?.(chunkModel);
}
const content = parsed.choices[0]?.delta?.content;
const reasoningContent = parsed.choices[0]?.delta?.reasoning_content;
const timings = parsed.timings;
@ -372,7 +391,8 @@ export class ChatService {
reasoningContent?: string,
timings?: ChatMessageTimings
) => void,
onError?: (error: Error) => void
onError?: (error: Error) => void,
onModel?: (model: string) => void
): Promise<string> {
try {
const responseText = await response.text();
@ -383,6 +403,12 @@ export class ChatService {
}
const data: ApiChatCompletionResponse = JSON.parse(responseText);
const responseModel = this.extractModelName(data);
if (responseModel) {
onModel?.(responseModel);
}
const content = data.choices[0]?.message?.content || '';
const reasoningContent = data.choices[0]?.message?.reasoning_content;
@ -625,6 +651,39 @@ export class ChatService {
}
}
private extractModelName(data: unknown): string | undefined {
const asRecord = (value: unknown): Record<string, unknown> | undefined => {
return typeof value === 'object' && value !== null
? (value as Record<string, unknown>)
: undefined;
};
const getTrimmedString = (value: unknown): string | undefined => {
return typeof value === 'string' && value.trim() ? value.trim() : undefined;
};
const root = asRecord(data);
if (!root) return undefined;
// 1) root (some implementations provide `model` at the top level)
const rootModel = getTrimmedString(root.model);
if (rootModel) return rootModel;
// 2) streaming choice (delta) or final response (message)
const firstChoice = Array.isArray(root.choices) ? asRecord(root.choices[0]) : undefined;
if (!firstChoice) return undefined;
// priority: delta.model (first chunk) else message.model (final response)
const deltaModel = getTrimmedString(asRecord(firstChoice.delta)?.model);
if (deltaModel) return deltaModel;
const messageModel = getTrimmedString(asRecord(firstChoice.message)?.model);
if (messageModel) return messageModel;
// avoid guessing from non-standard locations (metadata, etc.)
return undefined;
}
private updateProcessingState(
timings?: ChatMessageTimings,
promptProgress?: ChatMessagePromptProgress,

View File

@ -0,0 +1,22 @@
import { base } from '$app/paths';
import { config } from '$lib/stores/settings.svelte';
import type { ApiModelListResponse } from '$lib/types/api';
export class ModelsService {
static async list(): Promise<ApiModelListResponse> {
const currentConfig = config();
const apiKey = currentConfig.apiKey?.toString().trim();
const response = await fetch(`${base}/v1/models`, {
headers: {
...(apiKey ? { Authorization: `Bearer ${apiKey}` } : {})
}
});
if (!response.ok) {
throw new Error(`Failed to fetch model list (status ${response.status})`);
}
return response.json() as Promise<ApiModelListResponse>;
}
}

View File

@ -1,7 +1,7 @@
import { DatabaseStore } from '$lib/stores/database';
import { chatService, slotsService } from '$lib/services';
import { serverStore } from '$lib/stores/server.svelte';
import { config } from '$lib/stores/settings.svelte';
import { normalizeModelName } from '$lib/utils/model-names';
import { filterByLeafNodeId, findLeafNode, findDescendantMessages } from '$lib/utils/branching';
import { browser } from '$app/environment';
import { goto } from '$app/navigation';
@ -359,28 +359,33 @@ class ChatStore {
): Promise<void> {
let streamedContent = '';
let streamedReasoningContent = '';
let modelCaptured = false;
const captureModelIfNeeded = (updateDbImmediately = true): string | undefined => {
if (!modelCaptured) {
const currentModelName = serverStore.modelName;
let resolvedModel: string | null = null;
let modelPersisted = false;
if (currentModelName) {
if (updateDbImmediately) {
DatabaseStore.updateMessage(assistantMessage.id, { model: currentModelName }).catch(
console.error
);
const recordModel = (modelName: string, persistImmediately = true): void => {
const normalizedModel = normalizeModelName(modelName);
if (!normalizedModel || normalizedModel === resolvedModel) {
return;
}
resolvedModel = normalizedModel;
const messageIndex = this.findMessageIndex(assistantMessage.id);
this.updateMessageAtIndex(messageIndex, { model: currentModelName });
modelCaptured = true;
this.updateMessageAtIndex(messageIndex, { model: normalizedModel });
return currentModelName;
if (persistImmediately && !modelPersisted) {
modelPersisted = true;
DatabaseStore.updateMessage(assistantMessage.id, { model: normalizedModel }).catch(
(error) => {
console.error('Failed to persist model name:', error);
modelPersisted = false;
resolvedModel = null;
}
);
}
return undefined;
};
slotsService.startStreaming();
@ -399,7 +404,6 @@ class ChatStore {
assistantMessage.id
);
captureModelIfNeeded();
const messageIndex = this.findMessageIndex(assistantMessage.id);
this.updateMessageAtIndex(messageIndex, {
content: streamedContent
@ -409,13 +413,15 @@ class ChatStore {
onReasoningChunk: (reasoningChunk: string) => {
streamedReasoningContent += reasoningChunk;
captureModelIfNeeded();
const messageIndex = this.findMessageIndex(assistantMessage.id);
this.updateMessageAtIndex(messageIndex, { thinking: streamedReasoningContent });
},
onModel: (modelName: string) => {
recordModel(modelName);
},
onComplete: async (
finalContent?: string,
reasoningContent?: string,
@ -434,10 +440,9 @@ class ChatStore {
timings: timings
};
const capturedModel = captureModelIfNeeded(false);
if (capturedModel) {
updateData.model = capturedModel;
if (resolvedModel && !modelPersisted) {
updateData.model = resolvedModel;
modelPersisted = true;
}
await DatabaseStore.updateMessage(assistantMessage.id, updateData);
@ -565,7 +570,8 @@ class ChatStore {
content: '',
timestamp: Date.now(),
thinking: '',
children: []
children: [],
model: null
},
parentId || null
);
@ -1533,7 +1539,8 @@ class ChatStore {
role: 'assistant',
content: '',
thinking: '',
children: []
children: [],
model: null
},
parentMessage.id
);
@ -1590,7 +1597,8 @@ class ChatStore {
role: 'assistant',
content: '',
thinking: '',
children: []
children: [],
model: null
},
userMessageId
);

View File

@ -0,0 +1,187 @@
import { ModelsService } from '$lib/services/models';
import { persisted } from '$lib/stores/persisted.svelte';
import { SELECTED_MODEL_LOCALSTORAGE_KEY } from '$lib/constants/localstorage-keys';
import type { ModelOption } from '$lib/types/models';
type PersistedModelSelection = {
id: string;
model: string;
};
class ModelsStore {
private _models = $state<ModelOption[]>([]);
private _loading = $state(false);
private _updating = $state(false);
private _error = $state<string | null>(null);
private _selectedModelId = $state<string | null>(null);
private _selectedModelName = $state<string | null>(null);
private _persistedSelection = persisted<PersistedModelSelection | null>(
SELECTED_MODEL_LOCALSTORAGE_KEY,
null
);
constructor() {
const persisted = this._persistedSelection.value;
if (persisted) {
this._selectedModelId = persisted.id;
this._selectedModelName = persisted.model;
}
}
get models(): ModelOption[] {
return this._models;
}
get loading(): boolean {
return this._loading;
}
get updating(): boolean {
return this._updating;
}
get error(): string | null {
return this._error;
}
get selectedModelId(): string | null {
return this._selectedModelId;
}
get selectedModelName(): string | null {
return this._selectedModelName;
}
get selectedModel(): ModelOption | null {
if (!this._selectedModelId) {
return null;
}
return this._models.find((model) => model.id === this._selectedModelId) ?? null;
}
async fetch(force = false): Promise<void> {
if (this._loading) return;
if (this._models.length > 0 && !force) return;
this._loading = true;
this._error = null;
try {
const response = await ModelsService.list();
const models: ModelOption[] = response.data.map((item, index) => {
const details = response.models?.[index];
const rawCapabilities = Array.isArray(details?.capabilities) ? details?.capabilities : [];
const displayNameSource =
details?.name && details.name.trim().length > 0 ? details.name : item.id;
const displayName = this.toDisplayName(displayNameSource);
return {
id: item.id,
name: displayName,
model: details?.model || item.id,
description: details?.description,
capabilities: rawCapabilities.filter((value): value is string => Boolean(value)),
details: details?.details,
meta: item.meta ?? null
} satisfies ModelOption;
});
this._models = models;
const selection = this.determineInitialSelection(models);
this._selectedModelId = selection.id;
this._selectedModelName = selection.model;
this._persistedSelection.value =
selection.id && selection.model ? { id: selection.id, model: selection.model } : null;
} catch (error) {
this._models = [];
this._error = error instanceof Error ? error.message : 'Failed to load models';
throw error;
} finally {
this._loading = false;
}
}
async select(modelId: string): Promise<void> {
if (!modelId || this._updating) {
return;
}
if (this._selectedModelId === modelId) {
return;
}
const option = this._models.find((model) => model.id === modelId);
if (!option) {
throw new Error('Selected model is not available');
}
this._updating = true;
this._error = null;
try {
this._selectedModelId = option.id;
this._selectedModelName = option.model;
this._persistedSelection.value = { id: option.id, model: option.model };
} finally {
this._updating = false;
}
}
private toDisplayName(id: string): string {
const segments = id.split(/\\|\//);
const candidate = segments.pop();
return candidate && candidate.trim().length > 0 ? candidate : id;
}
/**
* Determines which model should be selected after fetching the models list.
* Priority: current selection > persisted selection > first available model > none
*/
private determineInitialSelection(models: ModelOption[]): {
id: string | null;
model: string | null;
} {
const persisted = this._persistedSelection.value;
let nextSelectionId = this._selectedModelId ?? persisted?.id ?? null;
let nextSelectionName = this._selectedModelName ?? persisted?.model ?? null;
if (nextSelectionId) {
const match = models.find((m) => m.id === nextSelectionId);
if (match) {
nextSelectionId = match.id;
nextSelectionName = match.model;
} else if (models[0]) {
nextSelectionId = models[0].id;
nextSelectionName = models[0].model;
} else {
nextSelectionId = null;
nextSelectionName = null;
}
} else if (models[0]) {
nextSelectionId = models[0].id;
nextSelectionName = models[0].model;
}
return { id: nextSelectionId, model: nextSelectionName };
}
}
export const modelsStore = new ModelsStore();
export const modelOptions = () => modelsStore.models;
export const modelsLoading = () => modelsStore.loading;
export const modelsUpdating = () => modelsStore.updating;
export const modelsError = () => modelsStore.error;
export const selectedModelId = () => modelsStore.selectedModelId;
export const selectedModelName = () => modelsStore.selectedModelName;
export const selectedModelOption = () => modelsStore.selectedModel;
export const fetchModels = modelsStore.fetch.bind(modelsStore);
export const selectModel = modelsStore.select.bind(modelsStore);

View File

@ -0,0 +1,50 @@
import { browser } from '$app/environment';
type PersistedValue<T> = {
get value(): T;
set value(newValue: T);
};
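/**
 * Minimal localStorage-backed box: reads the stored value once at creation,
 * writes through on every set, and removes the key when set to null/undefined.
 * Usage: const selection = persisted('my-key', null);
 */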
export function persisted<T>(key: string, initialValue: T): PersistedValue<T> {
let value = initialValue;
if (browser) {
try {
const stored = localStorage.getItem(key);
if (stored !== null) {
value = JSON.parse(stored) as T;
}
} catch (error) {
console.warn(`Failed to load ${key}:`, error);
}
}
const persist = (next: T) => {
if (!browser) {
return;
}
try {
if (next === null || next === undefined) {
localStorage.removeItem(key);
return;
}
localStorage.setItem(key, JSON.stringify(next));
} catch (error) {
console.warn(`Failed to persist ${key}:`, error);
}
};
return {
get value() {
return value;
},
set value(newValue: T) {
value = newValue;
persist(newValue);
}
};
}

View File

@ -80,7 +80,8 @@ class SettingsStore {
if (!browser) return;
try {
const savedVal = JSON.parse(localStorage.getItem('config') || '{}');
const storedConfigRaw = localStorage.getItem('config');
const savedVal = JSON.parse(storedConfigRaw || '{}');
// Merge with defaults to prevent breaking changes
this.config = {

View File

@ -36,6 +36,41 @@ export interface ApiChatMessageData {
timestamp?: number;
}
export interface ApiModelDataEntry {
id: string;
object: string;
created: number;
owned_by: string;
meta?: Record<string, unknown> | null;
}
export interface ApiModelDetails {
name: string;
model: string;
modified_at?: string;
size?: string | number;
digest?: string;
type?: string;
description?: string;
tags?: string[];
capabilities?: string[];
parameters?: string;
details?: {
parent_model?: string;
format?: string;
family?: string;
families?: string[];
parameter_size?: string;
quantization_level?: string;
};
}
export interface ApiModelListResponse {
object: string;
data: ApiModelDataEntry[];
models?: ApiModelDetails[];
}
export interface ApiLlamaCppServerProps {
default_generation_settings: {
id: number;
@ -120,6 +155,7 @@ export interface ApiChatCompletionRequest {
content: string | ApiChatMessageContentPart[];
}>;
stream?: boolean;
model?: string;
// Reasoning parameters
reasoning_format?: string;
// Generation parameters
@ -150,10 +186,14 @@ export interface ApiChatCompletionRequest {
}
export interface ApiChatCompletionStreamChunk {
model?: string;
choices: Array<{
model?: string;
metadata?: { model?: string };
delta: {
content?: string;
reasoning_content?: string;
model?: string;
};
}>;
timings?: {
@ -167,10 +207,14 @@ export interface ApiChatCompletionStreamChunk {
}
export interface ApiChatCompletionResponse {
model?: string;
choices: Array<{
model?: string;
metadata?: { model?: string };
message: {
content: string;
reasoning_content?: string;
model?: string;
};
}>;
}

View File

@ -0,0 +1,11 @@
import type { ApiModelDataEntry, ApiModelDetails } from '$lib/types/api';
export interface ModelOption {
id: string;
name: string;
model: string;
description?: string;
capabilities: string[];
details?: ApiModelDetails['details'];
meta?: ApiModelDataEntry['meta'];
}

View File

@ -41,6 +41,7 @@ export interface SettingsChatServiceOptions {
// Callbacks
onChunk?: (chunk: string) => void;
onReasoningChunk?: (chunk: string) => void;
onModel?: (model: string) => void;
onComplete?: (response: string, reasoningContent?: string, timings?: ChatMessageTimings) => void;
onError?: (error: Error) => void;
}

View File

@ -0,0 +1,44 @@
import { describe, expect, it } from 'vitest';
import { isValidModelName, normalizeModelName } from './model-names';
describe('normalizeModelName', () => {
it('extracts filename from forward slash path', () => {
expect(normalizeModelName('models/model-name-1')).toBe('model-name-1');
expect(normalizeModelName('path/to/model/model-name-2')).toBe('model-name-2');
});
it('extracts filename from backslash path', () => {
expect(normalizeModelName('C\\Models\\model-name-1')).toBe('model-name-1');
expect(normalizeModelName('path\\to\\model\\model-name-2')).toBe('model-name-2');
});
it('handles mixed path separators', () => {
expect(normalizeModelName('path/to\\model/model-name-2')).toBe('model-name-2');
});
it('returns simple names as-is', () => {
expect(normalizeModelName('simple-model')).toBe('simple-model');
expect(normalizeModelName('model-name-2')).toBe('model-name-2');
});
it('trims whitespace', () => {
expect(normalizeModelName(' model-name ')).toBe('model-name');
});
it('returns empty string for empty input', () => {
expect(normalizeModelName('')).toBe('');
expect(normalizeModelName(' ')).toBe('');
});
});
describe('isValidModelName', () => {
it('returns true for valid names', () => {
expect(isValidModelName('model')).toBe(true);
expect(isValidModelName('path/to/model.bin')).toBe(true);
});
it('returns false for empty values', () => {
expect(isValidModelName('')).toBe(false);
expect(isValidModelName(' ')).toBe(false);
});
});

View File

@ -0,0 +1,39 @@
/**
* Normalizes a model name by extracting the filename from a path.
*
* Handles both forward slashes (/) and backslashes (\) as path separators.
* If the model name is just a filename (no path), returns it as-is.
*
* @param modelName - The model name or path to normalize
* @returns The normalized model name (filename only)
*
* @example
* normalizeModelName('models/llama-3.1-8b') // Returns: 'llama-3.1-8b'
* normalizeModelName('C:\\Models\\gpt-4') // Returns: 'gpt-4'
* normalizeModelName('simple-model') // Returns: 'simple-model'
* normalizeModelName(' spaced ') // Returns: 'spaced'
* normalizeModelName('') // Returns: ''
*/
export function normalizeModelName(modelName: string): string {
const trimmed = modelName.trim();
if (!trimmed) {
return '';
}
const segments = trimmed.split(/[\\/]/);
const candidate = segments.pop();
const normalized = candidate?.trim();
return normalized && normalized.length > 0 ? normalized : trimmed;
}
/**
* Validates if a model name is valid (non-empty after normalization).
*
* @param modelName - The model name to validate
* @returns true if valid, false otherwise
*/
export function isValidModelName(modelName: string): boolean {
return normalizeModelName(modelName).length > 0;
}

View File

@ -0,0 +1,20 @@
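/**
 * Svelte action that reparents the node under document.body so fixed-position
 * overlays escape ancestor overflow/stacking contexts; destroy() detaches it.
 */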
export function portalToBody(node: HTMLElement) {
if (typeof document === 'undefined') {
return;
}
const target = document.body;
if (!target) {
return;
}
target.appendChild(node);
return {
destroy() {
if (node.parentNode === target) {
target.removeChild(node);
}
}
};
}

View File

@ -165,10 +165,10 @@
</Sidebar.Root>
<Sidebar.Trigger
class="transition-left absolute h-8 w-8 duration-200 ease-linear {sidebarOpen
class="transition-left absolute left-0 z-[900] h-8 w-8 duration-200 ease-linear {sidebarOpen
? 'md:left-[var(--sidebar-width)]'
: 'left-0'}"
style="translate: 1rem 1rem; z-index: 99999;"
: ''}"
style="translate: 1rem 1rem;"
/>
<Sidebar.Inset class="flex flex-1 flex-col overflow-hidden">

Some files were not shown because too many files have changed in this diff.