diff --git a/compression/nuq-inl.h b/compression/nuq-inl.h index 997bb5b..ce387f9 100644 --- a/compression/nuq-inl.h +++ b/compression/nuq-inl.h @@ -480,9 +480,12 @@ class NibbleCodec { static_assert(kHalf <= 1); const size_t N = hn::Lanes(d8); constexpr size_t kMaxN = hn::MaxLanes(d8); + constexpr bool kPermuteAcrossBlocks = + HWY_TARGET <= HWY_AVX3_DL || !HWY_ARCH_X86; // For kHalf=1 and 512-bit vectors, kAdd would be 16, which is out of // bounds for TableLookupBytes. We instead BroadcastBlock<1> there. - constexpr uint8_t kAdd = kMaxN < 64 ? kHalf * kMaxN / 4 : 0; + constexpr uint8_t kAdd = + kMaxN < 64 || kPermuteAcrossBlocks ? kHalf * kMaxN / 4 : 0; // The only performance-portable op to replicate bytes is TableLookupBytes, // but this only works if vectors are 128-bit or we first BroadcastBlock, // which only works for <= 512-bit vectors. For scalable vectors, we @@ -506,7 +509,7 @@ class NibbleCodec { } else if constexpr (kMaxN <= 16) { // <= 128-bit // No BroadcastBlock, we anyway only have one block. return hn::TableLookupBytes(bytes, hn::Load(d8, kRep4)); - } else if constexpr (HWY_TARGET <= HWY_AVX3_DL || !HWY_ARCH_X86) { + } else if constexpr (kPermuteAcrossBlocks) { // No BroadcastBlock, can directly permute across blocks. return hn::TableLookupLanes(bytes, hn::SetTableIndices(d8, kRep4)); } else { // 256..512-bit, no efficient TableLookupLanes