diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 9f93c70d21..7d7f20af3a 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -802,7 +802,13 @@ static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) { static __device__ __forceinline__ float ggml_cuda_ue4m3_to_fp32(uint8_t x) { #ifdef FP8_AVAILABLE const uint32_t bits = x * (x != 0x7F && x != 0xFF); // Convert NaN to 0.0f to match CPU implementation. +#if defined(GGML_USE_HIP) && defined(CDNA3) + // ROCm does not support fp8 in software on devices with fp8 hardware, + // but CDNA3 supports only e4m3_fnuz (no inf). + const __hip_fp8_e4m3_fnuz xf = *reinterpret_cast(&bits); +#else const __nv_fp8_e4m3 xf = *reinterpret_cast(&bits); +#endif // defined(GGML_USE_HIP) && defined(CDNA3) return static_cast(xf) / 2; #else NO_DEVICE_CODE;