diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt
index c562960619..5ae2f14f7e 100644
--- a/ggml/src/ggml-cuda/CMakeLists.txt
+++ b/ggml/src/ggml-cuda/CMakeLists.txt
@@ -35,10 +35,6 @@ if (CUDAToolkit_FOUND)
         if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8")
             list(APPEND CMAKE_CUDA_ARCHITECTURES 89-real)
         endif()
-
-        if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
-            list(APPEND CMAKE_CUDA_ARCHITECTURES 100-real)
-        endif()
     endif()
 endif()
 message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index b8b1d9aefd..9b82247e07 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -806,7 +806,6 @@ static __device__ __forceinline__ void load_tiles_mxfp4_fp4(const char * __restr
 
     const block_mxfp4 * bxi = (const block_mxfp4 *) x + kbx0 + i * stride + kbx;
 
-    // Load 16 bytes more efficiently using memcpy (compiler optimizes to vector loads)
     int aux_q4[4];
     memcpy(aux_q4, bxi->qs, 16);
 