CUDA: Limit DeviceSegmentedSort to immediate mode (#21718)
* CUDA: Limit DeviceSegmentedSort to immediate mode DeviceSegmentedSort is currently not capturable in a cuda graph. Hence, we have to go for the slower DeviceSegmentedRadixSort in that case. Perf numbers on RTX Pro 6000 Blackwell Max-Q: DeviceSegmentedRadixSort in graph mode (i.e. CUDA Graphs) ARGSORT(type=f32,ne=[2048,512,1,1],order=1): 12291 runs - 105.94 us/run - 8192 kB/run - 73.75 GB/s ARGSORT(type=f32,ne=[4096,512,1,1],order=1): 10245 runs - 115.08 us/run - 16384 kB/run - 135.77 GB/s ARGSORT(type=f32,ne=[8192,512,1,1],order=1): 5125 runs - 221.22 us/run - 32768 kB/run - 141.26 GB/s ARGSORT(type=f32,ne=[16384,512,1,1],order=1): 2565 runs - 430.98 us/run - 65536 kB/run - 145.02 GB/s ARGSORT(type=f32,ne=[32768,512,1,1],order=1): 1028 runs - 1185.83 us/run - 131072 kB/run - 105.41 GB/s ARGSORT(type=f32,ne=[65536,512,1,1],order=1): 387 runs - 2748.62 us/run - 262144 kB/run - 90.95 GB/s DeviceSegmentedSort in immediate mode ARGSORT(type=f32,ne=[2048,512,1,1],order=1): 16388 runs - 71.17 us/run - 8192 kB/run - 109.78 GB/s ARGSORT(type=f32,ne=[4096,512,1,1],order=1): 12294 runs - 81.38 us/run - 16384 kB/run - 192.00 GB/s ARGSORT(type=f32,ne=[8192,512,1,1],order=1): 5125 runs - 240.81 us/run - 32768 kB/run - 129.77 GB/s ARGSORT(type=f32,ne=[16384,512,1,1],order=1): 2565 runs - 406.60 us/run - 65536 kB/run - 153.71 GB/s ARGSORT(type=f32,ne=[32768,512,1,1],order=1): 1285 runs - 873.23 us/run - 131072 kB/run - 143.15 GB/s ARGSORT(type=f32,ne=[65536,512,1,1],order=1): 516 runs - 2288.46 us/run - 262144 kB/run - 109.24 GB/s * Add test case for dispatch to DeviceSegmentedRadixSort We currently lack a way to force graph mode in CUDA, patch callback to invoke ggml_backend_compare_graph_backend twice to enforce each test to run in graph mode
This commit is contained in:
parent
920b3e78cb
commit
9f5e1edb10
|
|
@ -58,26 +58,48 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
|
|||
|
||||
size_t temp_storage_bytes = 0;
|
||||
|
||||
bool is_capturing = false;
|
||||
#ifdef USE_CUDA_GRAPH
|
||||
// Currently (confirmed for CCCL <= 3.2) DeviceSegmentedSort does not support stream capture, while DeviceSegmentedRadixSort does.
|
||||
// See https://github.com/NVIDIA/cccl/issues/5661#issuecomment-3229037149
|
||||
// TODO: constrain this to the CCCL versions that have this issue once it's resolved in a future CCCL release.
|
||||
cudaStreamCaptureStatus capture_status;
|
||||
CUDA_CHECK(cudaStreamIsCapturing(stream, &capture_status));
|
||||
is_capturing = (capture_status != cudaStreamCaptureStatusNone);
|
||||
#endif // USE_CUDA_GRAPH
|
||||
|
||||
if (order == GGML_SORT_ORDER_ASC) {
|
||||
if (nrows == 1) {
|
||||
CUDA_CHECK(DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
|
||||
temp_indices, dst, // values (indices)
|
||||
ncols, 0, sizeof(float) * 8, stream));
|
||||
temp_indices, dst, // values (indices)
|
||||
ncols, 0, sizeof(float) * 8, stream));
|
||||
} else if (is_capturing) {
|
||||
CUDA_CHECK(DeviceSegmentedRadixSort::SortPairs(
|
||||
nullptr, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
|
||||
temp_indices, dst, // values (indices)
|
||||
ncols * nrows, nrows, // num items, num segments
|
||||
offset_iterator, offset_iterator + 1, 0, sizeof(float) * 8, stream));
|
||||
} else {
|
||||
CUDA_CHECK(DeviceSegmentedSort::SortPairs(nullptr, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
|
||||
temp_indices, dst, // values (indices)
|
||||
ncols * nrows, nrows, // num items, num segments
|
||||
offset_iterator, offset_iterator + 1, stream));
|
||||
CUDA_CHECK(DeviceSegmentedSort::SortPairs(nullptr, temp_storage_bytes, temp_keys,
|
||||
temp_keys, // keys (in-place)
|
||||
temp_indices, dst, // values (indices)
|
||||
ncols * nrows, nrows, // num items, num segments
|
||||
offset_iterator, offset_iterator + 1, stream));
|
||||
}
|
||||
} else {
|
||||
if (nrows == 1) {
|
||||
CUDA_CHECK(DeviceRadixSort::SortPairsDescending(nullptr, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
|
||||
temp_indices, dst, // values (indices)
|
||||
ncols, 0, sizeof(float) * 8, stream));
|
||||
CUDA_CHECK(DeviceRadixSort::SortPairsDescending(nullptr, temp_storage_bytes, temp_keys,
|
||||
temp_keys, // keys (in-place)
|
||||
temp_indices, dst, // values (indices)
|
||||
ncols, 0, sizeof(float) * 8, stream));
|
||||
} else if (is_capturing) {
|
||||
CUDA_CHECK(DeviceSegmentedRadixSort::SortPairsDescending(
|
||||
nullptr, temp_storage_bytes, temp_keys, temp_keys, temp_indices, dst, ncols * nrows, nrows,
|
||||
offset_iterator, offset_iterator + 1, 0, sizeof(float) * 8, stream));
|
||||
} else {
|
||||
CUDA_CHECK(DeviceSegmentedSort::SortPairsDescending(nullptr, temp_storage_bytes, temp_keys, temp_keys, temp_indices,
|
||||
dst, ncols * nrows, nrows, offset_iterator, offset_iterator + 1,
|
||||
stream));
|
||||
CUDA_CHECK(DeviceSegmentedSort::SortPairsDescending(nullptr, temp_storage_bytes, temp_keys, temp_keys,
|
||||
temp_indices, dst, ncols * nrows, nrows,
|
||||
offset_iterator, offset_iterator + 1, stream));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -86,22 +108,33 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
|
|||
|
||||
if (order == GGML_SORT_ORDER_ASC) {
|
||||
if (nrows == 1) {
|
||||
CUDA_CHECK(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
|
||||
temp_indices, dst, // values (indices)
|
||||
ncols, 0, sizeof(float) * 8, stream));
|
||||
CUDA_CHECK(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, temp_keys,
|
||||
temp_keys, // keys (in-place)
|
||||
temp_indices, dst, // values (indices)
|
||||
ncols, 0, sizeof(float) * 8, stream));
|
||||
} else if (is_capturing) {
|
||||
CUDA_CHECK(DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys,
|
||||
temp_indices, dst, ncols * nrows, nrows, offset_iterator,
|
||||
offset_iterator + 1, 0, sizeof(float) * 8, stream));
|
||||
} else {
|
||||
CUDA_CHECK(DeviceSegmentedSort::SortPairs(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, temp_indices, dst,
|
||||
ncols * nrows, nrows, offset_iterator, offset_iterator + 1, stream));
|
||||
CUDA_CHECK(DeviceSegmentedSort::SortPairs(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys,
|
||||
temp_indices, dst, ncols * nrows, nrows, offset_iterator,
|
||||
offset_iterator + 1, stream));
|
||||
}
|
||||
} else {
|
||||
if (nrows == 1) {
|
||||
CUDA_CHECK(DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
|
||||
temp_indices, dst, // values (indices)
|
||||
ncols, 0, sizeof(float) * 8, stream));
|
||||
CUDA_CHECK(DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, temp_keys,
|
||||
temp_keys, // keys (in-place)
|
||||
temp_indices, dst, // values (indices)
|
||||
ncols, 0, sizeof(float) * 8, stream));
|
||||
} else if (is_capturing) {
|
||||
CUDA_CHECK(DeviceSegmentedRadixSort::SortPairsDescending(
|
||||
d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, temp_indices, dst, ncols * nrows, nrows,
|
||||
offset_iterator, offset_iterator + 1, 0, sizeof(float) * 8, stream));
|
||||
} else {
|
||||
CUDA_CHECK(DeviceSegmentedSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys,
|
||||
temp_indices, dst, ncols * nrows, nrows, offset_iterator,
|
||||
offset_iterator + 1, stream));
|
||||
CUDA_CHECK(DeviceSegmentedSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, temp_keys,
|
||||
temp_keys, temp_indices, dst, ncols * nrows, nrows,
|
||||
offset_iterator, offset_iterator + 1, stream));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -8397,6 +8397,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
|||
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {2048, 2, 1, 3}, order));
|
||||
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {2049, 2, 1, 3}, order));
|
||||
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {2, 8, 8192, 1}, order)); // bailingmoe2 (group selection)
|
||||
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {2048, 512, 1, 1}, order)); // test CUDA dispatching to radix sort for nrows > = 1 in graph mode
|
||||
}
|
||||
|
||||
for (int n = 1; n < 5; ++n) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue