memo: CUDA | CUB

A configurable C++ template library of high-performance CUDA primitives

Introducing CUDA UnBound (CUB) - Microway

InclusiveSum

Docs

(2024-01-27)

d_in:  [8 , 6  , 7  , 5  , 3  , 0  , 9  ]
d_out: [8 , 14 , 21 , 26 , 29 , 29 , 38 ]
  1. Example from 3DGS

    cub::DeviceScan::InclusiveSum(
            nullptr,             // d_temp_storage == nullptr: sizing query only
            geom.scan_size,      // (out) bytes of temporary storage required
            geom.tiles_touched,  // input sequence
            geom.tiles_touched,  // output: prefix sums
            P)                   // number of items in the sequence, 100K points
    
    • Given P items pointed to by geom.tiles_touched, this first call (with a null d_temp_storage) does no work; it only writes into geom.scan_size the number of bytes of temporary storage that must be reserved before the scan can actually be executed. A runnable sketch of the full two-call pattern follows below.
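
For reference, a minimal standalone sketch of the full two-call pattern (not the 3DGS code; plain int arrays, using the d_in/d_out values from the example above):

#include <cub/cub.cuh>
#include <cstdio>

int main() {
    const int N = 7;
    int h_in[N]  = {8, 6, 7, 5, 3, 0, 9};
    int h_out[N] = {0};

    int *d_in, *d_out;
    cudaMalloc(&d_in,  sizeof(int) * N);
    cudaMalloc(&d_out, sizeof(int) * N);
    cudaMemcpy(d_in, h_in, sizeof(int) * N, cudaMemcpyHostToDevice);

    // 1st call: d_temp_storage == nullptr, only fills temp_storage_bytes
    void   *d_temp_storage = nullptr;
    size_t  temp_storage_bytes = 0;
    cub::DeviceScan::InclusiveSum(
        d_temp_storage, temp_storage_bytes, d_in, d_out, N);

    // Allocate the temporary storage, then run the actual scan
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceScan::InclusiveSum(
        d_temp_storage, temp_storage_bytes, d_in, d_out, N);

    cudaMemcpy(h_out, d_out, sizeof(int) * N, cudaMemcpyDeviceToHost);
    for (int i = 0; i < N; ++i)
        printf("%d ", h_out[i]); // 8 14 21 26 29 29 38
    printf("\n");

    cudaFree(d_in);
    cudaFree(d_out);
    cudaFree(d_temp_storage);
    return 0;
}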

SortPairs

Docs

(2024-02-03)

Sort key-value pairs according to keys

Example in diff-gaussian-rasterization (the 3DGS rasterizer)

cub::DeviceRadixSort::SortPairs

(2024-03-05)

Example in simple-KNN of 3DGS

Clustering points (indices) based on Morton codes stored in morton:

// Determine the number of bytes for temp_storage
cub::DeviceRadixSort::SortPairs(
    nullptr, 
    temp_storage_bytes, 
    morton.data().get(),  // in: keys
    morton_sorted.data().get(),  // out: sorted keys
    indices.data().get(), // in: values
    indices_sorted.data().get(), // out: sorted values
    P);
// Allocate memory for temp_storage
temp_storage.resize(temp_storage_bytes);

// Sort the pairs based on keys
cub::DeviceRadixSort::SortPairs(
    temp_storage.data().get(), 
    temp_storage_bytes, 
    morton.data().get(), 
    morton_sorted.data().get(), 
    indices.data().get(), 
    indices_sorted.data().get(), 
    P);
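
For reference, a minimal standalone sketch of the same pattern with raw device arrays instead of thrust device vectors (made-up keys and values, not the simple-KNN code):

#include <cub/cub.cuh>
#include <cstdio>

int main() {
    const int N = 5;
    unsigned int h_keys[N] = {42, 7, 19, 3, 25}; // e.g., Morton codes
    int          h_vals[N] = {0, 1, 2, 3, 4};    // point indices

    unsigned int *d_keys_in, *d_keys_out;
    int          *d_vals_in, *d_vals_out;
    cudaMalloc(&d_keys_in,  sizeof(unsigned int) * N);
    cudaMalloc(&d_keys_out, sizeof(unsigned int) * N);
    cudaMalloc(&d_vals_in,  sizeof(int) * N);
    cudaMalloc(&d_vals_out, sizeof(int) * N);
    cudaMemcpy(d_keys_in, h_keys, sizeof(unsigned int) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_vals_in, h_vals, sizeof(int) * N, cudaMemcpyHostToDevice);

    // 1st call: d_temp_storage == nullptr, only fills temp_storage_bytes
    void   *d_temp_storage = nullptr;
    size_t  temp_storage_bytes = 0;
    cub::DeviceRadixSort::SortPairs(
        d_temp_storage, temp_storage_bytes,
        d_keys_in, d_keys_out, d_vals_in, d_vals_out, N);

    // 2nd call: allocate the temporary storage and sort
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceRadixSort::SortPairs(
        d_temp_storage, temp_storage_bytes,
        d_keys_in, d_keys_out, d_vals_in, d_vals_out, N);

    int h_sorted_vals[N];
    cudaMemcpy(h_sorted_vals, d_vals_out, sizeof(int) * N, cudaMemcpyDeviceToHost);
    for (int i = 0; i < N; ++i)
        printf("%d ", h_sorted_vals[i]); // 3 1 2 4 0 (indices ordered by key)
    printf("\n");

    cudaFree(d_keys_in); cudaFree(d_keys_out);
    cudaFree(d_vals_in); cudaFree(d_vals_out);
    cudaFree(d_temp_storage);
    return 0;
}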

Reduce

min

(2024-02-28)

To execute this function, temporary storage of temp_storage_bytes bytes must be reserved first. Therefore, the function needs to be called twice:

  1. Get the number of required temporary bytes by passing nullptr as d_temp_storage;

  2. Call the function again with the allocated d_temp_storage.

A minimal example: “cub_reduce.cu”

Full code:
#include <cub/cub.cuh>
#include <climits>  // INT_MAX
#include <cstdio>   // printf
// CustomMin functor
struct CustomMin
{
    template <typename T>
    __device__ __forceinline__
    T operator()(const T &a, const T &b) const {
        return (b < a) ? b : a;
    }
};

// Host-side input and output data
// (device buffers are allocated and filled in main)
int          N = 7; // num of items
int          d_in[7] = {8, 6, 7, 5, 3, 1, 9};
int          d_out[1] = {0};    // the min value
CustomMin    min_op;
int          init = INT_MAX;    // To be compared initially

int main(){
    int *in, *out;
    cudaMalloc(&in, sizeof(int)*N);
    cudaMalloc(&out, sizeof(int));
    cudaMemcpy(in, d_in, sizeof(int)*N, cudaMemcpyHostToDevice);

    // Determine temporary device storage requirements
    void     *d_temp_storage = NULL;
    size_t   temp_storage_bytes = 0;

    cub::DeviceReduce::Reduce(
      d_temp_storage, temp_storage_bytes,
      in, out, N, min_op, init);
    cudaDeviceSynchronize();

    // Allocate temporary storage
    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    // Run reduction
    cub::DeviceReduce::Reduce(
      d_temp_storage, temp_storage_bytes,
      in, out, N, min_op, init);
    cudaDeviceSynchronize();

    int min;
    cudaMemcpy(&min, out, sizeof(int), cudaMemcpyDeviceToHost);
    printf("The min is: %d\n", min); // 1

    // Free allocated memory
    cudaFree(in);
    cudaFree(out);
    cudaFree(d_temp_storage);
    return 0;
}

Notes:

  1. If the file has a .cpp suffix and is compiled with nvcc cub_reduce.cpp, there will be errors about Debug (rename the file to .cu, or pass -x cu, so nvcc treats it as CUDA source):

    /usr/local/cuda-11.6/bin/../targets/x86_64-linux/include/cub/block/specializations/../../block/../util_debug.cuh:101:43: error: ‘Debug’ is not a member of ‘cub’
      101 |     #define CubDebug(e) CUB_NS_QUALIFIER::Debug((cudaError_t) (e), __FILE__, __LINE__)
          |                                           ^~~~~
    /usr/local/cuda-11.6/bin/../targets/x86_64-linux/include/cub/util_allocator.cuh:695:17: note: in expansion of macro ‘CubDebug’
      695 |             if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
          |                 ^~~~~~~~
    
  2. A functor whose operator() is only __host__ __forceinline__ cannot be called from device code; it also needs __device__:

    /usr/local/cuda-11.6/bin/../targets/x86_64-linux/include/cub/device/dispatch/dispatch_reduce.cuh(145): 
    error: calling a __host__ function("T1 CustomMin::operator ()<int> (const T1 &, const T1 &) const") from a __global__ function(
    "cub::DeviceReduceSingleTileKernel< ::cub::DeviceReducePolicy<int, int, int,     ::CustomMin> ::Policy600, int *, int *, int,     ::CustomMin, int> ") is not allowed
    
    /usr/local/cuda-11.6/bin/../targets/x86_64-linux/include/cub/device/dispatch/dispatch_reduce.cuh(145): 
    error: identifier "CustomMin::operator ()<int>  const" is undefined in device code
    
  3. The custom min_op (CustomMin()) can be replaced with cub::Sum() to compute the sum of the input data, as sketched below.
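
    A sketch of the changed call in the code above (apply it to both the sizing call and the actual run; init becomes 0 so it does not bias the sum):

    cub::DeviceReduce::Reduce(
        d_temp_storage, temp_storage_bytes,
        in, out, N, cub::Sum(), 0);  // built-in sum functor, init = 0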

Ref:


sum

(2024-03-04)

  • Move the input data onto the GPU first! Otherwise it won’t be processed, and the output variable will keep its initialized value, e.g., 0.

  • Copy the output back to the host before printing! Host code is not allowed to dereference GPU memory directly and will segfault.

Full code:
#include <cub/cub.cuh>
#include <cstdio>   // printf

int  N=7;   // num of items
int  d_in[7] = {8, 6, 7, 5, 3, 0, 9};
int  d_out[1]={0};

int main(){
    int *in, *out;
    cudaMalloc(&in, sizeof(int)*N); // allocate memory
    cudaMalloc(&out, sizeof(int));

    cudaMemcpy(in, d_in, sizeof(int)*N, cudaMemcpyHostToDevice);

    // Determine temporary device storage requirements
    void     *d_temp_storage = NULL;
    size_t   temp_storage_bytes = 0;
    cub::DeviceReduce::Sum(
      d_temp_storage, temp_storage_bytes, in, out, N);

    // Allocate temporary storage
    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    // Run sum-reduction
    cub::DeviceReduce::Sum(
       d_temp_storage, temp_storage_bytes, in, out, N);

    int sum = 0;
    cudaMemcpy(&sum, out, sizeof(int), cudaMemcpyDeviceToHost);
    printf("The sum is: %d\n", sum); // 38

    // Free allocated memory
    cudaFree(in);
    cudaFree(out);
    cudaFree(d_temp_storage);
    return 0;
}

Ref: