memo: CUDA | CUB

A configurable C++ template library of high-performance CUDA primitives

Introducing CUDA UnBound (CUB) - Microway

InclusiveSum

Docs

(2024-01-27)

d_in:  [8 , 6  , 7  , 5  , 3  , 0  , 9  ]
d_out: [8 , 14 , 21 , 26 , 29 , 29 , 38 ]
  1. Example from 3DGS

    cub::DeviceScan::InclusiveSum(
            nullptr,             // d_temp_storage == nullptr: sizing query only
            geom.scan_size,      // (out) bytes of temporary storage required
            geom.tiles_touched,  // input sequence
            geom.tiles_touched,  // output: prefix sums
            P)                   // number of items in the sequence, 100K points
    
    • Given P items pointed to by geom.tiles_touched, this first call (with a null d_temp_storage) does no work; it only writes into geom.scan_size the number of bytes of temporary storage that must be reserved before the scan can actually be executed. A runnable sketch of the full two-call pattern follows below.
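
For reference, a minimal standalone sketch of the full two-call pattern (not the 3DGS code; plain int arrays, using the d_in/d_out values from the example above):

#include <cub/cub.cuh>
#include <cstdio>

int main() {
    const int N = 7;
    int h_in[N]  = {8, 6, 7, 5, 3, 0, 9};
    int h_out[N] = {0};

    int *d_in, *d_out;
    cudaMalloc(&d_in,  sizeof(int) * N);
    cudaMalloc(&d_out, sizeof(int) * N);
    cudaMemcpy(d_in, h_in, sizeof(int) * N, cudaMemcpyHostToDevice);

    // 1st call: d_temp_storage == nullptr, only fills temp_storage_bytes
    void   *d_temp_storage = nullptr;
    size_t  temp_storage_bytes = 0;
    cub::DeviceScan::InclusiveSum(
        d_temp_storage, temp_storage_bytes, d_in, d_out, N);

    // Allocate the temporary storage, then run the actual scan
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceScan::InclusiveSum(
        d_temp_storage, temp_storage_bytes, d_in, d_out, N);

    cudaMemcpy(h_out, d_out, sizeof(int) * N, cudaMemcpyDeviceToHost);
    for (int i = 0; i < N; ++i)
        printf("%d ", h_out[i]); // 8 14 21 26 29 29 38
    printf("\n");

    cudaFree(d_in);
    cudaFree(d_out);
    cudaFree(d_temp_storage);
    return 0;
}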

SortPairs

Docs

(2024-02-03)

Sort key-value pairs according to keys

Example in diff-gaussian-rasterization (the 3DGS rasterizer)

cub::DeviceRadixSort::SortPairs

(2024-03-05)

Example in simple-KNN of 3DGS

Clustering points (indices) based on Morton codes stored in morton:

// Determine the number of bytes for temp_storage
cub::DeviceRadixSort::SortPairs(
    nullptr, 
    temp_storage_bytes, 
    morton.data().get(),  // in: keys
    morton_sorted.data().get(),  // out: sorted keys
    indices.data().get(), // in: values
    indices_sorted.data().get(), // out: sorted values
    P);
// Allocate memory for temp_storage
temp_storage.resize(temp_storage_bytes);

// Sort the pairs based on keys
cub::DeviceRadixSort::SortPairs(
    temp_storage.data().get(), 
    temp_storage_bytes, 
    morton.data().get(), 
    morton_sorted.data().get(), 
    indices.data().get(), 
    indices_sorted.data().get(), 
    P);
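
For reference, a minimal standalone sketch of the same pattern with raw device arrays instead of thrust device vectors (made-up keys and values, not the simple-KNN code):

#include <cub/cub.cuh>
#include <cstdio>

int main() {
    const int N = 5;
    unsigned int h_keys[N] = {42, 7, 19, 3, 25}; // e.g., Morton codes
    int          h_vals[N] = {0, 1, 2, 3, 4};    // point indices

    unsigned int *d_keys_in, *d_keys_out;
    int          *d_vals_in, *d_vals_out;
    cudaMalloc(&d_keys_in,  sizeof(unsigned int) * N);
    cudaMalloc(&d_keys_out, sizeof(unsigned int) * N);
    cudaMalloc(&d_vals_in,  sizeof(int) * N);
    cudaMalloc(&d_vals_out, sizeof(int) * N);
    cudaMemcpy(d_keys_in, h_keys, sizeof(unsigned int) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_vals_in, h_vals, sizeof(int) * N, cudaMemcpyHostToDevice);

    // 1st call: d_temp_storage == nullptr, only fills temp_storage_bytes
    void   *d_temp_storage = nullptr;
    size_t  temp_storage_bytes = 0;
    cub::DeviceRadixSort::SortPairs(
        d_temp_storage, temp_storage_bytes,
        d_keys_in, d_keys_out, d_vals_in, d_vals_out, N);

    // 2nd call: allocate the temporary storage and sort
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceRadixSort::SortPairs(
        d_temp_storage, temp_storage_bytes,
        d_keys_in, d_keys_out, d_vals_in, d_vals_out, N);

    int h_sorted_vals[N];
    cudaMemcpy(h_sorted_vals, d_vals_out, sizeof(int) * N, cudaMemcpyDeviceToHost);
    for (int i = 0; i < N; ++i)
        printf("%d ", h_sorted_vals[i]); // 3 1 2 4 0 (indices ordered by key)
    printf("\n");

    cudaFree(d_keys_in); cudaFree(d_keys_out);
    cudaFree(d_vals_in); cudaFree(d_vals_out);
    cudaFree(d_temp_storage);
    return 0;
}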

Reduce

min

(2024-02-28)

To execute this function, temporary storage of temp_storage_bytes bytes must be reserved first. Therefore, the function needs to be called twice:

  1. Get the number of required temporary bytes by passing nullptr as d_temp_storage;

  2. Call the function again with the allocated d_temp_storage.

A minimal example: “cub_reduce.cu”

Full code:
#include <cub/cub.cuh>
#include <climits>  // INT_MAX
#include <cstdio>   // printf
// CustomMin functor
struct CustomMin
{
    template <typename T>
    __device__ __forceinline__
    T operator()(const T &a, const T &b) const {
        return (b < a) ? b : a;
    }
};

// Host-side input and output data
// (device buffers are allocated and filled in main)
int          N = 7; // num of items
int          d_in[7] = {8, 6, 7, 5, 3, 1, 9};
int          d_out[1] = {0};    // the min value
CustomMin    min_op;
int          init = INT_MAX;    // To be compared initially

int main(){
    int *in, *out;
    cudaMalloc(&in, sizeof(int)*N);
    cudaMalloc(&out, sizeof(int));
    cudaMemcpy(in, d_in, sizeof(int)*N, cudaMemcpyHostToDevice);

    // Determine temporary device storage requirements
    void     *d_temp_storage = NULL;
    size_t   temp_storage_bytes = 0;

    cub::DeviceReduce::Reduce(
      d_temp_storage, temp_storage_bytes,
      in, out, N, min_op, init);
    cudaDeviceSynchronize();

    // Allocate temporary storage
    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    // Run reduction
    cub::DeviceReduce::Reduce(
      d_temp_storage, temp_storage_bytes,
      in, out, N, min_op, init);
    cudaDeviceSynchronize();

    int min;
    cudaMemcpy(&min, out, sizeof(int), cudaMemcpyDeviceToHost);
    printf("The min is: %d\n", min); // 1

    // Free allocated memory
    cudaFree(in);
    cudaFree(out);
    cudaFree(d_temp_storage);
    return 0;
}

Notes:

  1. If the file has a .cpp suffix and is compiled with nvcc cub_reduce.cpp, there will be errors about Debug (rename the file to .cu, or pass -x cu, so nvcc treats it as CUDA source):

    /usr/local/cuda-11.6/bin/../targets/x86_64-linux/include/cub/block/specializations/../../block/../util_debug.cuh:101:43: error: ‘Debug’ is not a member of ‘cub’
      101 |     #define CubDebug(e) CUB_NS_QUALIFIER::Debug((cudaError_t) (e), __FILE__, __LINE__)
          |                                           ^~~~~
    /usr/local/cuda-11.6/bin/../targets/x86_64-linux/include/cub/util_allocator.cuh:695:17: note: in expansion of macro ‘CubDebug’
      695 |             if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
          |                 ^~~~~~~~
    
  2. A functor whose operator() is only __host__ __forceinline__ cannot be called from device code; it also needs __device__:

    /usr/local/cuda-11.6/bin/../targets/x86_64-linux/include/cub/device/dispatch/dispatch_reduce.cuh(145): 
    error: calling a __host__ function("T1 CustomMin::operator ()<int> (const T1 &, const T1 &) const") from a __global__ function(
    "cub::DeviceReduceSingleTileKernel< ::cub::DeviceReducePolicy<int, int, int,     ::CustomMin> ::Policy600, int *, int *, int,     ::CustomMin, int> ") is not allowed
    
    /usr/local/cuda-11.6/bin/../targets/x86_64-linux/include/cub/device/dispatch/dispatch_reduce.cuh(145): 
    error: identifier "CustomMin::operator ()<int>  const" is undefined in device code
    
  3. The custom min_op (CustomMin()) can be replaced with cub::Sum() to compute the sum of the input data, as sketched below.
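
    A sketch of the changed call in the code above (apply it to both the sizing call and the actual run; init becomes 0 so it does not bias the sum):

    cub::DeviceReduce::Reduce(
        d_temp_storage, temp_storage_bytes,
        in, out, N, cub::Sum(), 0);  // built-in sum functor, init = 0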

Ref:


sum

(2024-03-04)

  • Move the input data onto the GPU first! Otherwise it won’t be processed, and the output variable will keep its initialized value, e.g., 0.

  • Copy the output back to the host before printing! Host code is not allowed to dereference GPU memory directly and will segfault.

Full code:
#include <cub/cub.cuh>
#include <cstdio>   // printf

int  N=7;   // num of items
int  d_in[7] = {8, 6, 7, 5, 3, 0, 9};
int  d_out[1]={0};

int main(){
    int *in, *out;
    cudaMalloc(&in, sizeof(int)*N); // allocate memory
    cudaMalloc(&out, sizeof(int));

    cudaMemcpy(in, d_in, sizeof(int)*N, cudaMemcpyHostToDevice);

    // Determine temporary device storage requirements
    void     *d_temp_storage = NULL;
    size_t   temp_storage_bytes = 0;
    cub::DeviceReduce::Sum(
      d_temp_storage, temp_storage_bytes, in, out, N);

    // Allocate temporary storage
    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    // Run sum-reduction
    cub::DeviceReduce::Sum(
       d_temp_storage, temp_storage_bytes, in, out, N);

    int sum = 0;
    cudaMemcpy(&sum, out, sizeof(int), cudaMemcpyDeviceToHost);
    printf("The sum is: %d\n", sum); // 38

    // Free allocated memory
    cudaFree(in);
    cudaFree(out);
    cudaFree(d_temp_storage);
    return 0;
}

Ref: