Skip to content

Instantly share code, notes, and snippets.

@chengscott
Created February 16, 2024 00:23

Revisions

  1. chengscott created this gist Feb 16, 2024.
    73 changes: 73 additions & 0 deletions scan.cu
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,73 @@
    #include <iostream>

    __device__ int warpInclusiveScan(int val) {
    int laneId = threadIdx.x % warpSize;
    for (int offset = 1; offset < 32; offset <<= 1) {
    int v = __shfl_up_sync(0xffffffff, val, offset);
    if (laneId >= offset) val += v;
    }
    return val;
    }

    __global__ void inclusivePrefixSumKernel(int *input, int *output, int n) {
    __shared__ int warpSums[32];
    int tx = threadIdx.x;
    int warpId = threadIdx.x / warpSize;
    int laneId = threadIdx.x % warpSize;

    // intra-warp
    int v = warpInclusiveScan(tx < n ? input[tx] : 0);
    if (laneId == 31) {
    warpSums[warpId] = v;
    }
    __syncthreads();

    // inter-warp
    if (warpId == 0) {
    int is_active = laneId < (blockDim.x / warpSize);
    int warpSum = warpInclusiveScan(is_active ? warpSums[laneId] : 0);
    if (is_active)
    warpSums[laneId] = warpSum;
    }
    __syncthreads();

    // done
    if (warpId > 0) {
    v += warpSums[warpId - 1];
    }
    if (tx < n) {
    output[tx] = v;
    }
    }

    int main() {
    const int n = 1024;
    int h_input[n], h_output[n];

    // Initialize input array
    for (int i = 0; i < n; ++i) {
    h_input[i] = 1;
    }

    int *d_input, *d_output;
    cudaMalloc(&d_input, n * sizeof(int));
    cudaMalloc(&d_output, n * sizeof(int));

    cudaMemcpy(d_input, h_input, n * sizeof(int), cudaMemcpyHostToDevice);

    // Execute kernel
    inclusivePrefixSumKernel<<<1, n>>>(d_input, d_output, n);

    cudaMemcpy(h_output, d_output, n * sizeof(int), cudaMemcpyDeviceToHost);

    // Print the result
    for (int i = 0; i < n; ++i) {
    std::cout << h_output[i] << " ";
    }
    std::cout << std::endl;

    cudaFree(d_input);
    cudaFree(d_output);

    return 0;
    }