Skip to content

Instantly share code, notes, and snippets.

@luoyetx
Created October 24, 2016 15:44
Show Gist options
  • Save luoyetx/36377dfb759c1fbab00e09a874d6f84a to your computer and use it in GitHub Desktop.
Save luoyetx/36377dfb759c1fbab00e09a874d6f84a to your computer and use it in GitHub Desktop.
simple Reduce && Scan in CUDA
/**
 * Per-block sum reduction.
 *
 * Each block loads up to blockDim.x consecutive floats from d_in into shared
 * memory, sums them, and thread 0 writes the block's partial sum to
 * d_out[blockIdx.x]. Elements at index >= n contribute 0.
 *
 * Launch requirements:
 *   - 1D grid / 1D block.
 *   - Dynamic shared memory of at least blockDim.x * sizeof(float) bytes
 *     (third launch parameter).
 *   - d_out must hold at least gridDim.x floats.
 *
 * @param d_in  input array of n floats
 * @param n     number of valid elements in d_in
 * @param d_out output array; receives one partial sum per block
 */
__global__ void reduce_kernel(float* d_in, int n, float* d_out) {
    extern __shared__ float shared_mem[];

    const unsigned int tid = threadIdx.x;
    const int i = threadIdx.x + blockDim.x * blockIdx.x;

    // Stage one element per thread; pad the tail of the last block with zeros
    // so the reduction below needs no further bounds checks.
    shared_mem[tid] = (i < n) ? d_in[i] : 0.0f;
    __syncthreads();

    // Fold the upper part of the active range onto the lower part until one
    // value remains. Rounding the half up keeps the middle element when the
    // active count is odd, so any block size works; for power-of-two block
    // sizes this performs the same additions as a plain halving loop.
    for (unsigned int active = blockDim.x; active > 1;) {
        const unsigned int half = (active + 1) / 2;
        if (tid + half < active) {
            shared_mem[tid] += shared_mem[tid + half];
        }
        // Barrier is outside the branch so every thread in the block reaches it.
        __syncthreads();
        active = half;
    }

    if (tid == 0) {
        d_out[blockIdx.x] = shared_mem[0];
    }
}
/**
 * In-place inclusive prefix sum over d_in[0..n) using a doubling stride
 * (Hillis-Steele style): on each pass every element adds the value that sits
 * `s` positions to its left, with s = 1, 2, 4, ...
 *
 * Launch requirements:
 *   - 1D grid / 1D block.
 *   - The passes are separated only by __syncthreads(), which synchronizes a
 *     single block. The result is therefore only well-defined when all n
 *     elements are covered by one block (gridDim.x == 1, blockDim.x >= n).
 *
 * @param d_in array of n ints, overwritten with its inclusive prefix sums
 * @param n    number of elements in d_in
 */
__global__ void scan_kernel(int* d_in, int n) {
    // Uniform across the whole grid, so returning here cannot split a barrier.
    if (n <= 0) return;

    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    // Out-of-range threads must NOT return early: they still have to take part
    // in every __syncthreads() below, otherwise the barrier is divergent.
    const bool in_range = tid < n;
    const unsigned int count = static_cast<unsigned int>(n);

    // s == n would leave no thread with a non-negative source index, so the
    // loop stops at s < n. The unsigned stride cannot overflow before reaching n.
    for (unsigned int s = 1; s < count; s *= 2) {
        const int pos = tid - static_cast<int>(s);
        const bool active = in_range && pos >= 0;

        // Read phase: fetch the left neighbour before anyone overwrites it.
        int val = 0;
        if (active) {
            val = d_in[pos];
        }
        __syncthreads();

        // Write phase: all reads of this pass are complete.
        if (active) {
            d_in[tid] += val;
        }
        __syncthreads();
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment