Created
October 24, 2016 15:44
-
-
Save luoyetx/36377dfb759c1fbab00e09a874d6f84a to your computer and use it in GitHub Desktop.
Simple reduce and scan kernels in CUDA
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Block-wise sum reduction.
 *
 * Each block loads up to blockDim.x consecutive elements of d_in into shared
 * memory (out-of-range slots are padded with 0), sums them with a tree
 * reduction and writes the block's partial sum to d_out[blockIdx.x].
 *
 * Launch requirements:
 *   - 1D grid / 1D block; any blockDim.x >= 1 (a power of two is NOT required).
 *   - dynamic shared memory: blockDim.x * sizeof(float) bytes.
 *   - d_out must have room for gridDim.x floats.
 *
 * @param d_in   input array of n floats
 * @param n      number of valid elements in d_in
 * @param d_out  output array, one partial sum per block
 */
__global__ void reduce_kernel(float* d_in, int n, float* d_out) {
    extern __shared__ float shared_mem[];

    const unsigned int tid = threadIdx.x;
    // 64-bit global index so blockIdx.x * blockDim.x cannot wrap for large grids.
    const long long i = static_cast<long long>(blockIdx.x) * blockDim.x + tid;

    // Pad the tail of the last block with the additive identity.
    shared_mem[tid] = (i < n) ? d_in[i] : 0.0f;
    __syncthreads();

    // Tree reduction over the first `active` slots. Halving with a ceiling
    // keeps every element when blockDim.x is not a power of two; for
    // power-of-two block sizes the addition order is the classic s = blockDim.x/2,
    // blockDim.x/4, ... sequence.
    for (unsigned int active = blockDim.x; active > 1;) {
        const unsigned int half = (active + 1) / 2;
        if (tid + half < active) {
            shared_mem[tid] += shared_mem[tid + half];
        }
        // Barrier is outside the branch so every thread in the block reaches it.
        __syncthreads();
        active = half;
    }

    if (tid == 0) {
        d_out[blockIdx.x] = shared_mem[0];
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * In-place inclusive prefix sum: on return d_in[k] holds d_in[0] + ... + d_in[k].
 *
 * Each pass adds the element `s` positions to the left, with s doubling every
 * pass. Reads and writes of a pass are separated by __syncthreads(), which
 * only synchronizes threads of one block.
 *
 * Launch requirements:
 *   - 1D grid / 1D block with at least n threads in total.
 *   - NOTE: because the only synchronization is the block-level barrier, the
 *     result is only guaranteed when all n elements are handled by a single
 *     block (n <= blockDim.x). Larger inputs need a multi-block scan scheme.
 *
 * @param d_in  array of n ints, scanned in place
 * @param n     number of elements
 */
__global__ void scan_kernel(int* d_in, int n) {
    // Uniform across the block, so returning here cannot split a barrier.
    if (n <= 0) return;

    const unsigned int count = static_cast<unsigned int>(n);
    const unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;
    // Threads past the end must NOT return early: they still have to reach
    // every __syncthreads() below, so they are masked out with a predicate.
    const bool in_range = tid < count;

    // A stride of s >= n has no source element for any valid thread, so the
    // loop stops at s < n.
    for (unsigned int s = 1; s < count; s *= 2) {
        const bool has_src = in_range && tid >= s;

        int val = 0;
        if (has_src) {
            val = d_in[tid - s];
        }
        // All reads of this pass complete before any write.
        __syncthreads();

        if (has_src) {
            d_in[tid] += val;
        }
        // All writes of this pass complete before the next pass reads.
        __syncthreads();
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment