Last active
October 22, 2020 08:23
-
-
Save randompast/73b23a7d2560305be8bddb3a2b9f3a53 to your computer and use it in GitHub Desktop.
Third order convolution. Mini sanity check with benchmark: cusignal vs njit vs numba.cuda
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# (cusignal-dev) ~/Desktop/code/cs_fork/cusignal$ python3 discourse_2.py | |
# [-0.06169884 -0.07237074 0.08575766 -0.03845544 0.37396236] | |
# [-0.06169884 -0.07237074 0.08575766 -0.03845544 0.37396236] | |
# [-0.06169884 -0.07237074 0.08575766 -0.03845544 0.37396236] | |
# [-0.06169884 -0.07237074 0.08575766 -0.03845544 0.37396236] | |
# True True True | |
# 481 -135.8448081001215 | |
# 481 -135.84480810012172 | |
# 481 -135.8448081001215 | |
# 481 -135.84480810012175 | |
# init 0.22148 0.25764 0.15601 0.18100 | |
# size 1k-ops cusig njit cuda_1 cuda_2 | |
# 10 491 0.02558 0.07133 0.12641 0.11735 | |
# 20 3848 0.02641 0.51370 0.34600 0.31686 | |
# 30 12717 0.03262 1.67938 0.88682 0.86864 | |
# 40 29504 0.04509 4.28684 1.99569 2.01400 | |
# 50 56375 0.06790 7.96369 3.92720 3.92942 | |
import time | |
from numba import cuda, njit, jit | |
import numpy as np | |
import cupy as cp | |
import cusignal as cs | |
import warnings | |
warnings.filterwarnings('ignore') | |
@cuda.jit
def conv_1d3o_cuda_1(x,k,y): #convolution 1 dimension 3rd order
    # One thread per output sample: thread index n owns y[n] exclusively,
    # so plain (non-atomic) accumulation is safe.
    n = cuda.grid(1)
    if (0 <= n) and (n < y.size):
        # d is the index of the newest input sample contributing to y[n];
        # assumes x.size >= y.size + k.shape[0] - 1 ("valid"-mode output).
        d = n+k.shape[0]-1
        for i in range(k.shape[0]):
            for j in range(k.shape[1]):
                for l in range(k.shape[2]):
                    # Third-order term: product of three (possibly repeated)
                    # input samples weighted by the 3-D kernel entry.
                    y[n] += x[d-i] * x[d-j] * x[d-l] * k[i,j,l]
@cuda.jit
def conv_1d3o_cuda_2(x, k, y):  # convolution 1 dimension 3rd order
    # Grid-stride loop: each thread handles output indices
    # start, start+stride, start+2*stride, ... so any launch configuration
    # covers all of y.
    #
    # BUG FIX: the original reused the stride-loop variable `i` for the first
    # kernel dimension and computed `d` / wrote `y` with the raw thread id `n`
    # instead of the stride-loop index. It was only accidentally correct when
    # the grid had at least y.size threads (one outer iteration per thread).
    start = cuda.grid(1)
    stride = cuda.gridsize(1)
    for n in range(start, y.size, stride):
        d = n + k.shape[0] - 1
        acc = 0.0
        for i in range(k.shape[0]):
            for j in range(k.shape[1]):
                for l in range(k.shape[2]):
                    acc += x[d - i] * x[d - j] * x[d - l] * k[i, j, l]
        # Exactly one thread owns index n in a grid-stride loop, so the
        # original's cuda.atomic.add was redundant; a plain add suffices.
        y[n] += acc
@njit
def conv_1d3o_njit(x, k, y):  # serial CPU reference, 1-D third-order convolution
    # For each output index n, accumulate the triple sum
    # x[d-a] * x[d-b] * x[d-c] * k[a, b, c] into y[n] (in place).
    taps = k.shape[0]
    for n in range(y.size):
        d = n + taps - 1
        total = 0.0
        for a in range(taps):
            for b in range(k.shape[1]):
                for c in range(k.shape[2]):
                    total += x[d - a] * x[d - b] * x[d - c] * k[a, b, c]
        # Same addition order as summing term-by-term into a zeroed y[n],
        # so floating-point results match the original exactly.
        y[n] += total
def conv_1d3o_cj(f, x, k):
    """Launch CUDA kernel ``f`` on host arrays ``x`` (signal) and ``k`` (3-D kernel).

    Returns a device (cupy) array of length ``x.size - k.shape[0] + 1``
    ("valid"-mode output size). Dead commented-out host-copy code from the
    original has been removed.
    """
    xc = cuda.to_device(x)
    kc = cuda.to_device(k)
    # Grid sized from the device's SM count rather than the problem size;
    # the grid-stride kernel tolerates any grid shape.
    device_id = cp.cuda.Device()
    numSM = device_id.attributes["MultiProcessorCount"]
    th = (128,)
    b = (numSM * 20,)
    # Output must be zero-initialised because the kernels accumulate with +=.
    y = cp.zeros(x.size - k.shape[0] + 1)
    f[b, th](xc, kc, y)
    return y
def conv_1d3o_nj(x, k):
    """CPU wrapper: allocate a zeroed 'valid'-length output and fill it via njit."""
    out = np.zeros(x.size - k.shape[0] + 1)
    conv_1d3o_njit(x, k, out)
    return out
def make_xKs(d, xsize):
    """Build deterministic test fixtures: a length-``xsize`` signal and a
    ``(d, d, d)`` kernel, both uniform in (-1, 1).

    Reseeds NumPy's global RNG with 0 on every call, so identical arguments
    always yield identical arrays.
    """
    np.random.seed(0)
    signal = np.random.uniform(-1, 1, xsize)
    kernel = np.random.uniform(-1, 1, (d, d, d))
    return signal, kernel
def test_time_1d_conv(n,f,x,k): | |
start = time.time() | |
for i in range(n): | |
y = f(x,k) | |
elapsed = time.time() - start | |
return y, elapsed | |
def test_time_1d_conv_cuda(n, f, x, k):
    """Time ``n`` launches of CUDA kernel ``f`` via conv_1d3o_cj.

    Returns (last device result, total wall seconds). Note the timing
    includes host-to-device transfer done inside conv_1d3o_cj.
    """
    t0 = time.time()
    for _ in range(n):
        out = conv_1d3o_cj(f, x, k)
    elapsed = time.time() - t0
    return out, elapsed
def prime():
    """Run each implementation once on a tiny problem (d=2, 5 samples).

    Presumably intended to absorb one-time setup costs (JIT compilation,
    device initialisation) before benchmarking. Returns the four warm-up
    times: (cusignal, njit, cuda_1, cuda_2).
    """
    args = make_xKs(2, 5)
    reps = 1
    _, t0 = test_time_1d_conv(reps, cs.convolve1d3o, *args)
    _, t1 = test_time_1d_conv(reps, conv_1d3o_nj, *args)
    _, t2 = test_time_1d_conv_cuda(reps, conv_1d3o_cuda_1, *args)
    _, t3 = test_time_1d_conv_cuda(reps, conv_1d3o_cuda_2, *args)
    return t0, t1, t2, t3
def benchmark(n):
    """Print a tab-separated timing table for kernel sizes d = 10..50.

    Each row shows d, the work size in thousands of kernel terms, and the
    total time of ``n`` runs for cusignal, njit, cuda_1 and cuda_2.
    """
    print("\t".join(("size", "1k-ops", "cusig", "njit", "cuda_1", "cuda_2")))
    for d in range(10, 60, 10):
        args = make_xKs(d, 500)
        x, k = args
        # Total multiply-accumulate terms (kernel volume x output length), in thousands.
        ops = k.shape[0] * k.shape[1] * k.shape[2] * (x.size - k.shape[0] + 1) // 1000
        _, t0 = test_time_1d_conv(n, cs.convolve1d3o, *args)
        _, t1 = test_time_1d_conv(n, conv_1d3o_nj, *args)
        _, t2 = test_time_1d_conv_cuda(n, conv_1d3o_cuda_1, *args)
        _, t3 = test_time_1d_conv_cuda(n, conv_1d3o_cuda_2, *args)
        print(f"{d:4d}\t{ops:4d}\t{t0:0.5f}\t{t1:0.5f}\t{t2:0.5f}\t{t3:0.5f}")
def check_simple():
    """Cross-check all four implementations on a small random problem.

    Prints each result vector and whether njit/cuda_1/cuda_2 agree with the
    cusignal reference elementwise (within np.isclose tolerance).
    """
    # NOTE(review): the original built an np.arange fixture here and
    # immediately overwrote it with the random one; the dead assignment and
    # the commented-out prints have been removed.
    args = make_xKs(4, 8)
    y0 = cs.convolve1d3o(*args)
    y1 = conv_1d3o_nj(*args)
    y2 = conv_1d3o_cj(conv_1d3o_cuda_1, *args)
    y3 = conv_1d3o_cj(conv_1d3o_cuda_2, *args)
    print(y0)
    print(y1)
    print(y2)
    print(y3)
    c1 = np.all( np.isclose(y0, y1) )
    c2 = np.all( np.isclose(y0, y2) )
    c3 = np.all( np.isclose(y0, y3) )
    print(c1, c2, c3)
def check_consistancy():  # (sic) misspelling kept: check_sanity() calls this exact name
    """Print size and elementwise sum of each implementation's output on a
    larger problem (d=20, 500 samples) for side-by-side comparison."""
    args = make_xKs(20, 500)
    outputs = (
        cs.convolve1d3o(*args),
        conv_1d3o_nj(*args),
        conv_1d3o_cj(conv_1d3o_cuda_1, *args),
        conv_1d3o_cj(conv_1d3o_cuda_2, *args),
    )
    for y in outputs:
        print(y.size, np.sum(y))
def check_sanity():
    """Warm everything up, run both correctness checks, then report the
    warm-up ("init") times in the same tabular format as benchmark()."""
    t0, t1, t2, t3 = prime()
    check_simple()
    check_consistancy()
    print(f"init\t\t{t0:0.5f}\t{t1:0.5f}\t{t2:0.5f}\t{t3:0.5f}")
if __name__ == "__main__":
    # Verify correctness first, then run the timing table with 100 repetitions.
    check_sanity()
    benchmark(100)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment