Last active
October 22, 2020 08:23
-
-
Save randompast/73b23a7d2560305be8bddb3a2b9f3a53 to your computer and use it in GitHub Desktop.
Third order convolution. Mini sanity check with benchmark: cusignal vs njit vs numba.cuda
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# (cusignal-dev) ~/Desktop/code/cs_fork/cusignal$ python3 discourse_2.py | |
# [-0.06169884 -0.07237074 0.08575766 -0.03845544 0.37396236] | |
# [-0.06169884 -0.07237074 0.08575766 -0.03845544 0.37396236] | |
# [-0.06169884 -0.07237074 0.08575766 -0.03845544 0.37396236] | |
# [-0.06169884 -0.07237074 0.08575766 -0.03845544 0.37396236] | |
# True True True | |
# 481 -135.8448081001215 | |
# 481 -135.84480810012172 | |
# 481 -135.8448081001215 | |
# 481 -135.84480810012175 | |
# init 0.22148 0.25764 0.15601 0.18100 | |
# size 1k-ops cusig njit cuda_1 cuda_2 | |
# 10 491 0.02558 0.07133 0.12641 0.11735 | |
# 20 3848 0.02641 0.51370 0.34600 0.31686 | |
# 30 12717 0.03262 1.67938 0.88682 0.86864 | |
# 40 29504 0.04509 4.28684 1.99569 2.01400 | |
# 50 56375 0.06790 7.96369 3.92720 3.92942 | |
import time | |
from numba import cuda, njit, jit | |
import numpy as np | |
import cupy as cp | |
import cusignal as cs | |
import warnings | |
warnings.filterwarnings('ignore') | |
@cuda.jit
def conv_1d3o_cuda_1(x,k,y): #convolution 1 dimension 3rd order
    # One thread per output sample: thread index n owns y[n] exclusively,
    # so plain (non-atomic) accumulation is safe.
    n = cuda.grid(1)
    if (0 <= n) and (n < y.size):
        # d is the index of the newest input sample contributing to y[n];
        # assumes x.size >= y.size + k.shape[0] - 1 ("valid"-mode output).
        d = n+k.shape[0]-1
        for i in range(k.shape[0]):
            for j in range(k.shape[1]):
                for l in range(k.shape[2]):
                    # Third-order term: product of three (possibly repeated)
                    # input samples weighted by the 3-D kernel entry.
                    y[n] += x[d-i] * x[d-j] * x[d-l] * k[i,j,l]
@cuda.jit
def conv_1d3o_cuda_2(x, k, y):  # convolution 1 dimension 3rd order
    # Grid-stride loop: each thread handles output indices
    # start, start+stride, start+2*stride, ... so any launch configuration
    # covers all of y.
    #
    # BUG FIX: the original reused the stride-loop variable `i` for the first
    # kernel dimension and computed `d` / wrote `y` with the raw thread id `n`
    # instead of the stride-loop index. It was only accidentally correct when
    # the grid had at least y.size threads (one outer iteration per thread).
    start = cuda.grid(1)
    stride = cuda.gridsize(1)
    for n in range(start, y.size, stride):
        d = n + k.shape[0] - 1
        acc = 0.0
        for i in range(k.shape[0]):
            for j in range(k.shape[1]):
                for l in range(k.shape[2]):
                    acc += x[d - i] * x[d - j] * x[d - l] * k[i, j, l]
        # Exactly one thread owns index n in a grid-stride loop, so the
        # original's cuda.atomic.add was redundant; a plain add suffices.
        y[n] += acc
@njit
def conv_1d3o_njit(x, k, y):  # serial CPU reference, 1-D third-order convolution
    # For each output index n, accumulate the triple sum
    # x[d-a] * x[d-b] * x[d-c] * k[a, b, c] into y[n] (in place).
    taps = k.shape[0]
    for n in range(y.size):
        d = n + taps - 1
        total = 0.0
        for a in range(taps):
            for b in range(k.shape[1]):
                for c in range(k.shape[2]):
                    total += x[d - a] * x[d - b] * x[d - c] * k[a, b, c]
        # Same addition order as summing term-by-term into a zeroed y[n],
        # so floating-point results match the original exactly.
        y[n] += total
def conv_1d3o_cj(f, x, k):
    """Launch CUDA kernel ``f`` on host arrays ``x`` (signal) and ``k`` (3-D kernel).

    Returns a device (cupy) array of length ``x.size - k.shape[0] + 1``
    ("valid"-mode output size). Dead commented-out host-copy code from the
    original has been removed.
    """
    xc = cuda.to_device(x)
    kc = cuda.to_device(k)
    # Grid sized from the device's SM count rather than the problem size;
    # the grid-stride kernel tolerates any grid shape.
    device_id = cp.cuda.Device()
    numSM = device_id.attributes["MultiProcessorCount"]
    th = (128,)
    b = (numSM * 20,)
    # Output must be zero-initialised because the kernels accumulate with +=.
    y = cp.zeros(x.size - k.shape[0] + 1)
    f[b, th](xc, kc, y)
    return y
def conv_1d3o_nj(x, k):
    """CPU wrapper: allocate a zeroed 'valid'-length output and fill it via njit."""
    out = np.zeros(x.size - k.shape[0] + 1)
    conv_1d3o_njit(x, k, out)
    return out
def make_xKs(d, xsize):
    """Build deterministic test fixtures: a length-``xsize`` signal and a
    ``(d, d, d)`` kernel, both uniform in (-1, 1).

    Reseeds NumPy's global RNG with 0 on every call, so identical arguments
    always yield identical arrays.
    """
    np.random.seed(0)
    signal = np.random.uniform(-1, 1, xsize)
    kernel = np.random.uniform(-1, 1, (d, d, d))
    return signal, kernel
def test_time_1d_conv(n,f,x,k): | |
start = time.time() | |
for i in range(n): | |
y = f(x,k) | |
elapsed = time.time() - start | |
return y, elapsed | |
def test_time_1d_conv_cuda(n, f, x, k):
    """Time ``n`` launches of CUDA kernel ``f`` via conv_1d3o_cj.

    Returns (last device result, total wall seconds). Note the timing
    includes host-to-device transfer done inside conv_1d3o_cj.
    """
    t0 = time.time()
    for _ in range(n):
        out = conv_1d3o_cj(f, x, k)
    elapsed = time.time() - t0
    return out, elapsed
def prime():
    """Run each implementation once on a tiny problem (d=2, 5 samples).

    Presumably intended to absorb one-time setup costs (JIT compilation,
    device initialisation) before benchmarking. Returns the four warm-up
    times: (cusignal, njit, cuda_1, cuda_2).
    """
    args = make_xKs(2, 5)
    reps = 1
    _, t0 = test_time_1d_conv(reps, cs.convolve1d3o, *args)
    _, t1 = test_time_1d_conv(reps, conv_1d3o_nj, *args)
    _, t2 = test_time_1d_conv_cuda(reps, conv_1d3o_cuda_1, *args)
    _, t3 = test_time_1d_conv_cuda(reps, conv_1d3o_cuda_2, *args)
    return t0, t1, t2, t3
def benchmark(n):
    """Print a tab-separated timing table for kernel sizes d = 10..50.

    Each row shows d, the work size in thousands of kernel terms, and the
    total time of ``n`` runs for cusignal, njit, cuda_1 and cuda_2.
    """
    print("\t".join(("size", "1k-ops", "cusig", "njit", "cuda_1", "cuda_2")))
    for d in range(10, 60, 10):
        args = make_xKs(d, 500)
        x, k = args
        # Total multiply-accumulate terms (kernel volume x output length), in thousands.
        ops = k.shape[0] * k.shape[1] * k.shape[2] * (x.size - k.shape[0] + 1) // 1000
        _, t0 = test_time_1d_conv(n, cs.convolve1d3o, *args)
        _, t1 = test_time_1d_conv(n, conv_1d3o_nj, *args)
        _, t2 = test_time_1d_conv_cuda(n, conv_1d3o_cuda_1, *args)
        _, t3 = test_time_1d_conv_cuda(n, conv_1d3o_cuda_2, *args)
        print(f"{d:4d}\t{ops:4d}\t{t0:0.5f}\t{t1:0.5f}\t{t2:0.5f}\t{t3:0.5f}")
def check_simple():
    """Cross-check all four implementations on a small random problem.

    Prints each result vector and whether njit/cuda_1/cuda_2 agree with the
    cusignal reference elementwise (within np.isclose tolerance).
    """
    # NOTE(review): the original built an np.arange fixture here and
    # immediately overwrote it with the random one; the dead assignment and
    # the commented-out prints have been removed.
    args = make_xKs(4, 8)
    y0 = cs.convolve1d3o(*args)
    y1 = conv_1d3o_nj(*args)
    y2 = conv_1d3o_cj(conv_1d3o_cuda_1, *args)
    y3 = conv_1d3o_cj(conv_1d3o_cuda_2, *args)
    print(y0)
    print(y1)
    print(y2)
    print(y3)
    c1 = np.all( np.isclose(y0, y1) )
    c2 = np.all( np.isclose(y0, y2) )
    c3 = np.all( np.isclose(y0, y3) )
    print(c1, c2, c3)
def check_consistancy():  # (sic) misspelling kept: check_sanity() calls this exact name
    """Print size and elementwise sum of each implementation's output on a
    larger problem (d=20, 500 samples) for side-by-side comparison."""
    args = make_xKs(20, 500)
    outputs = (
        cs.convolve1d3o(*args),
        conv_1d3o_nj(*args),
        conv_1d3o_cj(conv_1d3o_cuda_1, *args),
        conv_1d3o_cj(conv_1d3o_cuda_2, *args),
    )
    for y in outputs:
        print(y.size, np.sum(y))
def check_sanity():
    """Warm everything up, run both correctness checks, then report the
    warm-up ("init") times in the same tabular format as benchmark()."""
    t0, t1, t2, t3 = prime()
    check_simple()
    check_consistancy()
    print(f"init\t\t{t0:0.5f}\t{t1:0.5f}\t{t2:0.5f}\t{t3:0.5f}")
if __name__ == "__main__":
    # Verify correctness first, then run the timing table with 100 repetitions.
    check_sanity()
    benchmark(100)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment