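"""Linear (bilinear) interpolation ops for Theano's old sandbox.cuda GPU backend,
implemented with hand-written PyCUDA kernels.

LinearInterpolationCUDAOp takes an image stack f of shape (batch, height, width)
together with precomputed corner indices `coords` and weights `wts`, both of shape
(out_height, out_width, 4), and blends the four corner values per output pixel.
LinearInterpolationGradCUDAOp scatters the gradient of that output back onto the
image grid with atomicAdd. test2() and testgrad() compare both ops against CPU
reference kernels from the external linear_interpolation_kernels module.
"""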
import numpy as np

import theano
import theano.misc.pycuda_init
from pycuda.compiler import SourceModule
import theano.sandbox.cuda as cuda
from theano.sandbox.cuda import GpuOp

class LinearInterpolationCUDAOp(GpuOp):
    __props__ = ()

    #def __init__(self, inds):
    #    self.inds = inds

    def make_node(self, f, coords, wts):
        # Only float32 is supported on GPU, so automatically convert float64 to float32
        if coords.dtype == 'float64':
            coords = coords.astype('float32')
        if wts.dtype == 'float64':
            wts = wts.astype('float32')

        assert f.dtype == 'float32'
        assert coords.dtype == 'float32'
        assert wts.dtype == 'float32'

        f = cuda.basic_ops.gpu_contiguous(
            cuda.basic_ops.as_cuda_ndarray_variable(f))
        coords = cuda.basic_ops.gpu_contiguous(
            cuda.basic_ops.as_cuda_ndarray_variable(coords))
        wts = cuda.basic_ops.gpu_contiguous(
            cuda.basic_ops.as_cuda_ndarray_variable(wts))

        return theano.Apply(self, [f, coords, wts], [f.type()])
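
    # make_thunk compiles the CUDA kernel with PyCUDA once when the Theano function
    # is built, and returns a thunk closure that launches it at call time.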
    def make_thunk(self, node, storage_map, _, _2):
        mod = SourceModule("""
        __global__ void linear_interpolation(float* f,
                                             float* coords,
                                             float* wts,
                                             float* output,
                                             int batch_size,
                                             int out_height,
                                             int out_width,
                                             int im_height,
                                             int im_width,
                                             int size)
        {
            int i = blockIdx.x * blockDim.x + threadIdx.x;
            if(i < size) {
                // Compute batch index, x-coordinate and y-coordinate from index i:
                // The dimensions of output are (batch, y, x)
                const int bi = i / (out_width * out_height);
                const int yi = (i / out_width) % out_height;
                const int xi = i % out_width;
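                // coords holds the integer corner indices (x0, x1, y0, y1) and wts the
                // matching interpolation weights (w00, w01, w10, w11) for this output
                // location; both are laid out as (out_height, out_width, 4).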
                const int x0 = coords[yi * out_width * 4 + xi * 4];
                const int x1 = coords[yi * out_width * 4 + xi * 4 + 1];
                const int y0 = coords[yi * out_width * 4 + xi * 4 + 2];
                const int y1 = coords[yi * out_width * 4 + xi * 4 + 3];

                const float w00 = wts[yi * out_width * 4 + xi * 4];
                const float w01 = wts[yi * out_width * 4 + xi * 4 + 1];
                const float w10 = wts[yi * out_width * 4 + xi * 4 + 2];
                const float w11 = wts[yi * out_width * 4 + xi * 4 + 3];

                const float f00 = f[bi * im_width * im_height + x0 * im_width + y0];
                const float f01 = f[bi * im_width * im_height + x0 * im_width + y1];
                const float f10 = f[bi * im_width * im_height + x1 * im_width + y0];
                const float f11 = f[bi * im_width * im_height + x1 * im_width + y1];

                output[i] = w00 * f00 + w01 * f01 + w10 * f10 + w11 * f11;
            }
        }
        """)
        pycuda_fct = mod.get_function("linear_interpolation")
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]
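
        # The thunk reads the inputs from Theano's storage cells, (re)allocates the
        # output only when its shape changes, and launches one thread per output
        # element in blocks of 1024 threads.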
        def thunk():
            z = outputs[0]
            outshape = (inputs[0][0].shape[0], inputs[1][0].shape[0], inputs[1][0].shape[1])
            if z[0] is None or z[0].shape != outshape:
                z[0] = cuda.CudaNdarray.zeros(outshape)
            n = 2 ** 10.
            grid = (int(np.ceil(np.prod(outshape) / n)), 1)
            pycuda_fct(inputs[0][0],
                       inputs[1][0],
                       inputs[2][0],
                       z[0],
                       np.intc(outshape[0]),
                       np.intc(outshape[1]),
                       np.intc(outshape[2]),
                       np.intc(inputs[0][0].shape[1]),
                       np.intc(inputs[0][0].shape[2]),
                       np.intc(np.prod(outshape)),
                       block=(int(n), 1, 1), grid=grid)

        thunk.lazy = False
        return thunk

    def grad(self, inputs, output_grads):
        return [LinearInterpolationGradCUDAOp()(output_grads[0], inputs[1], inputs[2]),
                theano.gradient.grad_not_implemented(self, 1, inputs[1]),
                theano.gradient.grad_not_implemented(self, 2, inputs[2])]
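

# A minimal sketch (not part of the ops themselves) of how `coords` and `wts` could
# be precomputed on the CPU for the layout the kernels assume: for every output pixel,
# coords[y, x] = (x0, x1, y0, y1) are the integer corner indices and
# wts[y, x] = (w00, w01, w10, w11) the matching bilinear weights. The helper name and
# the fractional sample grids `xs`, `ys` are hypothetical, and boundary clamping is
# omitted here.
def precompute_coords_and_wts(xs, ys):
    # xs, ys: float arrays of shape (out_height, out_width) with fractional sample
    # positions along the first and second image axis, respectively.
    x0 = np.floor(xs)
    y0 = np.floor(ys)
    x1 = x0 + 1
    y1 = y0 + 1
    dx = xs - x0
    dy = ys - y0
    coords = np.stack([x0, x1, y0, y1], axis=-1).astype('float32')
    wts = np.stack([(1 - dx) * (1 - dy),   # w00 -> corner (x0, y0)
                    (1 - dx) * dy,         # w01 -> corner (x0, y1)
                    dx * (1 - dy),         # w10 -> corner (x1, y0)
                    dx * dy],              # w11 -> corner (x1, y1)
                   axis=-1).astype('float32')
    return coords, wts
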
def test2():
    from linear_interpolation_kernels import *
    import theano.tensor as T

    op = LinearInterpolationCUDAOp()

    x_t = T.ftensor3()
    w_t = T.ftensor3()
    i_t = T.ftensor3()
    y_t = op(x_t, i_t, w_t)
    y_f = theano.function([x_t, i_t, w_t], y_t)

    cost = T.sum(y_t)
    y_grad_t = T.grad(cost, [x_t])
    y_grad_f = theano.function([x_t, i_t, w_t], y_grad_t)

    core = (300, 400)
    batch = (32,)
    #x = np.arange(2 * 3 * 4).reshape(2, 3, 4).astype('float32')
    x = np.random.randn(*(batch + core)).astype('float32')
    w = np.random.randn(*(core + (4,))).astype('float32')
    i = np.random.randint(0, 300, size=core + (4,)).astype('float32')

    yg = y_grad_f(x, i, w)[0]
    print 'GRAD:', yg, yg.shape
    #yg2 = np.zeros(batch + core)
    #linear_interpolation_grad_precomputed(x.astype('float64'), i.astype('int64'), w.astype('float64'), yg2)

    print x.shape, i.shape, w.shape

    import time
    begin = time.time()
    for a in range(100):
        y = y_f(x, i, w)
    end = time.time()
    print 'time gpu:', end - begin

    y = np.asarray(y)
    #print np.asarray(y)
    #print 'x\n', x
    #print 'i\n', i
    #print 'w\n', w

    y2 = np.zeros(batch + core).astype('float64')
    x = x.astype('float64')
    i = i.astype('int64')
    w = w.astype('float64')
    begin = time.time()
    for a in range(100):
        linear_interpolation_precomputed(x.astype('float64'), i.astype('int64'), w.astype('float64'), y2)
    end = time.time()
    print 'time cpu', end - begin

    #print y2
    #print y
    print(np.sum(np.abs(y-y2)))
    #return y, y2
    #print np.unique(y)
    #print np.unique(y2)
    #print y - y2
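
# Gradient of LinearInterpolationCUDAOp with respect to the image f: each thread
# reads one element of output_grad and scatters its four weighted contributions back
# onto the corresponding corner pixels with atomicAdd (several output pixels may
# share a corner, hence the atomics).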
class LinearInterpolationGradCUDAOp(GpuOp):
    __props__ = ()

    def make_node(self, output_grad, coords, wts):
        output_grad = cuda.basic_ops.gpu_contiguous(
            cuda.basic_ops.as_cuda_ndarray_variable(output_grad))
        coords = cuda.basic_ops.gpu_contiguous(
            cuda.basic_ops.as_cuda_ndarray_variable(coords))
        wts = cuda.basic_ops.gpu_contiguous(
            cuda.basic_ops.as_cuda_ndarray_variable(wts))

        assert output_grad.dtype == 'float32'
        assert coords.dtype == 'float32'
        assert wts.dtype == 'float32'

        return theano.Apply(self, [output_grad, coords, wts], [output_grad.type()])
    def make_thunk(self, node, storage_map, _, _2):
        mod = SourceModule("""
        __global__ void linear_interpolation_grad(float* output_grad,
                                                  float* coords,
                                                  float* wts,
                                                  float* grad_f,
                                                  int batch_size,
                                                  int out_height,
                                                  int out_width,
                                                  int im_height,
                                                  int im_width,
                                                  int size)
        {
            int i = blockIdx.x * blockDim.x + threadIdx.x;
            if(i < size) {
                // Compute batch index, x-coordinate and y-coordinate from index i:
                // The dimensions of output are (batch, y, x)
                const int bi = i / (out_width * out_height);
                const int yi = (i / out_width) % out_height;
                const int xi = i % out_width;

                const int x0 = coords[yi * out_width * 4 + xi * 4];
                const int x1 = coords[yi * out_width * 4 + xi * 4 + 1];
                const int y0 = coords[yi * out_width * 4 + xi * 4 + 2];
                const int y1 = coords[yi * out_width * 4 + xi * 4 + 3];

                const float w00 = wts[yi * out_width * 4 + xi * 4];
                const float w01 = wts[yi * out_width * 4 + xi * 4 + 1];
                const float w10 = wts[yi * out_width * 4 + xi * 4 + 2];
                const float w11 = wts[yi * out_width * 4 + xi * 4 + 3];

                // Forward pass computed: output[i] = w00 * f00 + w01 * f01 + w10 * f10 + w11 * f11
                const float og = output_grad[i]; // This is output_grad[bi, yi, xi]
                atomicAdd(grad_f + bi * im_width * im_height + x0 * im_width + y0, w00 * og);
                atomicAdd(grad_f + bi * im_width * im_height + x0 * im_width + y1, w01 * og);
                atomicAdd(grad_f + bi * im_width * im_height + x1 * im_width + y0, w10 * og);
                atomicAdd(grad_f + bi * im_width * im_height + x1 * im_width + y1, w11 * og);
            }
        }
        """)
        pycuda_fct = mod.get_function("linear_interpolation_grad")
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]
        def thunk():
            z = outputs[0]
            outshape = (inputs[0][0].shape[0], inputs[1][0].shape[0], inputs[1][0].shape[1])
            # The kernel accumulates into grad_f with atomicAdd, so the output buffer
            # must start from zero on every call and cannot simply be reused.
            z[0] = cuda.CudaNdarray.zeros(outshape)
            n = 1024.
            grid = (int(np.ceil(np.prod(outshape) / n)), 1)
            pycuda_fct(inputs[0][0],
                       inputs[1][0],
                       inputs[2][0],
                       z[0],
                       np.intc(outshape[0]),
                       np.intc(outshape[1]),
                       np.intc(outshape[2]),
                       np.intc(inputs[0][0].shape[1]),
                       np.intc(inputs[0][0].shape[2]),
                       np.intc(np.prod(outshape)),
                       block=(int(n), 1, 1), grid=grid)

        thunk.lazy = False
        return thunk

def testgrad():
    import theano.tensor as T

    op = LinearInterpolationGradCUDAOp()

    og_t = T.ftensor3()
    w_t = T.ftensor3()
    i_t = T.ftensor3()
    y_t = op(og_t, i_t, w_t)
    y_f = theano.function([og_t, i_t, w_t], y_t)

    core = (300, 400)
    batch = (32,)
    #og = np.arange(2 * 3 * 4).reshape(2, 3, 4).astype('float32')
    og = np.random.randn(*(batch + core)).astype('float32')
    w = np.random.randn(*(core + (4,))).astype('float32')
    i = np.random.randint(0, 3, size=core + (4,)).astype('float32')
    print og.shape, i.shape, w.shape
    print 'x\n', og
    print 'i\n', i
    print 'w\n', w

    import time
    begin = time.time()
    for a in range(100):
        y = y_f(og, i, w)
    end = time.time()
    print 'time gpu:', end - begin

    y = np.asarray(y)
    print y

    from linear_interpolation_kernels import *
    y2 = np.zeros(batch + core).astype('float64')
    og = og.astype('float64')
    i = i.astype('int64')
    w = w.astype('float64')
    begin = time.time()
    for a in range(100):
        linear_interpolation_grad_precomputed(og.astype('float64'), i.astype('int64'), w.astype('float64'), y2)
    end = time.time()
    print 'time cpu:', end - begin

    print y2
    print(np.sum(np.abs(y-y2)))
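
# Hypothetical entry point (assumes a CUDA-enabled Theano setup and that the
# linear_interpolation_kernels module used for the CPU reference is importable):
if __name__ == '__main__':
    test2()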