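"""Linear (bilinear) interpolation ops for Theano's old sandbox.cuda GPU backend,
implemented with hand-written PyCUDA kernels.

LinearInterpolationCUDAOp takes an image stack f of shape (batch, height, width)
together with precomputed corner indices `coords` and weights `wts`, both of shape
(out_height, out_width, 4), and blends the four corner values per output pixel.
LinearInterpolationGradCUDAOp scatters the gradient of that output back onto the
image grid with atomicAdd. test2() and testgrad() compare both ops against CPU
reference kernels from the external linear_interpolation_kernels module.
"""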
import numpy as np

import theano
import theano.misc.pycuda_init
from pycuda.compiler import SourceModule
import theano.sandbox.cuda as cuda
from theano.sandbox.cuda import GpuOp

class LinearInterpolationCUDAOp(GpuOp):
    __props__ = ()

    #def __init__(self, inds):
    #    self.inds = inds

    def make_node(self, f, coords, wts):
        # Only float32 is supported on GPU, so automatically convert float64 to float32
        if coords.dtype == 'float64':
            coords = coords.astype('float32')
        if wts.dtype == 'float64':
            wts = wts.astype('float32')

        assert f.dtype == 'float32'
        assert coords.dtype == 'float32'
        assert wts.dtype == 'float32'

        f = cuda.basic_ops.gpu_contiguous(
            cuda.basic_ops.as_cuda_ndarray_variable(f))
        coords = cuda.basic_ops.gpu_contiguous(
            cuda.basic_ops.as_cuda_ndarray_variable(coords))
        wts = cuda.basic_ops.gpu_contiguous(
            cuda.basic_ops.as_cuda_ndarray_variable(wts))

        return theano.Apply(self, [f, coords, wts], [f.type()])
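
    # make_thunk compiles the CUDA kernel with PyCUDA once when the Theano function
    # is built, and returns a thunk closure that launches it at call time.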
    def make_thunk(self, node, storage_map, _, _2):
        mod = SourceModule("""
        __global__ void linear_interpolation(float* f,
                                             float* coords,
                                             float* wts,
                                             float* output,
                                             int batch_size,
                                             int out_height,
                                             int out_width,
                                             int im_height,
                                             int im_width,
                                             int size)
        {
            int i = blockIdx.x * blockDim.x + threadIdx.x;
            if(i < size) {
                // Compute batch index, x-coordinate and y-coordinate from index i:
                // The dimensions of output are (batch, y, x)
                const int bi = i / (out_width * out_height);
                const int yi = (i / out_width) % out_height;
                const int xi = i % out_width;
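                // coords holds the integer corner indices (x0, x1, y0, y1) and wts the
                // matching interpolation weights (w00, w01, w10, w11) for this output
                // location; both are laid out as (out_height, out_width, 4).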
                const int x0 = coords[yi * out_width * 4 + xi * 4];
                const int x1 = coords[yi * out_width * 4 + xi * 4 + 1];
                const int y0 = coords[yi * out_width * 4 + xi * 4 + 2];
                const int y1 = coords[yi * out_width * 4 + xi * 4 + 3];

                const float w00 = wts[yi * out_width * 4 + xi * 4];
                const float w01 = wts[yi * out_width * 4 + xi * 4 + 1];
                const float w10 = wts[yi * out_width * 4 + xi * 4 + 2];
                const float w11 = wts[yi * out_width * 4 + xi * 4 + 3];

                const float f00 = f[bi * im_width * im_height + x0 * im_width + y0];
                const float f01 = f[bi * im_width * im_height + x0 * im_width + y1];
                const float f10 = f[bi * im_width * im_height + x1 * im_width + y0];
                const float f11 = f[bi * im_width * im_height + x1 * im_width + y1];

                output[i] = w00 * f00 + w01 * f01 + w10 * f10 + w11 * f11;
            }
        }
        """)
        pycuda_fct = mod.get_function("linear_interpolation")
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]
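
        # The thunk reads the inputs from Theano's storage cells, (re)allocates the
        # output only when its shape changes, and launches one thread per output
        # element in blocks of 1024 threads.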
        def thunk():
            z = outputs[0]
            outshape = (inputs[0][0].shape[0], inputs[1][0].shape[0], inputs[1][0].shape[1])
            if z[0] is None or z[0].shape != outshape:
                z[0] = cuda.CudaNdarray.zeros(outshape)
            n = 2 ** 10.
            grid = (int(np.ceil(np.prod(outshape) / n)), 1)
            pycuda_fct(inputs[0][0],
                       inputs[1][0],
                       inputs[2][0],
                       z[0],
                       np.intc(outshape[0]),
                       np.intc(outshape[1]),
                       np.intc(outshape[2]),
                       np.intc(inputs[0][0].shape[1]),
                       np.intc(inputs[0][0].shape[2]),
                       np.intc(np.prod(outshape)),
                       block=(int(n), 1, 1), grid=grid)

        thunk.lazy = False
        return thunk

    def grad(self, inputs, output_grads):
        return [LinearInterpolationGradCUDAOp()(output_grads[0], inputs[1], inputs[2]),
                theano.gradient.grad_not_implemented(self, 1, inputs[1]),
                theano.gradient.grad_not_implemented(self, 2, inputs[2])]
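

# A minimal sketch (not part of the ops themselves) of how `coords` and `wts` could
# be precomputed on the CPU for the layout the kernels assume: for every output pixel,
# coords[y, x] = (x0, x1, y0, y1) are the integer corner indices and
# wts[y, x] = (w00, w01, w10, w11) the matching bilinear weights. The helper name and
# the fractional sample grids `xs`, `ys` are hypothetical, and boundary clamping is
# omitted here.
def precompute_coords_and_wts(xs, ys):
    # xs, ys: float arrays of shape (out_height, out_width) with fractional sample
    # positions along the first and second image axis, respectively.
    x0 = np.floor(xs)
    y0 = np.floor(ys)
    x1 = x0 + 1
    y1 = y0 + 1
    dx = xs - x0
    dy = ys - y0
    coords = np.stack([x0, x1, y0, y1], axis=-1).astype('float32')
    wts = np.stack([(1 - dx) * (1 - dy),   # w00 -> corner (x0, y0)
                    (1 - dx) * dy,         # w01 -> corner (x0, y1)
                    dx * (1 - dy),         # w10 -> corner (x1, y0)
                    dx * dy],              # w11 -> corner (x1, y1)
                   axis=-1).astype('float32')
    return coords, wts
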
def test2():
    from linear_interpolation_kernels import *
    import theano.tensor as T

    op = LinearInterpolationCUDAOp()

    x_t = T.ftensor3()
    w_t = T.ftensor3()
    i_t = T.ftensor3()
    y_t = op(x_t, i_t, w_t)
    y_f = theano.function([x_t, i_t, w_t], y_t)

    cost = T.sum(y_t)
    y_grad_t = T.grad(cost, [x_t])
    y_grad_f = theano.function([x_t, i_t, w_t], y_grad_t)

    core = (300, 400)
    batch = (32,)
    #x = np.arange(2 * 3 * 4).reshape(2, 3, 4).astype('float32')
    x = np.random.randn(*(batch + core)).astype('float32')
    w = np.random.randn(*(core + (4,))).astype('float32')
    i = np.random.randint(0, 300, size=core + (4,)).astype('float32')

    yg = y_grad_f(x, i, w)[0]
    print 'GRAD:', yg, yg.shape
    #yg2 = np.zeros(batch + core)
    #linear_interpolation_grad_precomputed(x.astype('float64'), i.astype('int64'), w.astype('float64'), yg2)

    print x.shape, i.shape, w.shape

    import time
    begin = time.time()
    for a in range(100):
        y = y_f(x, i, w)
    end = time.time()
    print 'time gpu:', end - begin

    y = np.asarray(y)
    #print np.asarray(y)
    #print 'x\n', x
    #print 'i\n', i
    #print 'w\n', w

    y2 = np.zeros(batch + core).astype('float64')
    x = x.astype('float64')
    i = i.astype('int64')
    w = w.astype('float64')
    begin = time.time()
    for a in range(100):
        linear_interpolation_precomputed(x.astype('float64'), i.astype('int64'), w.astype('float64'), y2)
    end = time.time()
    print 'time cpu', end - begin

    #print y2
    #print y
    print(np.sum(np.abs(y-y2)))
    #return y, y2
    #print np.unique(y)
    #print np.unique(y2)
    #print y - y2
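
# Gradient of LinearInterpolationCUDAOp with respect to the image f: each thread
# reads one element of output_grad and scatters its four weighted contributions back
# onto the corresponding corner pixels with atomicAdd (several output pixels may
# share a corner, hence the atomics).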
class LinearInterpolationGradCUDAOp(GpuOp):
    __props__ = ()

    def make_node(self, output_grad, coords, wts):
        output_grad = cuda.basic_ops.gpu_contiguous(
            cuda.basic_ops.as_cuda_ndarray_variable(output_grad))
        coords = cuda.basic_ops.gpu_contiguous(
            cuda.basic_ops.as_cuda_ndarray_variable(coords))
        wts = cuda.basic_ops.gpu_contiguous(
            cuda.basic_ops.as_cuda_ndarray_variable(wts))

        assert output_grad.dtype == 'float32'
        assert coords.dtype == 'float32'
        assert wts.dtype == 'float32'

        return theano.Apply(self, [output_grad, coords, wts], [output_grad.type()])
    def make_thunk(self, node, storage_map, _, _2):
        mod = SourceModule("""
        __global__ void linear_interpolation_grad(float* output_grad,
                                                  float* coords,
                                                  float* wts,
                                                  float* grad_f,
                                                  int batch_size,
                                                  int out_height,
                                                  int out_width,
                                                  int im_height,
                                                  int im_width,
                                                  int size)
        {
            int i = blockIdx.x * blockDim.x + threadIdx.x;
            if(i < size) {
                // Compute batch index, x-coordinate and y-coordinate from index i:
                // The dimensions of output are (batch, y, x)
                const int bi = i / (out_width * out_height);
                const int yi = (i / out_width) % out_height;
                const int xi = i % out_width;

                const int x0 = coords[yi * out_width * 4 + xi * 4];
                const int x1 = coords[yi * out_width * 4 + xi * 4 + 1];
                const int y0 = coords[yi * out_width * 4 + xi * 4 + 2];
                const int y1 = coords[yi * out_width * 4 + xi * 4 + 3];

                const float w00 = wts[yi * out_width * 4 + xi * 4];
                const float w01 = wts[yi * out_width * 4 + xi * 4 + 1];
                const float w10 = wts[yi * out_width * 4 + xi * 4 + 2];
                const float w11 = wts[yi * out_width * 4 + xi * 4 + 3];

                // Forward pass computed: output[i] = w00 * f00 + w01 * f01 + w10 * f10 + w11 * f11
                const float og = output_grad[i]; // This is output_grad[bi, yi, xi]
                atomicAdd(grad_f + bi * im_width * im_height + x0 * im_width + y0, w00 * og);
                atomicAdd(grad_f + bi * im_width * im_height + x0 * im_width + y1, w01 * og);
                atomicAdd(grad_f + bi * im_width * im_height + x1 * im_width + y0, w10 * og);
                atomicAdd(grad_f + bi * im_width * im_height + x1 * im_width + y1, w11 * og);
            }
        }
        """)
        pycuda_fct = mod.get_function("linear_interpolation_grad")
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]
        def thunk():
            z = outputs[0]
            outshape = (inputs[0][0].shape[0], inputs[1][0].shape[0], inputs[1][0].shape[1])
            # The kernel accumulates into grad_f with atomicAdd, so the output buffer
            # must start from zero on every call and cannot simply be reused.
            z[0] = cuda.CudaNdarray.zeros(outshape)
            n = 1024.
            grid = (int(np.ceil(np.prod(outshape) / n)), 1)
            pycuda_fct(inputs[0][0],
                       inputs[1][0],
                       inputs[2][0],
                       z[0],
                       np.intc(outshape[0]),
                       np.intc(outshape[1]),
                       np.intc(outshape[2]),
                       np.intc(inputs[0][0].shape[1]),
                       np.intc(inputs[0][0].shape[2]),
                       np.intc(np.prod(outshape)),
                       block=(int(n), 1, 1), grid=grid)

        thunk.lazy = False
        return thunk

def testgrad():
    import theano.tensor as T

    op = LinearInterpolationGradCUDAOp()

    og_t = T.ftensor3()
    w_t = T.ftensor3()
    i_t = T.ftensor3()
    y_t = op(og_t, i_t, w_t)
    y_f = theano.function([og_t, i_t, w_t], y_t)

    core = (300, 400)
    batch = (32,)
    #og = np.arange(2 * 3 * 4).reshape(2, 3, 4).astype('float32')
    og = np.random.randn(*(batch + core)).astype('float32')
    w = np.random.randn(*(core + (4,))).astype('float32')
    i = np.random.randint(0, 3, size=core + (4,)).astype('float32')
    print og.shape, i.shape, w.shape
    print 'x\n', og
    print 'i\n', i
    print 'w\n', w

    import time
    begin = time.time()
    for a in range(100):
        y = y_f(og, i, w)
    end = time.time()
    print 'time gpu:', end - begin

    y = np.asarray(y)
    print y

    from linear_interpolation_kernels import *
    y2 = np.zeros(batch + core).astype('float64')
    og = og.astype('float64')
    i = i.astype('int64')
    w = w.astype('float64')
    begin = time.time()
    for a in range(100):
        linear_interpolation_grad_precomputed(og.astype('float64'), i.astype('int64'), w.astype('float64'), y2)
    end = time.time()
    print 'time cpu:', end - begin

    print y2
    print(np.sum(np.abs(y-y2)))
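
# Hypothetical entry point (assumes a CUDA-enabled Theano setup and that the
# linear_interpolation_kernels module used for the CPU reference is importable):
if __name__ == '__main__':
    test2()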