-
-
Save rjpower/5979059 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Compare speed of several methods of copying data between two GPU devices. | |
""" | |
import atexit
import ctypes
import functools
import re
import sys
import time

import numpy as np
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
# Size of the transfer in bytes (default 100 MB), optionally overridden by the
# first command-line argument.
# NOTE(review): `bytes` shadows the Python builtin of the same name.
if len(sys.argv) < 2:
    bytes = 100 * 1000 * 1000
else:
    bytes = int(sys.argv[1])
# Page-locked host staging buffer; allocated in test_devices() for each pair.
host_buffer = None
# Source payload: `bytes` int8 elements, all ones.
x = np.ones((bytes,), dtype=np.byte)
def func_timer(f, a, b):
    """Wrap copy function *f* so each call runs 10 iterations and reports timing.

    Prints one line -- "<bytes> <a> <b> <func_name> <total_seconds>" -- where the
    time covers all 10 iterations combined.  *a* and *b* are the device indices
    being benchmarked; they are only used for the report.

    Returns the wrapped callable; the wrapper returns the last iteration's result.
    """
    @functools.wraps(f)  # preserve f.__name__ for the printed report
    def wrapper(*args, **kwargs):
        start = time.time()
        for i in range(10):
            res = f(*args, **kwargs)
        stop = time.time()
        # Same output as the py2 `print` statement, but valid on py2 and py3.
        sys.stdout.write('%s %s %s %s %.5f\n' % (bytes, a, b, f.__name__, stop - start))
        return res
    return wrapper
def sync_ctx(ctx):
    """Make *ctx* current, wait for all its pending work to finish, then restore.

    The pop is in a ``finally`` so the context stack is balanced even if
    ``synchronize()`` raises (matches the pattern used by enable_peer_access).
    """
    ctx.push()
    try:
        ctx.synchronize()
    finally:
        ctx.pop()
def force_sync(dest):
    """Force completion of pending work on *dest* by a blocking readback.

    Fetching one element to the host blocks until prior async operations on
    the array have finished.  Reads the first element (not a fixed offset of
    100 as before) so buffers of any nonzero length work -- the value itself
    is discarded.
    """
    dest[0:1].get()[0]
# Expose default memory copy function:
if drv.get_driver_version() >= 4000:
    # CUDA 4.0+ supports Unified Virtual Addressing, so a plain cuMemcpy can
    # move data between devices.  Bind it directly from the driver library.
    cuda = ctypes.cdll.LoadLibrary('libcuda.so')
    cuda.cuMemcpy.restype = int
    # ByteCount is size_t in the CUDA driver API; c_int would truncate
    # transfers of 2 GB or more.
    cuda.cuMemcpy.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t]
    def memcpy_peer_uva(dest, src, size, dest_ctx=None, src_ctx=None):
        """Copy *size* bytes from src to dest via UVA cuMemcpy.

        Contexts are unused: UVA resolves which device owns each pointer.
        Raises AssertionError on a nonzero CUDA error code.
        """
        res = cuda.cuMemcpy(dest.ptr, src.ptr, size)
        assert res == 0, res
        force_sync(dest)
def memcpy_peer_host(dest, src, size, dest_ctx, src_ctx):
    """Copy src -> dest by staging the data through the page-locked host buffer.

    NOTE(review): `size` is unused -- the transfer length is implied by the
    global `host_buffer` allocation; confirm callers always pass a matching
    size.  Each pop is in a ``finally`` so a failed memcpy cannot leave a
    context pushed (matches the enable_peer_access pattern).
    """
    # Make src_ctx current and copy data to the host:
    src_ctx.push()
    try:
        drv.memcpy_dtoh(host_buffer, src.ptr)
    finally:
        src_ctx.pop()
    # Make dest_ctx current and copy data from the host:
    dest_ctx.push()
    try:
        drv.memcpy_htod(dest.ptr, host_buffer)
    finally:
        dest_ctx.pop()
    force_sync(dest)
def memcpy_peer_peer(dest, src, size, dest_ctx=None, src_ctx=None):
    """Copy *size* bytes from src to dest with the driver's direct
    peer-to-peer copy, then block until the transfer lands."""
    dst_ptr = dest.ptr
    src_ptr = src.ptr
    drv.memcpy_peer(dst_ptr, src_ptr, size, dest_ctx, src_ctx)
    force_sync(dest)
def enable_peer_access(a, b):
    """Make context *a* current and enable its peer access to context *b*.

    ``push`` happens BEFORE the ``try`` -- the original pushed inside it, so a
    failed push would have popped a context that was never pushed, corrupting
    the context stack.  The pop still always runs once the push succeeded.
    """
    a.push()
    try:
        a.enable_peer_access(b)
    finally:
        a.pop()
def test_devices(a, b):
    """Benchmark host-staged, UVA, and peer-to-peer copies between device
    indices *a* and *b*, printing one timing line per method.

    NOTE(review): contexts are popped but never detached, so they leak until
    process exit -- acceptable for a one-shot benchmark.
    """
    global host_buffer
    dev0 = drv.Device(a)
    dev1 = drv.Device(b)
    ctx0 = dev0.make_context(drv.ctx_flags.SCHED_BLOCKING_SYNC)
    host_buffer = drv.pagelocked_empty(bytes, np.byte)
    x_gpu = gpuarray.to_gpu(x)
    ctx1 = dev1.make_context(drv.ctx_flags.SCHED_BLOCKING_SYNC)
    y_gpu = gpuarray.zeros((bytes,), dtype=np.byte)
    nbytes = x_gpu.dtype.itemsize * x_gpu.size  # hoisted: same value for all three calls
    func_timer(memcpy_peer_host, a, b)(y_gpu, x_gpu, nbytes, ctx1, ctx0)
    func_timer(memcpy_peer_uva, a, b)(y_gpu, x_gpu, nbytes, ctx1, ctx0)
    try:
        enable_peer_access(ctx0, ctx1)
        enable_peer_access(ctx1, ctx0)
        func_timer(memcpy_peer_peer, a, b)(y_gpu, x_gpu, nbytes, ctx1, ctx0)
    except Exception:
        # Peer access is unsupported for this device pair; record a zero time.
        # Narrowed from a bare `except:`, which also swallowed KeyboardInterrupt.
        sys.stdout.write('%s %s %s memcpy_peer_peer 0\n' % (bytes, a, b))
    ctx1.pop()
    ctx0.pop()
if __name__ == '__main__':
    # Benchmark every unordered pair of GPUs visible to the driver.
    drv.init()
    num_devices = drv.Device.count()
    for first in range(num_devices):
        for second in range(first + 1, num_devices):
            test_devices(first, second)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment