Last active
April 29, 2023 21:10
-
-
Save lebedov/3078644 to your computer and use it in GitHub Desktop.
Compare speed of several methods of copying data between two GPU devices
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Compare speed of several methods of copying data between two GPU devices. | |
""" | |
import atexit
import functools
import re
import time

import numpy as np
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
def func_timer(f):
    """Decorator that prints the wall-clock execution time of each call to *f*.

    The wrapped callable forwards all positional and keyword arguments to
    *f* and returns its result unchanged.
    """
    @functools.wraps(f)  # preserve f's name/docstring on the wrapper
    def wrapper(*args, **kwargs):
        start = time.time()
        res = f(*args, **kwargs)
        stop = time.time()
        # Parenthesized print is valid under both Python 2 and Python 3
        # (the original used a Python-2-only print statement).
        print('execution time = %.5f s' % (stop - start))
        return res
    return wrapper
# Expose default memory copy function:
if drv.get_driver_version() >= 4000:
    def memcpy_peer_uva(dest, src, size):
        """Copy *size* bytes from device pointer *src* to *dest*.

        Only defined when the CUDA driver is >= 4.0; presumably relies on
        unified virtual addressing so a plain device-to-device copy can
        move data between two devices -- TODO confirm against CUDA docs.
        """
        drv.memcpy_dtod(dest, src, size)
def memcpy_peer_host(dest, src, size, dest_ctx, src_ctx):
    """Copy *size* bytes from *src* (in *src_ctx*) to *dest* (in *dest_ctx*)
    by staging the data through a pinned host buffer.

    dest/src are raw device pointers; dest_ctx/src_ctx are the PyCUDA
    contexts that own them.  Each context is made current only for its own
    half of the transfer -- the push/pop ordering is load-bearing.
    """
    # Created pinned host memory buffer; copying to and from it is
    # slightly faster than with an ordinary array as the size of the
    # data copied increases:
    host_buffer = drv.pagelocked_empty(size, np.byte)
    # Make src_context current and copy data to the host:
    src_ctx.push()
    drv.memcpy_dtoh(host_buffer, src)
    src_ctx.pop()
    # Make dest_context current and copy data from the host:
    dest_ctx.push()
    drv.memcpy_htod(dest, host_buffer)
    dest_ctx.pop()
def memcpy_peer_peer(dest, src, size, dest_ctx=None, src_ctx=None):
    """Copy *size* bytes from *src* to *dest* with a direct peer-to-peer
    transfer (thin wrapper around ``drv.memcpy_peer``).

    The contexts are optional and passed straight through; callers must
    have enabled peer access between the devices beforehand.
    """
    drv.memcpy_peer(dest, src, size, dest_ctx, src_ctx)
if __name__ == '__main__':
    # --- Device setup: the benchmark needs at least two GPUs. ---
    drv.init()
    dev0 = drv.Device(0)
    if dev0.count() < 2:
        raise ValueError('need more than one GPU to run')
    dev1 = drv.Device(1)

    # One context per device; register cleanup so both contexts are
    # popped at interpreter exit regardless of which one is current.
    ctx0 = dev0.make_context()
    ctx1 = dev1.make_context()
    atexit.register(ctx0.pop)
    atexit.register(ctx1.pop)
    ctx1.pop()

    # --- Source data on device 0. ---
    ctx0.push()
    x = np.random.rand(5*10**5)
    x_gpu = gpuarray.to_gpu(x)
    ctx0.pop()

    # --- Test 1: copy staged through pinned host memory. ---
    # Destination lives on device 1 (ctx1 is current when allocated).
    ctx1.push()
    y_gpu = gpuarray.zeros_like(x_gpu)
    func_timer(memcpy_peer_host)(y_gpu.ptr, x_gpu.ptr,
                                 x_gpu.dtype.itemsize*x_gpu.size,
                                 ctx1, ctx0)
    if not np.allclose(x, y_gpu.get()):
        # Parenthesized print is valid under both Python 2 and Python 3.
        print('host copy failed')

    # --- Test 2: direct device-to-device copy (needs CUDA >= 4.0 UVA). ---
    if drv.get_driver_version() < 4000:
        print('need CUDA 4.0 or later to test UVA copy')
    else:
        y_gpu = gpuarray.zeros_like(x_gpu)
        func_timer(memcpy_peer_uva)(y_gpu.ptr, x_gpu.ptr,
                                    x_gpu.dtype.itemsize*x_gpu.size)
        if not np.allclose(x, y_gpu.get()):
            print('UVA copy failed')

    # --- Test 3: peer-to-peer copy (Tesla-class devices only). ---
    if not (re.match('Tesla', dev0.name()) and
            re.match('Tesla', dev1.name())):
        print('not testing peer-to-peer copy on non-Tesla devices')
    else:
        # Enable peer access in both directions between the two contexts:
        ctx1.enable_peer_access(ctx0)
        ctx1.pop()
        ctx0.push()
        ctx0.enable_peer_access(ctx1)
        ctx0.pop()
        ctx1.push()
        y_gpu = gpuarray.zeros_like(x_gpu)
        func_timer(memcpy_peer_peer)(y_gpu.ptr, x_gpu.ptr,
                                     x_gpu.dtype.itemsize*x_gpu.size,
                                     ctx1, ctx0)
        if not np.allclose(x, y_gpu.get()):
            print('Peer-to-peer copy failed')
Will do, thanks.
Have a good one.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@MarwanAbdelatti Sounds plausible - wouldn't hurt to ask the cupy developers about it.