Last active
November 6, 2019 09:56
-
-
Save Ext3h/6eb2df21873f5524bfd70a0368872d4e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <climits>
#include <cstddef>
#include <cstdlib>
#include <algorithm>
#include <future>
#include <iostream>
#include <mutex>
#include <string>
#include <thread>
#include <tuple>
#include <vector>
#include <cuda.h>
#pragma comment(lib, "cuda.lib") | |
// Scale up to enqueue more load until PCIe breaks
#define MEMCOPY_ITERATIONS 500
// Scale up to increase PCIe load without increasing driver overhead
const size_t MEMCOPY_SIZE = (1 << 27); // 128M
// Reports a failed CUDA driver call and aborts (see test::check). The macro is
// deliberately unhygienic: it expands the name `ctx`, so a test::container
// named `ctx` must be in scope at every call site.
// No trailing semicolon in the body: callers write `CHECK(...);` themselves,
// so a semicolon here would emit a stray empty statement after every call and
// break single-statement contexts such as `if (x) CHECK(y); else ...`.
#define CHECK(expression) test::check(expression, #expression, ctx, __FILE__, __LINE__)
namespace test { | |
class container { | |
public: | |
std::vector<void*> hostAllocations; | |
std::vector<CUdeviceptr> gpuAllocations; | |
CUdevice dev; | |
CUcontext ctx; | |
int dev_id; | |
}; | |
// Aborts the process with a diagnostic if a CUDA driver call failed.
// `command` is the stringified expression (supplied by the CHECK macro),
// `ctx` provides the device id for the message. Output is serialized with a
// mutex because the bandwidth tests call this from several worker threads.
void check(CUresult result, const char* command, const container &ctx, const char* file, int line) {
    if (result != CUDA_SUCCESS)
    {
        static std::mutex cerr_mutex;
        std::lock_guard<std::mutex> lock(cerr_mutex);
        const char* error = nullptr;
        cuGetErrorName(result, &error);
        // cuGetErrorName fails (and leaves `error` null) for unrecognized
        // codes; streaming a null char* into std::ostream is undefined
        // behavior, so substitute a placeholder.
        if (error == nullptr)
        {
            error = "UNKNOWN";
        }
        std::cerr << "CUDA error " << error << " (" << (int)result << ") in " << file << ":" << line << " on device " << ctx.dev_id << std::endl;
        std::cerr << command << std::endl;
        abort();
    }
}
void init(const std::vector<int> &device_ids, std::vector<container> &ctxs) { | |
ctxs.resize(device_ids.size()); | |
for (int i = 0; i < device_ids.size(); i++) | |
{ | |
auto &ctx = ctxs[i]; | |
ctx.dev_id = device_ids[i]; | |
CHECK(cuDeviceGet(&ctx.dev, ctx.dev_id)); | |
CHECK(cuDevicePrimaryCtxSetFlags(ctx.dev_id, CU_CTX_SCHED_BLOCKING_SYNC)); | |
CHECK(cuDevicePrimaryCtxRetain(&ctx.ctx, ctx.dev_id)); | |
ctx.hostAllocations.resize(4); | |
ctx.gpuAllocations.resize(std::max((size_t)8, device_ids.size() * 2)); | |
CHECK(cuCtxPushCurrent(ctx.ctx)); | |
for (auto &host : ctx.hostAllocations) | |
{ | |
CHECK(cuMemHostAlloc(&host, MEMCOPY_SIZE, CU_MEMHOSTALLOC_DEVICEMAP)); | |
} | |
for (auto &gpu : ctx.gpuAllocations) | |
{ | |
CHECK(cuMemAlloc(&gpu, MEMCOPY_SIZE)); | |
} | |
CHECK(cuCtxPopCurrent(nullptr)); | |
} | |
} | |
void deinit(std::vector<container> &ctxs) { | |
for (auto& ctx : ctxs) | |
{ | |
CHECK(cuCtxPushCurrent(ctx.ctx)); | |
for (auto &host : ctx.hostAllocations) | |
{ | |
CHECK(cuMemFreeHost(host)); | |
} | |
for (auto &gpu : ctx.gpuAllocations) | |
{ | |
CHECK(cuMemFree(gpu)); | |
} | |
CHECK(cuCtxPopCurrent(nullptr)); | |
CHECK(cuDevicePrimaryCtxRelease(ctx.dev)); | |
} | |
ctxs.resize(0); | |
} | |
float bandwidth(float time) { | |
return (float)MEMCOPY_SIZE * MEMCOPY_ITERATIONS / 1024 / 1024 / 1024 / time * 1000; | |
} | |
void bandwidth_host_to_device(std::vector<container> &ctxs) | |
{ | |
std::cout << "Host to device bandwidth test" << std::endl; | |
std::vector<std::future<std::tuple<int, float>>> workers; | |
for (auto &ctx : ctxs) | |
{ | |
workers.push_back(std::async(std::launch::async, [ctx]() -> std::tuple<int, float> { | |
float elapsed; | |
CHECK(cuCtxPushCurrent(ctx.ctx)); | |
CUstream stream_up; | |
CUevent start; | |
CUevent stop; | |
CHECK(cuStreamCreate(&stream_up, CU_STREAM_NON_BLOCKING)); | |
CHECK(cuEventCreate(&start, 0)); | |
CHECK(cuEventCreate(&stop, 0)); | |
CHECK(cuEventRecord(start, stream_up)); | |
for (int i = 0; i < MEMCOPY_ITERATIONS; i++) | |
{ | |
CHECK(cuMemcpyHtoDAsync( | |
ctx.gpuAllocations[i % ctx.gpuAllocations.size()], | |
ctx.hostAllocations[i % ctx.hostAllocations.size()], | |
MEMCOPY_SIZE, stream_up | |
)); | |
} | |
CHECK(cuEventRecord(stop, stream_up)); | |
CHECK(cuEventSynchronize(stop)); | |
CHECK(cuEventElapsedTime(&elapsed, start, stop)); | |
CHECK(cuEventDestroy(start)); | |
CHECK(cuEventDestroy(stop)); | |
CHECK(cuStreamDestroy(stream_up)); | |
CHECK(cuCtxPopCurrent(nullptr)); | |
return std::make_tuple(ctx.dev_id, elapsed); | |
})); | |
}; | |
for (auto &worker : workers) | |
{ | |
int dev_id; | |
float time; | |
std::tie(dev_id, time) = worker.get(); | |
std::cout << "GPU " << dev_id << " took " << time << "ms (" << bandwidth(time) << "GB/s)" << std::endl; | |
} | |
} | |
void bandwidth_device_to_host(std::vector<container> &ctxs) | |
{ | |
std::cout << "Device to host bandwidth test" << std::endl; | |
std::vector<std::future<std::tuple<int, float>>> workers; | |
for (auto &ctx : ctxs) | |
{ | |
workers.push_back(std::async(std::launch::async, [ctx]() -> std::tuple<int, float> { | |
float elapsed; | |
CHECK(cuCtxPushCurrent(ctx.ctx)); | |
CUstream stream_down; | |
CUevent start; | |
CUevent stop; | |
CHECK(cuStreamCreate(&stream_down, CU_STREAM_NON_BLOCKING)); | |
CHECK(cuEventCreate(&start, 0)); | |
CHECK(cuEventCreate(&stop, 0)); | |
CHECK(cuEventRecord(start, stream_down)); | |
for (int i = 0; i < MEMCOPY_ITERATIONS; i++) | |
{ | |
CHECK(cuMemcpyDtoHAsync( | |
ctx.hostAllocations[i % ctx.hostAllocations.size()], | |
ctx.gpuAllocations[i % ctx.gpuAllocations.size()], | |
MEMCOPY_SIZE, stream_down | |
)); | |
} | |
CHECK(cuEventRecord(stop, stream_down)); | |
CHECK(cuEventSynchronize(stop)); | |
CHECK(cuEventElapsedTime(&elapsed, start, stop)); | |
CHECK(cuEventDestroy(start)); | |
CHECK(cuEventDestroy(stop)); | |
CHECK(cuStreamDestroy(stream_down)); | |
CHECK(cuCtxPopCurrent(nullptr)); | |
return std::make_tuple(ctx.dev_id, elapsed); | |
})); | |
}; | |
for (auto &worker : workers) | |
{ | |
int dev_id; | |
float time; | |
std::tie(dev_id, time) = worker.get(); | |
std::cout << "GPU " << dev_id << " took " << time << "ms (" << bandwidth(time) << "GB/s)" << std::endl; | |
} | |
} | |
// Bidirectional bandwidth test: per device, one stream uploads while a second
// stream downloads at the same time, loading both PCIe directions at once.
// (The "bidrectional" spelling is kept as-is; main() calls it by this name.)
void bandwidth_bidrectional(std::vector<container> &ctxs)
{
    std::cout << "Bidirectional multi-stream bandwidth test" << std::endl;
    std::vector<std::future<std::tuple<int, float>>> workers;
    for (auto &ctx : ctxs)
    {
        // `ctx` captured by value: the CHECK macro expands that name.
        workers.push_back(std::async(std::launch::async, [ctx]() -> std::tuple<int, float> {
            float elapsed;
            CHECK(cuCtxPushCurrent(ctx.ctx));
            CUstream stream_up;
            CUstream stream_down;
            CUevent start_up;
            CUevent start_down;
            CUevent stop_up;
            CUevent stop_down;
            CHECK(cuStreamCreate(&stream_up, CU_STREAM_NON_BLOCKING));
            CHECK(cuStreamCreate(&stream_down, CU_STREAM_NON_BLOCKING));
            CHECK(cuEventCreate(&start_up, 0));
            CHECK(cuEventCreate(&start_down, 0));
            CHECK(cuEventCreate(&stop_up, 0));
            CHECK(cuEventCreate(&stop_down, 0));
            CHECK(cuEventRecord(start_up, stream_up));
            CHECK(cuEventRecord(start_down, stream_down));
            for (int i = 0; i < MEMCOPY_ITERATIONS; i++)
            {
                // Uploads use even pool slots, downloads odd ones, so the two
                // directions cycle through disjoint buffers per iteration.
                CHECK(cuMemcpyHtoDAsync(
                    ctx.gpuAllocations[(i * 2) % ctx.gpuAllocations.size()],
                    ctx.hostAllocations[(i * 2) % ctx.hostAllocations.size()],
                    MEMCOPY_SIZE,
                    stream_up
                ));
                CHECK(cuMemcpyDtoHAsync(
                    ctx.hostAllocations[(i * 2 + 1) % ctx.hostAllocations.size()],
                    ctx.gpuAllocations[(i * 2 + 1) % ctx.gpuAllocations.size()],
                    MEMCOPY_SIZE,
                    stream_down
                ));
            }
            CHECK(cuEventRecord(stop_up, stream_up));
            CHECK(cuEventRecord(stop_down, stream_down));
            CHECK(cuEventSynchronize(stop_up));
            CHECK(cuEventSynchronize(stop_down));
            {
                // The two streams start and finish independently, so measure
                // every start/stop pairing across both streams and take the
                // maximum: the conservative wall time covering both transfers.
                float a, b, c, d;
                CHECK(cuEventElapsedTime(&a, start_up, stop_up));
                CHECK(cuEventElapsedTime(&b, start_up, stop_down));
                CHECK(cuEventElapsedTime(&c, start_down, stop_up));
                CHECK(cuEventElapsedTime(&d, start_down, stop_down));
                elapsed = std::max({ a, b, c, d });
            }
            CHECK(cuEventDestroy(start_up));
            CHECK(cuEventDestroy(start_down));
            CHECK(cuEventDestroy(stop_up));
            CHECK(cuEventDestroy(stop_down));
            CHECK(cuStreamDestroy(stream_up));
            CHECK(cuStreamDestroy(stream_down));
            CHECK(cuCtxPopCurrent(nullptr));
            return std::make_tuple(ctx.dev_id, elapsed);
        }));
    };
    for (auto &worker : workers)
    {
        int dev_id;
        float time;
        std::tie(dev_id, time) = worker.get();
        // bandwidth() assumes one payload; two payloads (up + down) moved in
        // `time`, so halving the time reports the combined throughput.
        std::cout << "GPU " << dev_id << " took " << time << "ms (" << bandwidth(time/2) << "GB/s)" << std::endl;
    }
}
// Peer-to-peer gather test: every device except `target` simultaneously
// pushes MEMCOPY_ITERATIONS copies into `target`'s device buffers, each
// sender on its own thread/stream, timed with CUDA events.
void bandwidth_device_to_device_gather(std::vector<container> &ctxs, container &target)
{
    std::cout << "Device to device peer2peer bandwidth test, target GPU " << target.dev_id << std::endl;
    std::vector<std::future<std::tuple<int, float>>> workers;
    int index = 0;
    for (auto &ctx : ctxs)
    {
        // The target never copies to itself.
        if (&ctx == &target)
        {
            continue;
        }
        // `ctx` captured by value: the CHECK macro expands that name.
        workers.push_back(std::async(std::launch::async, [target, index, ctx]() -> std::tuple<int, float> {
            float elapsed;
            CHECK(cuCtxPushCurrent(ctx.ctx));
            CUstream stream;
            CUevent start;
            CUevent stop;
            CHECK(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
            CHECK(cuEventCreate(&start, 0));
            CHECK(cuEventCreate(&stop, 0));
            CHECK(cuEventRecord(start, stream));
            for (int i = 0; i < MEMCOPY_ITERATIONS; i++)
            {
                // NOTE(review): `(i * index)` looks suspect -- the first
                // worker has index == 0, so it writes every copy into target
                // buffer 0 while later workers cycle through the whole pool
                // (and overlap it). Presumably `(i + index)` or
                // `(i * workerCount + index)` was intended to stagger the
                // target buffers; confirm intent before changing. Overlapping
                // writes are harmless for a pure bandwidth measurement.
                CHECK(cuMemcpyPeerAsync(
                    target.gpuAllocations[(i * index) % target.gpuAllocations.size()],
                    target.ctx,
                    ctx.gpuAllocations[i % ctx.gpuAllocations.size()],
                    ctx.ctx,
                    MEMCOPY_SIZE, stream
                ));
            }
            CHECK(cuEventRecord(stop, stream));
            CHECK(cuEventSynchronize(stop));
            CHECK(cuEventElapsedTime(&elapsed, start, stop));
            CHECK(cuEventDestroy(start));
            CHECK(cuEventDestroy(stop));
            CHECK(cuStreamDestroy(stream));
            CHECK(cuCtxPopCurrent(nullptr));
            return std::make_tuple(ctx.dev_id, elapsed);
        }));
        index++;
    };
    for (auto &worker : workers)
    {
        int dev_id;
        float time;
        std::tie(dev_id, time) = worker.get();
        std::cout << "GPU " << dev_id << " took " << time << "ms (" << bandwidth(time) << "GB/s)" << std::endl;
    }
}
void bandwidth_device_to_device_scatter(std::vector<container> &ctxs, container &source) | |
{ | |
std::cout << "Device to device peer2peer bandwidth test, source GPU " << source.dev_id << std::endl; | |
std::vector<std::future<std::tuple<int, float>>> workers; | |
int index = 0; | |
for (auto &ctx : ctxs) | |
{ | |
if (&ctx == &source) | |
{ | |
continue; | |
} | |
workers.push_back(std::async(std::launch::async, [source, index, ctx]() -> std::tuple<int, float> { | |
float elapsed; | |
CHECK(cuCtxPushCurrent(ctx.ctx)); | |
CUevent start; | |
CUevent stop; | |
CUstream stream; | |
CHECK(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); | |
CHECK(cuEventCreate(&start, 0)); | |
CHECK(cuEventCreate(&stop, 0)); | |
CHECK(cuEventRecord(start, stream)); | |
for (int i = 0; i < MEMCOPY_ITERATIONS; i++) | |
{ | |
CHECK(cuMemcpyPeerAsync( | |
ctx.gpuAllocations[i % ctx.gpuAllocations.size()], | |
ctx.ctx, | |
source.gpuAllocations[i % source.gpuAllocations.size()], | |
source.ctx, | |
MEMCOPY_SIZE, stream | |
)); | |
} | |
CHECK(cuEventRecord(stop, stream)); | |
CHECK(cuEventSynchronize(stop)); | |
CHECK(cuEventElapsedTime(&elapsed, start, stop)); | |
CHECK(cuEventDestroy(start)); | |
CHECK(cuEventDestroy(stop)); | |
CHECK(cuStreamDestroy(stream)); | |
CHECK(cuCtxPopCurrent(nullptr)); | |
return std::make_tuple(ctx.dev_id, elapsed); | |
})); | |
index++; | |
}; | |
for (auto &worker : workers) | |
{ | |
int dev_id; | |
float time; | |
std::tie(dev_id, time) = worker.get(); | |
std::cout << "GPU " << dev_id << " took " << time << "ms (" << bandwidth(time) << "GB/s)" << std::endl; | |
} | |
} | |
void bandwidth_device_to_device_shift(std::vector<container> &ctxs) | |
{ | |
std::cout << "Device to device peer2peer bandwidth test, target = source + 1" << std::endl; | |
std::vector<std::future<std::tuple<int, float>>> workers; | |
for (int i = 0; i < ctxs.size(); i++) | |
{ | |
auto &source = ctxs[i]; | |
auto &target = ctxs[(i + 1) % ctxs.size()]; | |
if (&source == &target) | |
{ | |
continue; | |
} | |
workers.push_back(std::async(std::launch::async, [source, target]() -> std::tuple<int, float> { | |
float elapsed; | |
auto &ctx = source; | |
CHECK(cuCtxPushCurrent(ctx.ctx)); | |
CUevent start; | |
CUevent stop; | |
CUstream stream; | |
CHECK(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); | |
CHECK(cuEventCreate(&start, 0)); | |
CHECK(cuEventCreate(&stop, 0)); | |
CHECK(cuEventRecord(start, stream)); | |
for (int i = 0; i < MEMCOPY_ITERATIONS; i++) | |
{ | |
CHECK(cuMemcpyPeerAsync( | |
target.gpuAllocations[(i * 2 + 1) % target.gpuAllocations.size()], | |
target.ctx, | |
source.gpuAllocations[(i * 2) % source.gpuAllocations.size()], | |
source.ctx, | |
MEMCOPY_SIZE, stream | |
)); | |
} | |
CHECK(cuEventRecord(stop, stream)); | |
CHECK(cuEventSynchronize(stop)); | |
CHECK(cuEventElapsedTime(&elapsed, start, stop)); | |
CHECK(cuEventDestroy(start)); | |
CHECK(cuEventDestroy(stop)); | |
CHECK(cuStreamDestroy(stream)); | |
CHECK(cuCtxPopCurrent(nullptr)); | |
return std::make_tuple(source.dev_id, elapsed); | |
})); | |
}; | |
for (auto &worker : workers) | |
{ | |
int dev_id; | |
float time; | |
std::tie(dev_id, time) = worker.get(); | |
std::cout << "GPU " << dev_id << " (source) took " << time << "ms (" << bandwidth(time) << "GB/s)" << std::endl; | |
} | |
} | |
} | |
int main(int argc, char** argv) | |
{ | |
std::vector<int> device_ids; | |
std::vector<test::container> ctxs; | |
if (argc == 1) | |
{ | |
std::cout << "usage: " << argv[0] << " deviceID deviceID...\n"; | |
std::cout << "defaulting to test all devices\n"; | |
} | |
if (cuInit(0) != CUDA_SUCCESS) | |
{ | |
std::cout << "cuInit failed, aborting...\n"; | |
exit(1); | |
} | |
if(argc > 1) | |
{ | |
for (int i = 0; i < argc - 1; i++) | |
{ | |
int dev = atoi(argv[i + 1]); | |
CUdevice device; | |
if (cuDeviceGet(&device, dev) != CUDA_SUCCESS) | |
{ | |
std::cout << "Could not get device " << dev << ", aborting\n"; | |
exit(1); | |
} | |
device_ids.push_back(dev); | |
} | |
} | |
else | |
{ | |
int deviceCount = 0; | |
cuDeviceGetCount(&deviceCount); | |
for (int dev = 0; dev < deviceCount; dev++) | |
{ | |
CUdevice device; | |
if (cuDeviceGet(&device, dev) != CUDA_SUCCESS) | |
{ | |
std::cout << "Could not get device " << dev << ", aborting\n"; | |
exit(1); | |
} | |
device_ids.push_back(dev); | |
} | |
} | |
test::init(device_ids, ctxs); | |
test::bandwidth_host_to_device(ctxs); | |
test::bandwidth_device_to_host(ctxs); | |
test::bandwidth_bidrectional(ctxs); | |
if (ctxs.size() > 2) | |
{ | |
for (auto &ctx : ctxs) | |
{ | |
test::bandwidth_device_to_device_gather(ctxs, ctx); | |
} | |
for (auto &ctx : ctxs) | |
{ | |
test::bandwidth_device_to_device_scatter(ctxs, ctx); | |
} | |
} | |
if (ctxs.size() > 1) | |
{ | |
test::bandwidth_device_to_device_shift(ctxs); | |
} | |
test::deinit(ctxs); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment