Last active
November 6, 2019 09:56
-
-
Save Ext3h/6eb2df21873f5524bfd70a0368872d4e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <climits>
#include <cstddef>
#include <cstdlib>
#include <algorithm>
#include <future>
#include <iostream>
#include <mutex>
#include <string>
#include <thread>
#include <tuple>
#include <vector>
#include <cuda.h>
#pragma comment(lib, "cuda.lib") | |
// Scale up to enqueue more load until PCIe breaks
#define MEMCOPY_ITERATIONS 500
// Scale up to increase PCIe load without increasing driver overhead
const size_t MEMCOPY_SIZE = (1 << 27); // 128M
// Reports a failed CUDA driver call and aborts (see test::check). The macro is
// deliberately unhygienic: it expands the name `ctx`, so a test::container
// named `ctx` must be in scope at every call site.
// No trailing semicolon in the body: callers write `CHECK(...);` themselves,
// so a semicolon here would emit a stray empty statement after every call and
// break single-statement contexts such as `if (x) CHECK(y); else ...`.
#define CHECK(expression) test::check(expression, #expression, ctx, __FILE__, __LINE__)
namespace test { | |
class container { | |
public: | |
std::vector<void*> hostAllocations; | |
std::vector<CUdeviceptr> gpuAllocations; | |
CUdevice dev; | |
CUcontext ctx; | |
int dev_id; | |
}; | |
// Aborts the process with a diagnostic if a CUDA driver call failed.
// `command` is the stringified expression (supplied by the CHECK macro),
// `ctx` provides the device id for the message. Output is serialized with a
// mutex because the bandwidth tests call this from several worker threads.
void check(CUresult result, const char* command, const container &ctx, const char* file, int line) {
    if (result != CUDA_SUCCESS)
    {
        static std::mutex cerr_mutex;
        std::lock_guard<std::mutex> lock(cerr_mutex);
        const char* error = nullptr;
        cuGetErrorName(result, &error);
        // cuGetErrorName fails (and leaves `error` null) for unrecognized
        // codes; streaming a null char* into std::ostream is undefined
        // behavior, so substitute a placeholder.
        if (error == nullptr)
        {
            error = "UNKNOWN";
        }
        std::cerr << "CUDA error " << error << " (" << (int)result << ") in " << file << ":" << line << " on device " << ctx.dev_id << std::endl;
        std::cerr << command << std::endl;
        abort();
    }
}
void init(const std::vector<int> &device_ids, std::vector<container> &ctxs) { | |
ctxs.resize(device_ids.size()); | |
for (int i = 0; i < device_ids.size(); i++) | |
{ | |
auto &ctx = ctxs[i]; | |
ctx.dev_id = device_ids[i]; | |
CHECK(cuDeviceGet(&ctx.dev, ctx.dev_id)); | |
CHECK(cuDevicePrimaryCtxSetFlags(ctx.dev_id, CU_CTX_SCHED_BLOCKING_SYNC)); | |
CHECK(cuDevicePrimaryCtxRetain(&ctx.ctx, ctx.dev_id)); | |
ctx.hostAllocations.resize(4); | |
ctx.gpuAllocations.resize(std::max((size_t)8, device_ids.size() * 2)); | |
CHECK(cuCtxPushCurrent(ctx.ctx)); | |
for (auto &host : ctx.hostAllocations) | |
{ | |
CHECK(cuMemHostAlloc(&host, MEMCOPY_SIZE, CU_MEMHOSTALLOC_DEVICEMAP)); | |
} | |
for (auto &gpu : ctx.gpuAllocations) | |
{ | |
CHECK(cuMemAlloc(&gpu, MEMCOPY_SIZE)); | |
} | |
CHECK(cuCtxPopCurrent(nullptr)); | |
} | |
} | |
void deinit(std::vector<container> &ctxs) { | |
for (auto& ctx : ctxs) | |
{ | |
CHECK(cuCtxPushCurrent(ctx.ctx)); | |
for (auto &host : ctx.hostAllocations) | |
{ | |
CHECK(cuMemFreeHost(host)); | |
} | |
for (auto &gpu : ctx.gpuAllocations) | |
{ | |
CHECK(cuMemFree(gpu)); | |
} | |
CHECK(cuCtxPopCurrent(nullptr)); | |
CHECK(cuDevicePrimaryCtxRelease(ctx.dev)); | |
} | |
ctxs.resize(0); | |
} | |
float bandwidth(float time) { | |
return (float)MEMCOPY_SIZE * MEMCOPY_ITERATIONS / 1024 / 1024 / 1024 / time * 1000; | |
} | |
void bandwidth_host_to_device(std::vector<container> &ctxs) | |
{ | |
std::cout << "Host to device bandwidth test" << std::endl; | |
std::vector<std::future<std::tuple<int, float>>> workers; | |
for (auto &ctx : ctxs) | |
{ | |
workers.push_back(std::async(std::launch::async, [ctx]() -> std::tuple<int, float> { | |
float elapsed; | |
CHECK(cuCtxPushCurrent(ctx.ctx)); | |
CUstream stream_up; | |
CUevent start; | |
CUevent stop; | |
CHECK(cuStreamCreate(&stream_up, CU_STREAM_NON_BLOCKING)); | |
CHECK(cuEventCreate(&start, 0)); | |
CHECK(cuEventCreate(&stop, 0)); | |
CHECK(cuEventRecord(start, stream_up)); | |
for (int i = 0; i < MEMCOPY_ITERATIONS; i++) | |
{ | |
CHECK(cuMemcpyHtoDAsync( | |
ctx.gpuAllocations[i % ctx.gpuAllocations.size()], | |
ctx.hostAllocations[i % ctx.hostAllocations.size()], | |
MEMCOPY_SIZE, stream_up | |
)); | |
} | |
CHECK(cuEventRecord(stop, stream_up)); | |
CHECK(cuEventSynchronize(stop)); | |
CHECK(cuEventElapsedTime(&elapsed, start, stop)); | |
CHECK(cuEventDestroy(start)); | |
CHECK(cuEventDestroy(stop)); | |
CHECK(cuStreamDestroy(stream_up)); | |
CHECK(cuCtxPopCurrent(nullptr)); | |
return std::make_tuple(ctx.dev_id, elapsed); | |
})); | |
}; | |
for (auto &worker : workers) | |
{ | |
int dev_id; | |
float time; | |
std::tie(dev_id, time) = worker.get(); | |
std::cout << "GPU " << dev_id << " took " << time << "ms (" << bandwidth(time) << "GB/s)" << std::endl; | |
} | |
} | |
void bandwidth_device_to_host(std::vector<container> &ctxs) | |
{ | |
std::cout << "Device to host bandwidth test" << std::endl; | |
std::vector<std::future<std::tuple<int, float>>> workers; | |
for (auto &ctx : ctxs) | |
{ | |
workers.push_back(std::async(std::launch::async, [ctx]() -> std::tuple<int, float> { | |
float elapsed; | |
CHECK(cuCtxPushCurrent(ctx.ctx)); | |
CUstream stream_down; | |
CUevent start; | |
CUevent stop; | |
CHECK(cuStreamCreate(&stream_down, CU_STREAM_NON_BLOCKING)); | |
CHECK(cuEventCreate(&start, 0)); | |
CHECK(cuEventCreate(&stop, 0)); | |
CHECK(cuEventRecord(start, stream_down)); | |
for (int i = 0; i < MEMCOPY_ITERATIONS; i++) | |
{ | |
CHECK(cuMemcpyDtoHAsync( | |
ctx.hostAllocations[i % ctx.hostAllocations.size()], | |
ctx.gpuAllocations[i % ctx.gpuAllocations.size()], | |
MEMCOPY_SIZE, stream_down | |
)); | |
} | |
CHECK(cuEventRecord(stop, stream_down)); | |
CHECK(cuEventSynchronize(stop)); | |
CHECK(cuEventElapsedTime(&elapsed, start, stop)); | |
CHECK(cuEventDestroy(start)); | |
CHECK(cuEventDestroy(stop)); | |
CHECK(cuStreamDestroy(stream_down)); | |
CHECK(cuCtxPopCurrent(nullptr)); | |
return std::make_tuple(ctx.dev_id, elapsed); | |
})); | |
}; | |
for (auto &worker : workers) | |
{ | |
int dev_id; | |
float time; | |
std::tie(dev_id, time) = worker.get(); | |
std::cout << "GPU " << dev_id << " took " << time << "ms (" << bandwidth(time) << "GB/s)" << std::endl; | |
} | |
} | |
// Bidirectional bandwidth test: per device, one stream uploads while a second
// stream downloads at the same time, loading both PCIe directions at once.
// (The "bidrectional" spelling is kept as-is; main() calls it by this name.)
void bandwidth_bidrectional(std::vector<container> &ctxs)
{
    std::cout << "Bidirectional multi-stream bandwidth test" << std::endl;
    std::vector<std::future<std::tuple<int, float>>> workers;
    for (auto &ctx : ctxs)
    {
        // `ctx` captured by value: the CHECK macro expands that name.
        workers.push_back(std::async(std::launch::async, [ctx]() -> std::tuple<int, float> {
            float elapsed;
            CHECK(cuCtxPushCurrent(ctx.ctx));
            CUstream stream_up;
            CUstream stream_down;
            CUevent start_up;
            CUevent start_down;
            CUevent stop_up;
            CUevent stop_down;
            CHECK(cuStreamCreate(&stream_up, CU_STREAM_NON_BLOCKING));
            CHECK(cuStreamCreate(&stream_down, CU_STREAM_NON_BLOCKING));
            CHECK(cuEventCreate(&start_up, 0));
            CHECK(cuEventCreate(&start_down, 0));
            CHECK(cuEventCreate(&stop_up, 0));
            CHECK(cuEventCreate(&stop_down, 0));
            CHECK(cuEventRecord(start_up, stream_up));
            CHECK(cuEventRecord(start_down, stream_down));
            for (int i = 0; i < MEMCOPY_ITERATIONS; i++)
            {
                // Uploads use even pool slots, downloads odd ones, so the two
                // directions cycle through disjoint buffers per iteration.
                CHECK(cuMemcpyHtoDAsync(
                    ctx.gpuAllocations[(i * 2) % ctx.gpuAllocations.size()],
                    ctx.hostAllocations[(i * 2) % ctx.hostAllocations.size()],
                    MEMCOPY_SIZE,
                    stream_up
                ));
                CHECK(cuMemcpyDtoHAsync(
                    ctx.hostAllocations[(i * 2 + 1) % ctx.hostAllocations.size()],
                    ctx.gpuAllocations[(i * 2 + 1) % ctx.gpuAllocations.size()],
                    MEMCOPY_SIZE,
                    stream_down
                ));
            }
            CHECK(cuEventRecord(stop_up, stream_up));
            CHECK(cuEventRecord(stop_down, stream_down));
            CHECK(cuEventSynchronize(stop_up));
            CHECK(cuEventSynchronize(stop_down));
            {
                // The two streams start and finish independently, so measure
                // every start/stop pairing across both streams and take the
                // maximum: the conservative wall time covering both transfers.
                float a, b, c, d;
                CHECK(cuEventElapsedTime(&a, start_up, stop_up));
                CHECK(cuEventElapsedTime(&b, start_up, stop_down));
                CHECK(cuEventElapsedTime(&c, start_down, stop_up));
                CHECK(cuEventElapsedTime(&d, start_down, stop_down));
                elapsed = std::max({ a, b, c, d });
            }
            CHECK(cuEventDestroy(start_up));
            CHECK(cuEventDestroy(start_down));
            CHECK(cuEventDestroy(stop_up));
            CHECK(cuEventDestroy(stop_down));
            CHECK(cuStreamDestroy(stream_up));
            CHECK(cuStreamDestroy(stream_down));
            CHECK(cuCtxPopCurrent(nullptr));
            return std::make_tuple(ctx.dev_id, elapsed);
        }));
    };
    for (auto &worker : workers)
    {
        int dev_id;
        float time;
        std::tie(dev_id, time) = worker.get();
        // bandwidth() assumes one payload; two payloads (up + down) moved in
        // `time`, so halving the time reports the combined throughput.
        std::cout << "GPU " << dev_id << " took " << time << "ms (" << bandwidth(time/2) << "GB/s)" << std::endl;
    }
}
// Peer-to-peer gather test: every device except `target` simultaneously
// pushes MEMCOPY_ITERATIONS copies into `target`'s device buffers, each
// sender on its own thread/stream, timed with CUDA events.
void bandwidth_device_to_device_gather(std::vector<container> &ctxs, container &target)
{
    std::cout << "Device to device peer2peer bandwidth test, target GPU " << target.dev_id << std::endl;
    std::vector<std::future<std::tuple<int, float>>> workers;
    int index = 0;
    for (auto &ctx : ctxs)
    {
        // The target never copies to itself.
        if (&ctx == &target)
        {
            continue;
        }
        // `ctx` captured by value: the CHECK macro expands that name.
        workers.push_back(std::async(std::launch::async, [target, index, ctx]() -> std::tuple<int, float> {
            float elapsed;
            CHECK(cuCtxPushCurrent(ctx.ctx));
            CUstream stream;
            CUevent start;
            CUevent stop;
            CHECK(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
            CHECK(cuEventCreate(&start, 0));
            CHECK(cuEventCreate(&stop, 0));
            CHECK(cuEventRecord(start, stream));
            for (int i = 0; i < MEMCOPY_ITERATIONS; i++)
            {
                // NOTE(review): `(i * index)` looks suspect -- the first
                // worker has index == 0, so it writes every copy into target
                // buffer 0 while later workers cycle through the whole pool
                // (and overlap it). Presumably `(i + index)` or
                // `(i * workerCount + index)` was intended to stagger the
                // target buffers; confirm intent before changing. Overlapping
                // writes are harmless for a pure bandwidth measurement.
                CHECK(cuMemcpyPeerAsync(
                    target.gpuAllocations[(i * index) % target.gpuAllocations.size()],
                    target.ctx,
                    ctx.gpuAllocations[i % ctx.gpuAllocations.size()],
                    ctx.ctx,
                    MEMCOPY_SIZE, stream
                ));
            }
            CHECK(cuEventRecord(stop, stream));
            CHECK(cuEventSynchronize(stop));
            CHECK(cuEventElapsedTime(&elapsed, start, stop));
            CHECK(cuEventDestroy(start));
            CHECK(cuEventDestroy(stop));
            CHECK(cuStreamDestroy(stream));
            CHECK(cuCtxPopCurrent(nullptr));
            return std::make_tuple(ctx.dev_id, elapsed);
        }));
        index++;
    };
    for (auto &worker : workers)
    {
        int dev_id;
        float time;
        std::tie(dev_id, time) = worker.get();
        std::cout << "GPU " << dev_id << " took " << time << "ms (" << bandwidth(time) << "GB/s)" << std::endl;
    }
}
void bandwidth_device_to_device_scatter(std::vector<container> &ctxs, container &source) | |
{ | |
std::cout << "Device to device peer2peer bandwidth test, source GPU " << source.dev_id << std::endl; | |
std::vector<std::future<std::tuple<int, float>>> workers; | |
int index = 0; | |
for (auto &ctx : ctxs) | |
{ | |
if (&ctx == &source) | |
{ | |
continue; | |
} | |
workers.push_back(std::async(std::launch::async, [source, index, ctx]() -> std::tuple<int, float> { | |
float elapsed; | |
CHECK(cuCtxPushCurrent(ctx.ctx)); | |
CUevent start; | |
CUevent stop; | |
CUstream stream; | |
CHECK(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); | |
CHECK(cuEventCreate(&start, 0)); | |
CHECK(cuEventCreate(&stop, 0)); | |
CHECK(cuEventRecord(start, stream)); | |
for (int i = 0; i < MEMCOPY_ITERATIONS; i++) | |
{ | |
CHECK(cuMemcpyPeerAsync( | |
ctx.gpuAllocations[i % ctx.gpuAllocations.size()], | |
ctx.ctx, | |
source.gpuAllocations[i % source.gpuAllocations.size()], | |
source.ctx, | |
MEMCOPY_SIZE, stream | |
)); | |
} | |
CHECK(cuEventRecord(stop, stream)); | |
CHECK(cuEventSynchronize(stop)); | |
CHECK(cuEventElapsedTime(&elapsed, start, stop)); | |
CHECK(cuEventDestroy(start)); | |
CHECK(cuEventDestroy(stop)); | |
CHECK(cuStreamDestroy(stream)); | |
CHECK(cuCtxPopCurrent(nullptr)); | |
return std::make_tuple(ctx.dev_id, elapsed); | |
})); | |
index++; | |
}; | |
for (auto &worker : workers) | |
{ | |
int dev_id; | |
float time; | |
std::tie(dev_id, time) = worker.get(); | |
std::cout << "GPU " << dev_id << " took " << time << "ms (" << bandwidth(time) << "GB/s)" << std::endl; | |
} | |
} | |
void bandwidth_device_to_device_shift(std::vector<container> &ctxs) | |
{ | |
std::cout << "Device to device peer2peer bandwidth test, target = source + 1" << std::endl; | |
std::vector<std::future<std::tuple<int, float>>> workers; | |
for (int i = 0; i < ctxs.size(); i++) | |
{ | |
auto &source = ctxs[i]; | |
auto &target = ctxs[(i + 1) % ctxs.size()]; | |
if (&source == &target) | |
{ | |
continue; | |
} | |
workers.push_back(std::async(std::launch::async, [source, target]() -> std::tuple<int, float> { | |
float elapsed; | |
auto &ctx = source; | |
CHECK(cuCtxPushCurrent(ctx.ctx)); | |
CUevent start; | |
CUevent stop; | |
CUstream stream; | |
CHECK(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); | |
CHECK(cuEventCreate(&start, 0)); | |
CHECK(cuEventCreate(&stop, 0)); | |
CHECK(cuEventRecord(start, stream)); | |
for (int i = 0; i < MEMCOPY_ITERATIONS; i++) | |
{ | |
CHECK(cuMemcpyPeerAsync( | |
target.gpuAllocations[(i * 2 + 1) % target.gpuAllocations.size()], | |
target.ctx, | |
source.gpuAllocations[(i * 2) % source.gpuAllocations.size()], | |
source.ctx, | |
MEMCOPY_SIZE, stream | |
)); | |
} | |
CHECK(cuEventRecord(stop, stream)); | |
CHECK(cuEventSynchronize(stop)); | |
CHECK(cuEventElapsedTime(&elapsed, start, stop)); | |
CHECK(cuEventDestroy(start)); | |
CHECK(cuEventDestroy(stop)); | |
CHECK(cuStreamDestroy(stream)); | |
CHECK(cuCtxPopCurrent(nullptr)); | |
return std::make_tuple(source.dev_id, elapsed); | |
})); | |
}; | |
for (auto &worker : workers) | |
{ | |
int dev_id; | |
float time; | |
std::tie(dev_id, time) = worker.get(); | |
std::cout << "GPU " << dev_id << " (source) took " << time << "ms (" << bandwidth(time) << "GB/s)" << std::endl; | |
} | |
} | |
} | |
int main(int argc, char** argv) | |
{ | |
std::vector<int> device_ids; | |
std::vector<test::container> ctxs; | |
if (argc == 1) | |
{ | |
std::cout << "usage: " << argv[0] << " deviceID deviceID...\n"; | |
std::cout << "defaulting to test all devices\n"; | |
} | |
if (cuInit(0) != CUDA_SUCCESS) | |
{ | |
std::cout << "cuInit failed, aborting...\n"; | |
exit(1); | |
} | |
if(argc > 1) | |
{ | |
for (int i = 0; i < argc - 1; i++) | |
{ | |
int dev = atoi(argv[i + 1]); | |
CUdevice device; | |
if (cuDeviceGet(&device, dev) != CUDA_SUCCESS) | |
{ | |
std::cout << "Could not get device " << dev << ", aborting\n"; | |
exit(1); | |
} | |
device_ids.push_back(dev); | |
} | |
} | |
else | |
{ | |
int deviceCount = 0; | |
cuDeviceGetCount(&deviceCount); | |
for (int dev = 0; dev < deviceCount; dev++) | |
{ | |
CUdevice device; | |
if (cuDeviceGet(&device, dev) != CUDA_SUCCESS) | |
{ | |
std::cout << "Could not get device " << dev << ", aborting\n"; | |
exit(1); | |
} | |
device_ids.push_back(dev); | |
} | |
} | |
test::init(device_ids, ctxs); | |
test::bandwidth_host_to_device(ctxs); | |
test::bandwidth_device_to_host(ctxs); | |
test::bandwidth_bidrectional(ctxs); | |
if (ctxs.size() > 2) | |
{ | |
for (auto &ctx : ctxs) | |
{ | |
test::bandwidth_device_to_device_gather(ctxs, ctx); | |
} | |
for (auto &ctx : ctxs) | |
{ | |
test::bandwidth_device_to_device_scatter(ctxs, ctx); | |
} | |
} | |
if (ctxs.size() > 1) | |
{ | |
test::bandwidth_device_to_device_shift(ctxs); | |
} | |
test::deinit(ctxs); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment