/**
 * Vector addition: C = A + B.
 *
 * This sample is a very basic sample that implements element by element
 * vector addition. It is the same as the sample illustrating Chapter 2
 * of the programming guide with some additions like error checking.
 */
#include <stdio.h>
#include <stdlib.h> // malloc/free, rand, exit, EXIT_FAILURE
#include <math.h>   // fabs

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
#include <helper_cuda.h>
/**
 * CUDA Kernel Device code
 *
 * Computes the vector addition of A and B into C. The 3 vectors have the same
 * number of elements numElements.
 */
__global__ void
vectorAdd(const double *A, const double *B, double *C, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < numElements)
    {
        C[i] = A[i] + B[i];
    }
}
/**
 * Host main routine
 */
int
main(void)
{
    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;

    // Print the vector length to be used, and compute its size
    int numElements = 67108864;
    size_t size = numElements * sizeof(double);
    printf("[Vector addition of %d elements]\n", numElements);

    // Allocate the host input vector A
    double *h_A = (double *)malloc(size);

    // Allocate the host input vector B
    double *h_B = (double *)malloc(size);

    // Allocate the host output vector C
    double *h_C = (double *)malloc(size);

    // Verify that allocations succeeded
    if (h_A == NULL || h_B == NULL || h_C == NULL)
    {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    // Initialize the host input vectors
    for (int i = 0; i < numElements; ++i)
    {
        h_A[i] = rand()/(double)RAND_MAX;
        h_B[i] = rand()/(double)RAND_MAX;
    }
    // Allocate the device input vector A
    double *d_A = NULL;
    err = cudaMalloc((void **)&d_A, size);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Allocate the device input vector B
    double *d_B = NULL;
    err = cudaMalloc((void **)&d_B, size);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Allocate the device output vector C
    double *d_C = NULL;
    err = cudaMalloc((void **)&d_C, size);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the host input vectors A and B in host memory to the device input
    // vectors in device memory
    printf("Copy input data from the host memory to the CUDA device\n");
    err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // Launch the Vector Add CUDA Kernel
    // int threadsPerBlock = 256;
    int threadsPerBlock = 1024;
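    // Note: 1024 threads per block is the hardware maximum for CUDA devices
    // of compute capability 2.0 and newer; the 256 kept above is the value
    // used in the original SDK sample.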
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    err = cudaGetLastError();

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    // Copy the device result vector in device memory to the host result vector
    // in host memory.
    printf("Copy output data from the CUDA device to the host memory\n");
    err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Verify that the result vector is correct
    for (int i = 0; i < numElements; ++i)
    {
        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
        {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    printf("Test PASSED\n");
    // Free device global memory
    err = cudaFree(d_A);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaFree(d_B);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaFree(d_C);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);
    // Calling cudaProfilerStop causes all profile data to be
    // flushed before the application exits
    checkCudaErrors(cudaProfilerStop());

    printf("Done\n");
    return 0;
}
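
The only symbol this file takes from helper_cuda.h (which ships with the CUDA samples under common/inc; the exact path depends on the installation) is the checkCudaErrors macro wrapped around cudaProfilerStop. If the samples' helper headers are not available, a minimal stand-in along these lines should work in place of that include; this is a sketch, not the header's actual implementation:

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Minimal replacement for helper_cuda.h's checkCudaErrors: print the CUDA
// error string and the call site, then abort on any failing runtime call.
#define checkCudaErrors(call)                                              \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error \"%s\" at %s:%d\n",                \
                    cudaGetErrorString(err_), __FILE__, __LINE__);         \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)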