Created
September 20, 2024 07:52
-
-
Save yashi/31c0fbc9d56b44712a0ce5503715ad92 to your computer and use it in GitHub Desktop.
OpenCL Hello World with ROCm specific options
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <CL/cl.h> | |
#define VECTOR_SIZE 1024 | |
// OpenCL kernel which is run for every work item created. | |
const char *saxpy_kernel = | |
"__kernel \n" | |
"void saxpy_kernel(float alpha, \n" | |
" __global float *A, \n" | |
" __global float *B, \n" | |
" __global float *C) \n" | |
"{ \n" | |
" // Get the index of the work-item \n" | |
" int index = get_global_id(0); \n" | |
" C[index] = alpha * A[index] + B[index]; \n" | |
"} \n"; | |
int main(void) | |
{ | |
int i; | |
cl_int clStatus; | |
// Allocate and initialize pointers to NULL | |
float alpha = 2.0f; | |
float *A = NULL; | |
float *B = NULL; | |
float *C = NULL; | |
cl_platform_id *platforms = NULL; | |
cl_uint num_platforms = 0; | |
cl_device_id *device_list = NULL; | |
cl_uint num_devices = 0; | |
cl_context context = NULL; | |
cl_command_queue command_queue = NULL; | |
cl_mem A_clmem = NULL; | |
cl_mem B_clmem = NULL; | |
cl_mem C_clmem = NULL; | |
cl_program program = NULL; | |
cl_kernel kernel = NULL; | |
// Allocate space for vectors A, B, and C | |
A = (float*)malloc(sizeof(float) * VECTOR_SIZE); | |
B = (float*)malloc(sizeof(float) * VECTOR_SIZE); | |
C = (float*)malloc(sizeof(float) * VECTOR_SIZE); | |
// Check for successful memory allocation | |
if (!A || !B || !C) { | |
fprintf(stderr, "Failed to allocate host memory.\n"); | |
goto cleanup; | |
} | |
// Initialize vectors | |
for(i = 0; i < VECTOR_SIZE; i++) { | |
A[i] = (float)i; | |
B[i] = (float)(VECTOR_SIZE - i); | |
C[i] = 0.0f; | |
} | |
// Get platform and device information | |
// Retrieve the number of platforms | |
clStatus = clGetPlatformIDs(0, NULL, &num_platforms); | |
if (clStatus != CL_SUCCESS || num_platforms == 0) { | |
fprintf(stderr, "Failed to find any OpenCL platforms.\n"); | |
goto cleanup; | |
} | |
// Allocate memory for platform IDs | |
platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms); | |
if (!platforms) { | |
fprintf(stderr, "Failed to allocate memory for platform IDs.\n"); | |
goto cleanup; | |
} | |
// Retrieve the platform IDs | |
clStatus = clGetPlatformIDs(num_platforms, platforms, NULL); | |
if (clStatus != CL_SUCCESS) { | |
fprintf(stderr, "Failed to get the platform IDs.\n"); | |
goto cleanup; | |
} | |
// Get the devices list and choose the device you want to run on | |
// Retrieve the number of GPU devices | |
clStatus = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices); | |
if (clStatus != CL_SUCCESS || num_devices == 0) { | |
fprintf(stderr, "Failed to find any GPU devices.\n"); | |
goto cleanup; | |
} | |
// Allocate memory for device IDs | |
device_list = (cl_device_id*)malloc(sizeof(cl_device_id) * num_devices); | |
if (!device_list) { | |
fprintf(stderr, "Failed to allocate memory for device IDs.\n"); | |
goto cleanup; | |
} | |
// Retrieve the device IDs | |
clStatus = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL); | |
if (clStatus != CL_SUCCESS) { | |
fprintf(stderr, "Failed to get the device IDs.\n"); | |
goto cleanup; | |
} | |
// Create an OpenCL context for the selected device | |
context = clCreateContext(NULL, num_devices, device_list, NULL, NULL, &clStatus); | |
if (clStatus != CL_SUCCESS) { | |
fprintf(stderr, "Failed to create an OpenCL context.\n"); | |
goto cleanup; | |
} | |
// Create a command queue using clCreateCommandQueueWithProperties | |
cl_queue_properties properties[] = {0}; // No specific properties | |
command_queue = clCreateCommandQueueWithProperties(context, device_list[0], properties, &clStatus); | |
if (clStatus != CL_SUCCESS) { | |
fprintf(stderr, "Failed to create a command queue.\n"); | |
goto cleanup; | |
} | |
// Create memory buffers on the device for each vector | |
A_clmem = clCreateBuffer(context, CL_MEM_READ_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus); | |
if (clStatus != CL_SUCCESS) { | |
fprintf(stderr, "Failed to create buffer for vector A.\n"); | |
goto cleanup; | |
} | |
B_clmem = clCreateBuffer(context, CL_MEM_READ_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus); | |
if (clStatus != CL_SUCCESS) { | |
fprintf(stderr, "Failed to create buffer for vector B.\n"); | |
goto cleanup; | |
} | |
C_clmem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, VECTOR_SIZE * sizeof(float), NULL, &clStatus); | |
if (clStatus != CL_SUCCESS) { | |
fprintf(stderr, "Failed to create buffer for vector C.\n"); | |
goto cleanup; | |
} | |
// Copy the Buffer A and B to the device | |
clStatus = clEnqueueWriteBuffer(command_queue, A_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), A, 0, NULL, NULL); | |
if (clStatus != CL_SUCCESS) { | |
fprintf(stderr, "Failed to write data to buffer A.\n"); | |
goto cleanup; | |
} | |
clStatus = clEnqueueWriteBuffer(command_queue, B_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), B, 0, NULL, NULL); | |
if (clStatus != CL_SUCCESS) { | |
fprintf(stderr, "Failed to write data to buffer B.\n"); | |
goto cleanup; | |
} | |
// Create a program from the kernel source | |
program = clCreateProgramWithSource(context, 1, &saxpy_kernel, NULL, &clStatus); | |
if (clStatus != CL_SUCCESS) { | |
fprintf(stderr, "Failed to create program from source.\n"); | |
goto cleanup; | |
} | |
// Build the program | |
const char *build_options = "-time -buildlog=stderr"; | |
clStatus = clBuildProgram(program, 1, device_list, build_options, NULL, NULL); | |
if (clStatus != CL_SUCCESS) { | |
// Determine the reason for the error | |
size_t log_size; | |
clGetProgramBuildInfo(program, device_list[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); | |
char *log = (char*)malloc(log_size); | |
if (log) { | |
clGetProgramBuildInfo(program, device_list[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL); | |
fprintf(stderr, "Error in kernel:\n%s\n", log); | |
free(log); | |
} else { | |
fprintf(stderr, "Failed to allocate memory for build log.\n"); | |
} | |
goto cleanup; | |
} | |
// Create the OpenCL kernel | |
kernel = clCreateKernel(program, "saxpy_kernel", &clStatus); | |
if (clStatus != CL_SUCCESS) { | |
fprintf(stderr, "Failed to create kernel.\n"); | |
goto cleanup; | |
} | |
// Set the arguments of the kernel | |
clStatus = clSetKernelArg(kernel, 0, sizeof(float), (void *)&alpha); | |
clStatus |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&A_clmem); | |
clStatus |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&B_clmem); | |
clStatus |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&C_clmem); | |
if (clStatus != CL_SUCCESS) { | |
fprintf(stderr, "Failed to set kernel arguments.\n"); | |
goto cleanup; | |
} | |
// Execute the OpenCL kernel on the list | |
size_t global_size = VECTOR_SIZE; // Process the entire lists | |
size_t local_size = 64; // Process in work-groups of size 64 | |
// Ensure that VECTOR_SIZE is divisible by local_size | |
if (VECTOR_SIZE % local_size != 0) { | |
fprintf(stderr, "VECTOR_SIZE (%d) is not divisible by local_size (%zu).\n", VECTOR_SIZE, local_size); | |
goto cleanup; | |
} | |
clStatus = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL); | |
if (clStatus != CL_SUCCESS) { | |
fprintf(stderr, "Failed to enqueue NDRange kernel.\n"); | |
goto cleanup; | |
} | |
// Read the cl memory C_clmem on device to the host variable C | |
clStatus = clEnqueueReadBuffer(command_queue, C_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), C, 0, NULL, NULL); | |
if (clStatus != CL_SUCCESS) { | |
fprintf(stderr, "Failed to read buffer C from device.\n"); | |
goto cleanup; | |
} | |
// Clean up and wait for all the commands to complete. | |
clStatus = clFlush(command_queue); | |
clStatus |= clFinish(command_queue); | |
if (clStatus != CL_SUCCESS) { | |
fprintf(stderr, "Failed to flush or finish the command queue.\n"); | |
// Proceeding to clean up resources even if this fails | |
} | |
// Display the result to the screen | |
for(i = 0; i < VECTOR_SIZE; i++) { | |
printf("%f * %f + %f = %f\n", alpha, A[i], B[i], C[i]); | |
} | |
cleanup: | |
// Release OpenCL resources | |
if (kernel) clReleaseKernel(kernel); | |
if (program) clReleaseProgram(program); | |
if (C_clmem) clReleaseMemObject(C_clmem); | |
if (B_clmem) clReleaseMemObject(B_clmem); | |
if (A_clmem) clReleaseMemObject(A_clmem); | |
if (command_queue) clReleaseCommandQueue(command_queue); | |
if (context) clReleaseContext(context); | |
if (device_list) free(device_list); | |
if (platforms) free(platforms); | |
// Free host memory | |
if (A) free(A); | |
if (B) free(B); | |
if (C) free(C); | |
return (clStatus == CL_SUCCESS) ? EXIT_SUCCESS : EXIT_FAILURE; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment