Created
February 28, 2023 04:15
-
-
Save danieloneill/9845077a1fe498e2272f6248eceac88f to your computer and use it in GitHub Desktop.
(C) Benchmark for vertically flipping a 320x240x24bpp image using different CPU and GPU methods, Linux/SDL/CUDA
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define MODE_GPU 0 | |
#define MODE_GPU_MEMCPY 1 | |
#define MODE_CPU 2 | |
#define MODE_CPU_MEMCPY 3 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
all: | |
/usr/local/cuda-12.0/bin/nvcc -o readbmp readbmp.cu sdltest.cu -g `sdl2-config --cflags --libs` -lcuda |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdbool.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <stdint.h> | |
#include <unistd.h> | |
#include <string.h> | |
#include <sys/types.h> | |
#include <sys/stat.h> | |
#include <fcntl.h> | |
#include "bleh.h" | |
typedef struct { | |
int w; | |
int h; | |
uint8_t *data; | |
size_t length; | |
} BMP; | |
int readbmp(const char *path, BMP *bmp) | |
{ | |
memset(bmp, 0, sizeof(BMP)); | |
int fd = open(path, 0); | |
if( -1 == fd ) | |
return -1; | |
struct { | |
char hf[2]; | |
uint8_t idgaf[8]; | |
uint32_t offset __attribute__((packed)); | |
} bmpheader; | |
struct { | |
uint32_t hlen; | |
int32_t width; | |
int32_t height; | |
uint16_t planes; | |
uint16_t bpp; | |
uint32_t comp; | |
uint32_t imgsize; | |
int32_t hres; | |
int32_t yres; | |
uint32_t palcolcount; | |
uint32_t impcols; | |
} dibheader; | |
if( 14 > read(fd, &bmpheader, 14 ) ) | |
return -3; | |
if( 40 > read(fd, &dibheader, 40 ) ) | |
return -4; | |
printf("%dx%d @ %dbpp (%d) @%x\n", dibheader.width, dibheader.height, dibheader.bpp, dibheader.comp, bmpheader.offset); | |
if( dibheader.comp != 0 || dibheader.bpp != 24 ) | |
{ | |
fprintf(stderr, "I only read Uncompressed 24bpp BMPs, no alpha, huffman, or RLE.\n"); | |
close(fd); | |
return -2; | |
} | |
size_t dlen = dibheader.width * dibheader.height * 3; | |
uint8_t *data = (uint8_t *)malloc(dlen); | |
read(fd, data, dlen); | |
bmp->w = dibheader.width; | |
bmp->h = dibheader.height; | |
bmp->data = data; | |
bmp->length = dlen; | |
close(fd); | |
return 0; | |
} | |
bool setupSDL(int w, int h); | |
bool renderSDL(); | |
void shutdownSDL(); | |
void blitData(const uint8_t *data, int useMode); | |
int usage(char *pname) | |
{ | |
fprintf(stderr, "Usage: %s <-c|-C|-g|-G>\n\t-c - CPU, interpolation method\n\t-C - CPU, memcpy method\n\t-g - CUDA, interpolation method\n\t-G - CUDA, memcpy method\n\n", pname); | |
return -1; | |
} | |
int main(int argc, char **argv) | |
{ | |
int useMode = MODE_CPU; | |
if( argc > 1 ) | |
{ | |
if( strcmp(argv[1], "-c") == 0 ) | |
useMode = MODE_CPU; | |
else if( strcmp(argv[1], "-C") == 0 ) | |
useMode = MODE_CPU_MEMCPY; | |
else if( strcmp(argv[1], "-g") == 0 ) | |
useMode = MODE_GPU; | |
else if( strcmp(argv[1], "-G") == 0 ) | |
useMode = MODE_GPU_MEMCPY; | |
else | |
return usage(argv[0]); | |
} else | |
return usage(argv[0]); | |
BMP bmp; | |
int rval = readbmp("test.bmp", &bmp); | |
if( rval != 0 ) | |
return rval; | |
if( !setupSDL(bmp.w, bmp.h) ) | |
return -4; | |
while( true == renderSDL() ) { | |
blitData(bmp.data, useMode); | |
} | |
if( 0 != bmp.data ) | |
free(bmp.data); | |
return 0; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdint.h> | |
#include <stdbool.h> | |
#include <sys/time.h> | |
#include <SDL.h> | |
#include <cuda_runtime.h> | |
#include "bleh.h" | |
bool Running = true; | |
SDL_Window* Window = NULL; | |
SDL_Surface* PrimarySurface = NULL; | |
SDL_Surface *imageSurface; | |
uint8_t *cudaSrc = NULL; | |
uint8_t *cudaDst = NULL; | |
void timespec_diff(struct timespec *start, struct timespec *stop, | |
struct timespec *result) | |
{ | |
if ((stop->tv_nsec - start->tv_nsec) < 0) { | |
result->tv_sec = stop->tv_sec - start->tv_sec - 1; | |
result->tv_nsec = stop->tv_nsec - start->tv_nsec + 1000000000; | |
} else { | |
result->tv_sec = stop->tv_sec - start->tv_sec; | |
result->tv_nsec = stop->tv_nsec - start->tv_nsec; | |
} | |
return; | |
} | |
#define COPYCLASS float | |
__device__ void devCpyCplx(const COPYCLASS *in, COPYCLASS *out, int len) | |
{ | |
for (int i=0; i < len/sizeof(COPYCLASS); ++i) | |
out[i] = in[i]; | |
} | |
__global__ void flipImage(uint8_t *src, uint8_t *dest, size_t stride, int rowCount, bool mc) | |
{ | |
// Row # is threadIdx... | |
if( threadIdx.x > rowCount ) | |
return; | |
int srcRow = threadIdx.x; | |
int dstRow = rowCount - threadIdx.x; | |
uint8_t *sPtr = src + (stride * srcRow); | |
uint8_t *dPtr = dest + (stride * dstRow); | |
if( false == mc ) | |
devCpyCplx((COPYCLASS*)sPtr, (COPYCLASS*)dPtr, stride); | |
else | |
memcpy(dPtr, sPtr, stride); | |
} | |
bool setupSDL(int w, int h) | |
{ | |
if(SDL_Init(SDL_INIT_VIDEO) < 0) { | |
fprintf(stderr, "Unable to Init SDL: %s\n", SDL_GetError()); | |
return false; | |
} | |
if(!SDL_SetHint(SDL_HINT_RENDER_SCALE_QUALITY, "1")) { | |
fprintf(stderr, "Unable to Init hinting: %s\n", SDL_GetError()); | |
} | |
if((Window = SDL_CreateWindow( | |
"SDL", | |
SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED, | |
w, h, SDL_WINDOW_SHOWN) | |
) == NULL) { | |
fprintf(stderr, "Unable to create SDL Window: %s\n", SDL_GetError()); | |
return false; | |
} | |
PrimarySurface = SDL_GetWindowSurface( Window ); | |
if( !PrimarySurface ) | |
{ | |
fprintf(stderr, "Failed to get window surface.\n"); | |
return false; | |
} | |
Running = true; | |
return true; | |
} | |
bool setupCuda(int w, int h) | |
{ | |
if( !cudaSrc ) | |
cudaMalloc(&cudaSrc, w * h * 3); | |
if( !cudaDst ) | |
cudaMalloc(&cudaDst, w * h * 3); | |
return cudaSrc && cudaDst; | |
} | |
void freeCuda() | |
{ | |
if( cudaSrc ) | |
cudaFree(cudaSrc); | |
if( cudaDst ) | |
cudaFree(cudaDst); | |
cudaSrc = NULL; | |
cudaDst = NULL; | |
} | |
void blitDataCuda(const uint8_t *data, bool mc) | |
{ | |
size_t imgSize = imageSurface->w * imageSurface->h * 3; | |
bool rval = setupCuda(imageSurface->w, imageSurface->h); | |
cudaMemcpy(cudaSrc, data, imgSize, cudaMemcpyHostToDevice); | |
flipImage <<< 1, imageSurface->h >>> (cudaSrc, cudaDst, imageSurface->w * 3, imageSurface->h, mc); | |
cudaDeviceSynchronize(); | |
SDL_LockSurface(imageSurface); | |
cudaMemcpy(imageSurface->pixels, cudaDst, imgSize, cudaMemcpyDeviceToHost); | |
SDL_UnlockSurface(imageSurface); | |
} | |
void blitDataCPU(const COPYCLASS *data) | |
{ | |
COPYCLASS *dst = (COPYCLASS *)imageSurface->pixels; | |
int stride = imageSurface->w * 3 / sizeof(COPYCLASS); | |
SDL_LockSurface(imageSurface); | |
for( int y=0; y < imageSurface->h; y++ ) | |
{ | |
int invy = imageSurface->h - y - 1; | |
for( int x=0; x < stride; x++ ) | |
{ | |
uint32_t dpos = invy * stride + x; | |
uint32_t spos = y * stride + x; | |
dst[dpos] = data[spos]; | |
} | |
} | |
SDL_UnlockSurface(imageSurface); | |
} | |
void blitDataCPUMemcpy(const uint8_t *data) | |
{ | |
uint8_t *dst = (uint8_t *)imageSurface->pixels; | |
int stride = imageSurface->w * 3; | |
SDL_LockSurface(imageSurface); | |
for( int y=0; y < imageSurface->h; y++ ) | |
{ | |
int invy = imageSurface->h - y - 1; | |
memcpy( dst + (stride * invy), data + (stride * y), stride ); | |
} | |
SDL_UnlockSurface(imageSurface); | |
} | |
void blitData(const uint8_t *data, int useMode) | |
{ | |
if( !imageSurface ) | |
{ | |
int w, h; | |
SDL_GetWindowSize(Window, &w, &h); | |
imageSurface = SDL_CreateRGBSurface(0, w, h, 24, 0xFF0000, 0x00FF00, 0x0000FF, 0x000000); | |
} | |
if( !imageSurface ) | |
return; | |
struct timespec beforeBlit, afterBlit, blitDiff; | |
clock_gettime(CLOCK_REALTIME, &beforeBlit); | |
bool rval = (useMode == MODE_GPU || useMode == MODE_GPU_MEMCPY) ? setupCuda(imageSurface->w, imageSurface->h) : false; | |
if( rval == true && useMode == MODE_GPU ) | |
blitDataCuda(data, false); | |
else if( rval == true && useMode == MODE_GPU_MEMCPY ) | |
blitDataCuda(data, true); | |
else if( useMode == MODE_CPU_MEMCPY ) | |
blitDataCPUMemcpy(data); | |
else if( useMode == MODE_CPU ) | |
blitDataCPU((COPYCLASS *)data); | |
else | |
printf("No copy method available.\n"); | |
clock_gettime(CLOCK_REALTIME, &afterBlit); | |
timespec_diff(&beforeBlit, &afterBlit, &blitDiff); | |
const char *target = (useMode == MODE_CPU || useMode == MODE_CPU_MEMCPY) ? "CPU" : "GPU"; | |
const char *u_memcpy = (useMode == MODE_CPU_MEMCPY || useMode == MODE_GPU_MEMCPY) ? "Memcpy" : "Interp"; | |
printf("Flip (%s %s): %ld.%09lds\n", target, u_memcpy, blitDiff.tv_sec, blitDiff.tv_nsec); | |
if( PrimarySurface ) | |
SDL_BlitSurface(imageSurface, NULL, PrimarySurface, NULL); | |
SDL_UpdateWindowSurface(Window); | |
} | |
void shutdownSDL() | |
{ | |
if(Window) { | |
SDL_DestroyWindow(Window); | |
Window = NULL; | |
} | |
if( imageSurface ) | |
{ | |
SDL_FreeSurface(imageSurface); | |
imageSurface = NULL; | |
} | |
SDL_Quit(); | |
} | |
bool renderSDL() | |
{ | |
SDL_Event Event; | |
while(SDL_PollEvent(&Event) != 0) { | |
if(Event.type == SDL_QUIT) | |
{ | |
fprintf(stderr, "Quit\n"); | |
Running = false; | |
} | |
} | |
if( Running ) | |
SDL_Delay(1); | |
else { | |
shutdownSDL(); | |
freeCuda(); | |
} | |
return Running; | |
} | |
uint16_t ConvertRGB888toRGB565(uint32_t sourceColor) | |
{ | |
unsigned int red = (sourceColor & 0x00FF0000) >> 16; | |
unsigned int green = (sourceColor & 0x0000FF00) >> 8; | |
unsigned int blue = sourceColor & 0x000000FF; | |
return (red >> 3 << 11) + (green >> 2 << 5) + (blue >> 3); | |
} | |
#if 0 | |
int main() | |
{ | |
uint32_t argb32 = 0xFFB57FE3; | |
uint8_t r8 = (argb32 & 0x00FF0000) >> 16; // 0xB5; | |
uint8_t g8 = (argb32 & 0x0000FF00) >> 8; // 0x7F; | |
uint8_t b8 = (argb32 & 0x000000FF); // 0xE3; | |
uint16_t rgb16; | |
uint8_t r5, g6, b5; | |
r5 = r8 >> 3; | |
g6 = g8 >> 2; | |
b5 = b8 >> 3; | |
rgb16 = (r5 << 11); | |
rgb16 += (g6 << 5); | |
rgb16 += b5; | |
printf("RGB888 0x%x%x%x/%u,%u,%u => RGB16 %d/%d/%d => %u/%04x\n", r8, g8, b8, r8, g8, b8, r5, g6, b5, rgb16, rgb16); | |
printf("RGB888 0x%x%x%x => RGB16 %u\n", r8, g8, b8, ConvertRGB888toRGB565( 0xB57FE3 ) ); | |
} | |
#endif |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment