Created on December 26, 2015 at 23:32.
Save gist nkurz/9a0ed5a9a6e591019b8e to your computer, or open it in GitHub Desktop.
Are sustained loads of 64B per cycle possible on Haswell and Skylake?
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// gcc -fno-inline -std=gnu99 -Wall -O3 -g -march=native l1d.c -o l1d | |
#include <sys/types.h> | |
#include <stdint.h> | |
#include <string.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <x86intrin.h> | |
#include <math.h> | |
#ifndef SIZE | |
#define SIZE 4096 | |
#endif | |
#ifndef REPEAT | |
#define REPEAT 100000 | |
#endif | |
#define RDTSC_START(cycles) \ | |
do { \ | |
register unsigned cyc_high, cyc_low; \ | |
__asm volatile("cpuid\n\t" \ | |
"rdtsc\n\t" \ | |
"mov %%edx, %0\n\t" \ | |
"mov %%eax, %1\n\t" \ | |
: "=r" (cyc_high), "=r" (cyc_low) \ | |
:: "%rax", "%rbx", "%rcx", "%rdx"); \ | |
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \ | |
} while (0) | |
#define RDTSC_FINAL(cycles) \ | |
do { \ | |
register unsigned cyc_high, cyc_low; \ | |
__asm volatile("rdtscp\n\t" \ | |
"mov %%edx, %0\n\t" \ | |
"mov %%eax, %1\n\t" \ | |
"cpuid\n\t" \ | |
: "=r" (cyc_high), "=r" (cyc_low) \ | |
:: "%rax", "%rbx", "%rcx", "%rdx"); \ | |
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \ | |
} while(0) | |
#define BEST_TIME(test, answer) \ | |
do { \ | |
printf("%s: ", #test); \ | |
fflush(NULL); \ | |
uint64_t cycles_start, cycles_final, cycles_diff; \ | |
uint64_t min_diff = (uint64_t) -1; \ | |
int wrong_answer = 0; \ | |
for (int i = 0; i < REPEAT; i++) { \ | |
__asm volatile (""::: /* pretend to clobber */ "memory"); \ | |
RDTSC_START(cycles_start); \ | |
float result = test; \ | |
RDTSC_FINAL(cycles_final); \ | |
if (result != answer) wrong_answer = 1; \ | |
cycles_diff = (cycles_final - cycles_start); \ | |
if (cycles_diff < min_diff) min_diff = cycles_diff; \ | |
} \ | |
float cycles_per_input = min_diff / (float) (SIZE); \ | |
printf(" %.2f cycles/input", cycles_per_input); \ | |
if (wrong_answer) printf(" [ERROR]"); \ | |
printf("\n"); \ | |
fflush(NULL); \ | |
} while (0) | |
#define VEC_LOAD_OFFSET_BASE(load, offset, base) \ | |
__asm volatile ("vmovups %c1(%2), %0": \ | |
"=x" (load): /* xmm or ymm destination register */ \ | |
"i" (offset), /* constant array offset in bytes */ \ | |
"r" (base) /* read only memory location */ \ | |
) | |
#define VEC_FMA_SUM_MULT_OFFSET_BASE(sum, mult, offset, base) \ | |
__asm volatile ("vfmadd231ps %c2(%3), %1, %0": \ | |
"+x" (sum): /* sum = sum + (mult * [mem]) */ \ | |
"x" (mult), /* xmm or ymm vec of floats */ \ | |
"i" (offset), /* constant array offset in bytes */ \ | |
"r" (base) /* read only memory location */ \ | |
) | |
typedef __m256 ymm_t; | |
float calc_simple(float *array1, float *array2, size_t size) { | |
float total = 0.0; | |
for (size_t i = 0; i < size; i++) { | |
float sum = array1[i] * array2[i]; | |
total += sum; | |
} | |
return total; | |
} | |
float calc_fma(float *array1, float *array2, size_t size) { | |
ymm_t sum1 = {0, 0}; | |
ymm_t sum2 = {0, 0}; | |
ymm_t sum3 = {0, 0}; | |
ymm_t sum4 = {0, 0}; | |
if (size % 32 != 0) return NAN; | |
for (size_t i = 0; i < size; i += 32) { | |
ymm_t mult1, mult2, mult3, mult4; | |
VEC_LOAD_OFFSET_BASE(mult1, 0, array1); | |
VEC_LOAD_OFFSET_BASE(mult2, 32, array1); | |
VEC_LOAD_OFFSET_BASE(mult3, 64, array1); | |
VEC_LOAD_OFFSET_BASE(mult4, 96, array1); | |
VEC_FMA_SUM_MULT_OFFSET_BASE(sum1, mult1, 0, array2); | |
VEC_FMA_SUM_MULT_OFFSET_BASE(sum2, mult2, 32, array2); | |
VEC_FMA_SUM_MULT_OFFSET_BASE(sum3, mult3, 64, array2); | |
VEC_FMA_SUM_MULT_OFFSET_BASE(sum4, mult4, 96, array2); | |
array1 += 32; | |
array2 += 32; | |
} | |
sum1 = _mm256_add_ps(sum1, sum2); | |
sum3 = _mm256_add_ps(sum3, sum4); | |
sum1 = _mm256_add_ps(sum1, sum3); | |
ymm_t r2 = _mm256_hadd_ps(sum1, sum1); | |
ymm_t r3 = _mm256_hadd_ps(r2, r2); | |
ymm_t r4 = _mm256_hadd_ps(r3, r3); | |
float total = _mm_cvtss_f32(_mm256_extractf128_ps(r4,0)); | |
return total; | |
} | |
float calc_fma_reordered(float *array1, float *array2, size_t size) { | |
ymm_t sum1 = {0, 0}; | |
ymm_t sum2 = {0, 0}; | |
ymm_t sum3 = {0, 0}; | |
ymm_t sum4 = {0, 0}; | |
if (size % 32 != 0) return NAN; | |
float *end2 = array2 + size; | |
while (array2 < end2) { | |
ymm_t mult1, mult2, mult3, mult4; | |
VEC_LOAD_OFFSET_BASE(mult4, 96, array1); | |
VEC_LOAD_OFFSET_BASE(mult2, 32, array1); | |
VEC_LOAD_OFFSET_BASE(mult3, 64, array1); | |
VEC_LOAD_OFFSET_BASE(mult1, 0, array1); | |
VEC_FMA_SUM_MULT_OFFSET_BASE(sum4, mult4, 96, array2); | |
VEC_FMA_SUM_MULT_OFFSET_BASE(sum2, mult2, 32, array2); | |
VEC_FMA_SUM_MULT_OFFSET_BASE(sum3, mult3, 64, array2); | |
VEC_FMA_SUM_MULT_OFFSET_BASE(sum1, mult1, 0, array2); | |
array1 += 32; | |
array2 += 32; | |
} | |
sum1 = _mm256_add_ps(sum1, sum2); | |
sum3 = _mm256_add_ps(sum3, sum4); | |
sum1 = _mm256_add_ps(sum1, sum3); | |
ymm_t r2 = _mm256_hadd_ps(sum1, sum1); | |
ymm_t r3 = _mm256_hadd_ps(r2, r2); | |
ymm_t r4 = _mm256_hadd_ps(r3, r3); | |
float total = _mm_cvtss_f32(_mm256_extractf128_ps(r4,0)); | |
return total; | |
} | |
float calc_load_only(float *array1, float *array2, size_t size) { | |
if (size % 32 != 0) return NAN; | |
for (size_t i = 0; i < size; i += 32) { | |
ymm_t dummy; | |
VEC_LOAD_OFFSET_BASE(dummy, 0, array1); | |
VEC_LOAD_OFFSET_BASE(dummy, 32, array1); | |
VEC_LOAD_OFFSET_BASE(dummy, 64, array1); | |
VEC_LOAD_OFFSET_BASE(dummy, 96, array1); | |
VEC_LOAD_OFFSET_BASE(dummy, 0, array2); | |
VEC_LOAD_OFFSET_BASE(dummy, 32, array2); | |
VEC_LOAD_OFFSET_BASE(dummy, 64, array2); | |
VEC_LOAD_OFFSET_BASE(dummy, 96, array2); | |
array1 += 32; | |
array2 += 32; | |
} | |
return 0.0; | |
} | |
float calc_load_only_reordered(float *array1, float *array2, size_t size) { | |
if (size % 32 != 0) return NAN; | |
float *end2 = array2 + size; | |
while (array2 < end2) { | |
ymm_t dummy; | |
VEC_LOAD_OFFSET_BASE(dummy, 96, array1); | |
VEC_LOAD_OFFSET_BASE(dummy, 32, array1); | |
VEC_LOAD_OFFSET_BASE(dummy, 64, array1); | |
VEC_LOAD_OFFSET_BASE(dummy, 0, array1); | |
VEC_LOAD_OFFSET_BASE(dummy, 96, array2); | |
VEC_LOAD_OFFSET_BASE(dummy, 32, array2); | |
VEC_LOAD_OFFSET_BASE(dummy, 64, array2); | |
VEC_LOAD_OFFSET_BASE(dummy, 0, array2); | |
array1 += 32; | |
array2 += 32; | |
} | |
return 0.0; | |
} | |
int main(int argc, char **argv) { | |
printf("Testing with SIZE=%d...\n", SIZE); | |
size_t size = SIZE; | |
float *array1 = malloc(SIZE * sizeof(float)); | |
float *array2 = malloc(SIZE * sizeof(float)); | |
for (size_t i = 0; i < size; i++) { | |
array1[i] = 1.0; | |
array2[i] = 2.0; | |
} | |
float answer = calc_simple(array1, array2, size); | |
BEST_TIME(calc_simple(array1, array2, size), answer); | |
BEST_TIME(calc_fma(array1, array2, size), answer); | |
BEST_TIME(calc_fma_reordered(array1, array2, size), answer); | |
BEST_TIME(calc_load_only(array1, array2, size), answer); | |
BEST_TIME(calc_load_only_reordered(array1, array2, size), answer); | |
} |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.