Last active
July 29, 2022 19:03
-
-
Save dead-claudia/2023261ce67aafedcaecf1fc025365d8 to your computer and use it in GitHub Desktop.
Utility to compare various ways of counting zero and (really) non-zero bytes, including exhaustive tests
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Note: there are some Clang-isms here. To compile with GCC, define `__builtin_readcyclecounter` | |
// to read from something like x86's `rdtsc` and return a 64-bit integer with the cycle count. It's | |
// always manipulated relative to a prior call, so it doesn't matter if it wraps around once in the | |
// process. | |
#define RUN_ONLY_TESTS 1 | |
#define RUN_ONLY_BENCHMARK 2 | |
// Set to RUN_ONLY_TESTS to run only tests, RUN_ONLY_BENCHMARK to run only the benchmark, or unset | |
// to run both. | |
// #define RUN_VARIANT RUN_ONLY_BENCHMARK | |
// #define RUN_VARIANT RUN_ONLY_TESTS | |
#define BENCHMARK_ROUNDS 1 | |
#define _GNU_SOURCE | |
#include "sched.h" | |
#include "stdint.h" | |
#include "stdio.h" | |
#include "stdlib.h" | |
#include "time.h" | |
#include "unistd.h" | |
// Very branchy, but follows the definition very closely. | |
static uint32_t get_expected(uint32_t x) { | |
if ((x & 0xFFFFFFFF) == 0) return 0; | |
if ((x & 0xFFFFFF00) == 0) return 1; | |
if ((x & 0xFFFF00FF) == 0) return 1; | |
if ((x & 0xFF00FFFF) == 0) return 1; | |
if ((x & 0x00FFFFFF) == 0) return 1; | |
if ((x & 0xFFFF0000) == 0) return 2; | |
if ((x & 0xFF0000FF) == 0) return 2; | |
if ((x & 0x0000FFFF) == 0) return 2; | |
if ((x & 0x00FFFF00) == 0) return 2; | |
if ((x & 0xFF00FF00) == 0) return 2; | |
if ((x & 0x00FF00FF) == 0) return 2; | |
if ((x & 0x000000FF) == 0) return 3; | |
if ((x & 0x0000FF00) == 0) return 3; | |
if ((x & 0x00FF0000) == 0) return 3; | |
if ((x & 0xFF000000) == 0) return 3; | |
return 4; | |
} | |
uint32_t naive(uint32_t x) { | |
x |= x >> 4; | |
x |= x >> 2; | |
x |= x >> 1; | |
x &= 0x01010101; | |
x += x >> 16; | |
x += x >> 8; | |
return x & 255; | |
} | |
uint32_t naive2(uint32_t x) { | |
return ((x >> 24) != 0) + ((x >> 16 & 0xFF) != 0) + ((x >> 8 & 0xFF) != 0) + ((x & 0xFF) != 0); | |
} | |
uint32_t opt1(uint32_t x) { | |
x |= x >> 4; | |
x |= x >> 2; | |
x |= x >> 1; | |
return ((x & 0x01010101) * 0x01010101) >> 24; | |
} | |
uint32_t opt2(uint32_t x) { | |
x |= (x & 0x7F7F7F7F) + 0x7F7F7F7F; | |
x >>= 7; | |
return ((x & 0x01010101) * 0x01010101) >> 24; | |
} | |
uint32_t opt3(uint32_t x) { | |
x |= (x & 0x7F7F7F7F) + 0x7F7F7F7F; | |
return __builtin_popcount(x & 0x80808080); | |
} | |
static uint32_t mod255(uint32_t x) { | |
return (x + x / 255) & 255; | |
} | |
#define countmore(x, n) \ | |
(((((x) & ~0UL / 255 * 127) + ~0UL / 255 * (127 - (n)) | (x)) & ~0UL / 255 * 128) / 128 % 255) | |
#define countmore_with_optimized_mod_255(x, n) \ | |
mod255(((((x) & ~0UL / 255 * 127) + ~0UL / 255 * (127 - (n)) | (x)) & ~0UL / 255 * 128) / 128) | |
uint32_t seander(uint32_t x) { | |
return countmore(x, 0); | |
} | |
uint32_t seander_opt(uint32_t x) { | |
return countmore_with_optimized_mod_255(x, 0); | |
} | |
uint32_t zero_easy_naive(uint32_t x) { | |
return 4 - naive(x); | |
} | |
uint32_t zero_naive2(uint32_t x) { | |
return ((x >> 24) == 0) + ((x >> 16 & 0xFF) == 0) + ((x >> 8 & 0xFF) == 0) + ((x & 0xFF) == 0); | |
} | |
uint32_t zero_easy_opt1(uint32_t x) { | |
return 4 - opt1(x); | |
} | |
uint32_t zero_easy_opt2(uint32_t x) { | |
return 4 - opt2(x); | |
} | |
uint32_t zero_easy_opt3(uint32_t x) { | |
return 4 - opt3(x); | |
} | |
#define countless(x, n) \ | |
(((~0UL / 255 * (127 + (n)) - ((x) & ~0UL / 255 * 127)) & ~(x) & ~0UL / 255 * 128) / 128 % 255) | |
#define countless_with_optimized_mod_255(x, n) \ | |
mod255(((~0UL / 255 * (127 + (n)) - ((x) & ~0UL / 255 * 127)) & ~(x) & ~0UL / 255 * 128) / 128) | |
uint32_t zero_seander(uint32_t x) { | |
return countless(x, 1); | |
} | |
uint32_t zero_seander_opt(uint32_t x) { | |
return countless_with_optimized_mod_255(x, 1); | |
} | |
static const int MAX_FAILS = 20; | |
static int fails = 0; | |
static void fail(char *name, uint32_t i, uint32_t found, uint32_t expect) { | |
fprintf( | |
stderr, | |
"%s(%02x %02x %02x %02x) = %u, expected %u\n", | |
name, | |
i >> 24 & 0xFF, | |
i >> 16 & 0xFF, | |
i >> 8 & 0xFF, | |
i >> 0 & 0xFF, | |
found, | |
expect); | |
if (++fails == MAX_FAILS) exit(1); | |
} | |
#if !defined(RUN_VARIANT) || RUN_VARIANT == RUN_ONLY_TESTS | |
static void test_results() { | |
{ | |
uint32_t i = 0; | |
do { | |
uint32_t expect = get_expected(i); | |
uint32_t found = naive(i); | |
if (found != expect) fail("naive", i, found, expect); | |
if ((i & 0x00FFFFFF) == 0) fprintf(stderr, "naive: tested %02x 00 00 00\n", i >> 24); | |
} while (++i != 0); | |
} | |
fprintf(stderr, "naive tested\n"); | |
{ | |
uint32_t i = 0; | |
do { | |
uint32_t expect = get_expected(i); | |
uint32_t found = naive2(i); | |
if (found != expect) fail("naive2", i, found, expect); | |
if ((i & 0x00FFFFFF) == 0) fprintf(stderr, "naive2: tested %02x 00 00 00\n", i >> 24); | |
} while (++i != 0); | |
} | |
fprintf(stderr, "naive2 tested\n"); | |
{ | |
uint32_t i = 0; | |
do { | |
uint32_t expect = get_expected(i); | |
uint32_t found = opt1(i); | |
if (found != expect) fail("opt1", i, found, expect); | |
if ((i & 0x00FFFFFF) == 0) fprintf(stderr, "opt1: tested %02x 00 00 00\n", i >> 24); | |
} while (++i != 0); | |
} | |
fprintf(stderr, "opt1 tested\n"); | |
{ | |
uint32_t i = 0; | |
do { | |
uint32_t expect = get_expected(i); | |
uint32_t found = opt2(i); | |
if (found != expect) fail("opt2", i, found, expect); | |
if ((i & 0x00FFFFFF) == 0) fprintf(stderr, "opt2: tested %02x 00 00 00\n", i >> 24); | |
} while (++i != 0); | |
} | |
fprintf(stderr, "opt2 tested\n"); | |
{ | |
uint32_t i = 0; | |
do { | |
uint32_t expect = get_expected(i); | |
uint32_t found = opt3(i); | |
if (found != expect) fail("opt3", i, found, expect); | |
if ((i & 0x00FFFFFF) == 0) fprintf(stderr, "opt3: tested %02x 00 00 00\n", i >> 24); | |
} while (++i != 0); | |
} | |
fprintf(stderr, "opt3 tested\n"); | |
{ | |
uint32_t i = 0; | |
do { | |
uint32_t expect = get_expected(i); | |
uint32_t found = seander(i); | |
if (found != expect) fail("seander", i, found, expect); | |
if ((i & 0x00FFFFFF) == 0) fprintf(stderr, "seander: tested %02x 00 00 00\n", i >> 24); | |
} while (++i != 0); | |
} | |
fprintf(stderr, "seander countmore tested\n"); | |
{ | |
uint32_t i = 0; | |
do { | |
uint32_t expect = 4 - get_expected(i); | |
uint32_t found = zero_seander(i); | |
if (found != expect) fail("zero_seander", i, found, expect); | |
if ((i & 0x00FFFFFF) == 0) { | |
fprintf(stderr, "zero_seander: tested %02x 00 00 00\n", i >> 24); | |
} | |
} while (++i != 0); | |
} | |
fprintf(stderr, "seander countless tested\n"); | |
{ | |
uint32_t i = 0; | |
do { | |
uint32_t expect = 4 - get_expected(i); | |
uint32_t found = zero_seander_opt(i); | |
if (found != expect) fail("zero_seander_opt", i, found, expect); | |
if ((i & 0x00FFFFFF) == 0) { | |
fprintf(stderr, "zero_seander_opt: tested %02x 00 00 00\n", i >> 24); | |
} | |
} while (++i != 0); | |
} | |
fprintf(stderr, "seander countless with optimized modulo tested\n"); | |
{ | |
uint32_t i = 0; | |
do { | |
uint32_t expect = 4 - get_expected(i); | |
uint32_t found = zero_naive2(i); | |
if (found != expect) fail("zero_naive2", i, found, expect); | |
if ((i & 0x00FFFFFF) == 0) { | |
fprintf(stderr, "zero_naive2: tested %02x 00 00 00\n", i >> 24); | |
} | |
} while (++i != 0); | |
} | |
fprintf(stderr, "zero_naive2 tested\n"); | |
} | |
#endif // !defined(RUN_VARIANT) || RUN_VARIANT == RUN_ONLY_TESTS | |
#if !defined(RUN_VARIANT) || RUN_VARIANT != RUN_ONLY_TESTS | |
#define TO_STRING_INNER(X) #X | |
#define TO_STRING(X) TO_STRING_INNER(X) | |
volatile uint32_t threshold_value = 10; | |
static double get_cycles_per_op(uint64_t cycles) { | |
return (double)cycles / (double)((UINT64_C(1) << 32) * BENCHMARK_ROUNDS); | |
} | |
static uint32_t control_noop(uint32_t _x) { | |
asm volatile(""); | |
return 0; | |
} | |
static uint32_t control_and(uint32_t x) { | |
return x & 7; | |
} | |
static void test_perf() { | |
#if defined(RUN_VARIANT) && RUN_VARIANT == RUN_ONLY_BENCHMARK | |
#define perf_test(F) \ | |
fprintf(stderr, "testing " TO_STRING(F) "\n"); \ | |
for (uint32_t i = 0; i < 1 << 16; i++) { \ | |
uint32_t expect = get_expected(i); \ | |
uint32_t found = F(i); \ | |
if (found != expect) fail(TO_STRING(F), i, found, expect); \ | |
} | |
#define perf_test_zero(F) \ | |
fprintf(stderr, "testing " TO_STRING(F) "\n"); \ | |
for (uint32_t i = 0; i < 1 << 16; i++) { \ | |
uint32_t expect = 4 - get_expected(i); \ | |
uint32_t found = F(i); \ | |
if (found != expect) fail(TO_STRING(F), i, found, expect); \ | |
} | |
perf_test(naive); | |
perf_test(naive2); | |
perf_test(opt1); | |
perf_test(opt2); | |
perf_test(opt3); | |
perf_test(seander); | |
perf_test(seander_opt); | |
perf_test_zero(zero_easy_naive); | |
perf_test_zero(zero_naive2); | |
perf_test_zero(zero_easy_opt1); | |
perf_test_zero(zero_easy_opt2); | |
perf_test_zero(zero_easy_opt3); | |
perf_test_zero(zero_seander); | |
perf_test_zero(zero_seander_opt); | |
fprintf(stderr, "functions tested\n"); | |
#endif // defined(RUN_VARIANT) && RUN_VARIANT == RUN_ONLY_BENCHMARK | |
// Don't let it move between CPUs. | |
cpu_set_t cpu_set; | |
CPU_ZERO(&cpu_set); | |
CPU_SET(sched_getcpu(), &cpu_set); | |
sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set); | |
uint64_t span, start, end; | |
uint64_t threshold = threshold_value; | |
#define perf_run(F) \ | |
for (int i = 0; i < 1 << 24; i++) { \ | |
if (__builtin_expect(F(i) > threshold, 0)) { \ | |
fprintf(stderr, "fail " TO_STRING(F)); \ | |
abort(); \ | |
} \ | |
} \ | |
span = 0; \ | |
for (int j = 0; j < BENCHMARK_ROUNDS; j++) { \ | |
start = __builtin_readcyclecounter(); \ | |
uint32_t i = 0; \ | |
do { \ | |
if (__builtin_expect(F(i) > threshold, 0)) { \ | |
fprintf(stderr, "fail " TO_STRING(F)); \ | |
abort(); \ | |
} \ | |
} while (++i != 0); \ | |
end = __builtin_readcyclecounter(); \ | |
span += end - start; \ | |
} | |
perf_run(control_noop); | |
fprintf(stderr, " control noop: %.4f cycles/op\n", get_cycles_per_op(span)); | |
perf_run(control_and); | |
fprintf(stderr, " control and: %.4f cycles/op\n", get_cycles_per_op(span)); | |
perf_run(naive); | |
fprintf(stderr, " naive: %.4f cycles/op\n", get_cycles_per_op(span)); | |
perf_run(naive2); | |
fprintf(stderr, " naive2: %.4f cycles/op\n", get_cycles_per_op(span)); | |
perf_run(opt1); | |
fprintf(stderr, " opt1: %.4f cycles/op\n", get_cycles_per_op(span)); | |
perf_run(opt2); | |
fprintf(stderr, " opt2: %.4f cycles/op\n", get_cycles_per_op(span)); | |
perf_run(opt3); | |
fprintf(stderr, " opt3: %.4f cycles/op\n", get_cycles_per_op(span)); | |
perf_run(seander); | |
fprintf(stderr, " seander: %.4f cycles/op\n", get_cycles_per_op(span)); | |
perf_run(seander_opt); | |
fprintf(stderr, " seander_opt: %.4f cycles/op\n", get_cycles_per_op(span)); | |
perf_run(zero_easy_naive); | |
fprintf(stderr, " zero_easy_naive: %.4f cycles/op\n", get_cycles_per_op(span)); | |
perf_run(zero_naive2); | |
fprintf(stderr, " zero_naive2: %.4f cycles/op\n", get_cycles_per_op(span)); | |
perf_run(zero_easy_opt1); | |
fprintf(stderr, " zero_easy_opt1: %.4f cycles/op\n", get_cycles_per_op(span)); | |
perf_run(zero_easy_opt2); | |
fprintf(stderr, " zero_easy_opt2: %.4f cycles/op\n", get_cycles_per_op(span)); | |
perf_run(zero_easy_opt3); | |
fprintf(stderr, " zero_easy_opt3: %.4f cycles/op\n", get_cycles_per_op(span)); | |
perf_run(zero_seander); | |
fprintf(stderr, " zero_seander: %.4f cycles/op\n", get_cycles_per_op(span)); | |
perf_run(zero_seander_opt); | |
fprintf(stderr, "zero_seander_opt: %.4f cycles/op\n", get_cycles_per_op(span)); | |
} | |
#endif // !defined(RUN_VARIANT) || RUN_VARIANT != RUN_ONLY_TESTS | |
int main() { | |
#if !defined(RUN_VARIANT) || RUN_VARIANT == RUN_ONLY_TESTS | |
test_results(); | |
#endif // !defined(RUN_VARIANT) || RUN_VARIANT == RUN_ONLY_TESTS | |
#if !defined(RUN_VARIANT) || RUN_VARIANT != RUN_ONLY_TESTS | |
test_perf(); | |
#endif // !defined(RUN_VARIANT) || RUN_VARIANT != RUN_ONLY_TESTS | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment