nmoinvaz · May 10, 2026 06:51
diff --git a/benchmark_dist1.cc b/benchmark_dist1.cc
 /* benchmark_dist1.cc -- compare strategies for the dist=1 path in CHUNKMEMSET */

 #include <benchmark/benchmark.h>

 extern "C" {
 #  include "zbuild.h"
 #  include "zutil.h"
 }

 #if defined(__ARM_NEON) || defined(__ARM_NEON__)
 #  include <arm_neon.h>
 #endif

 #if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
 #  include <intrin.h>
 #  include <immintrin.h>
 #endif

 #if !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__))
 #  include <x86intrin.h>
 #  include <immintrin.h>
 #endif

 #ifdef _MSC_VER
 #  define BENCH_NOINLINE __declspec(noinline)
 #else
 #  define BENCH_NOINLINE __attribute__((noinline))
 #endif

 #define BUFSIZE 8192
 #define OFFSET  64

 /* --- portable variants -------------------------------------------------- */

 /* Baseline variant: hands the entire fill to the C library memset.
  * Writes `len` copies of `b` starting at `out` and returns the address one
  * past the last byte written. */
 BENCH_NOINLINE static uint8_t* dist1_memset(uint8_t *out, uint8_t b, size_t len) {
    uint8_t *const end = out + len;
    memset(out, b, len);
    return end;
 }

 /* Naive variant: stores `b` one byte at a time.
  * Writes `len` bytes starting at `out` and returns the address one past the
  * last byte written. */
 BENCH_NOINLINE static uint8_t* dist1_byteloop(uint8_t *out, uint8_t b, size_t len) {
    uint8_t *const end = out + len;
    while (out != end)
        *out++ = b;
    return end;
 }

 /* Word variant: replicates `b` into all eight bytes of a 64-bit pattern,
  * stores it in 8-byte steps, then finishes with 4/2/1-byte tail stores.
  * Returns the address one past the last byte written. */
 BENCH_NOINLINE static uint8_t* dist1_word_widen(uint8_t *out, uint8_t b, size_t len) {
    /* Every byte of the pattern equals b, so copying any prefix of it is
     * independent of byte order. */
    const uint64_t pattern = b * 0x0101010101010101ULL;

    for (; len >= 8; len -= 8, out += 8)
        memcpy(out, &pattern, 8);

    /* Fewer than 8 bytes remain; the low three bits of len select the tail stores. */
    if ((len & 4) != 0) {
        memcpy(out, &pattern, 4);
        out += 4;
    }
    if ((len & 2) != 0) {
        memcpy(out, &pattern, 2);
        out += 2;
    }
    if ((len & 1) != 0) {
        *out++ = (uint8_t)pattern;
    }
    return out;
 }

 /* --- aarch64 NEON variant ----------------------------------------------- */

 #if defined(__ARM_NEON) || defined(__ARM_NEON__)
 /* NEON variant: broadcasts `b` into a 16-byte vector, stores it in 16-byte
  * steps, then finishes the tail with 8/4/2/1-byte copies.
  * Returns the address one past the last byte written. */
 BENCH_NOINLINE static uint8_t* dist1_neon(uint8_t *out, uint8_t b, size_t len) {
    const uint8x16_t fill = vdupq_n_u8(b);

    for (; len >= 16; len -= 16, out += 16)
        vst1q_u8(out, fill);

    if (len == 0)
        return out;

    /* Spill the vector to a local array so the tail can be served by memcpy. */
    uint8_t spill[16];
    vst1q_u8(spill, fill);
    if ((len & 8) != 0) {
        memcpy(out, spill, 8);
        out += 8;
    }
    if ((len & 4) != 0) {
        memcpy(out, spill, 4);
        out += 4;
    }
    if ((len & 2) != 0) {
        memcpy(out, spill, 2);
        out += 2;
    }
    if ((len & 1) != 0) {
        *out++ = b;
    }
    return out;
 }
 #endif

 /* --- x86 SSE2 variant --------------------------------------------------- */

 #ifdef X86_SSE2
 /* SSE2 variant: broadcasts `b` into a 128-bit vector, issues unaligned
  * 16-byte stores, then finishes the tail with 8/4/2/1-byte stores.
  * Returns the address one past the last byte written. */
 BENCH_NOINLINE static uint8_t* dist1_sse2(uint8_t *out, uint8_t b, size_t len) {
    const __m128i fill = _mm_set1_epi8((char)b);

    for (; len >= 16; len -= 16, out += 16)
        _mm_storeu_si128((__m128i *)out, fill);

    /* Fewer than 16 bytes remain; each set bit of len selects one tail store. */
    if ((len & 8) != 0) {
        _mm_storel_epi64((__m128i *)out, fill);
        out += 8;
    }
    if ((len & 4) != 0) {
        memcpy(out, &fill, 4);
        out += 4;
    }
    if ((len & 2) != 0) {
        memcpy(out, &fill, 2);
        out += 2;
    }
    if ((len & 1) != 0) {
        *out++ = b;
    }
    return out;
 }
 #endif

 /* --- x86 AVX2 variant --------------------------------------------------- */

 #ifdef X86_AVX2
 /* AVX2 variant: broadcasts `b` into a 256-bit vector, issues unaligned
  * 32-byte stores, then narrows through 16/8/4/2/1-byte stores for the tail.
  * Returns the address one past the last byte written. */
 BENCH_NOINLINE static uint8_t* dist1_avx2(uint8_t *out, uint8_t b, size_t len) {
    const __m256i fill = _mm256_set1_epi8((char)b);

    for (; len >= 32; len -= 32, out += 32)
        _mm256_storeu_si256((__m256i *)out, fill);

    /* Fewer than 32 bytes remain; each set bit of len selects one tail store. */
    if ((len & 16) != 0) {
        _mm_storeu_si128((__m128i *)out, _mm256_castsi256_si128(fill));
        out += 16;
    }
    if ((len & 8) != 0) {
        _mm_storel_epi64((__m128i *)out, _mm256_castsi256_si128(fill));
        out += 8;
    }
    if ((len & 4) != 0) {
        uint32_t quad;
        memcpy(&quad, &fill, 4);
        memcpy(out, &quad, 4);
        out += 4;
    }
    if ((len & 2) != 0) {
        uint16_t pair;
        memcpy(&pair, &fill, 2);
        memcpy(out, &pair, 2);
        out += 2;
    }
    if ((len & 1) != 0) {
        *out++ = b;
    }
    return out;
 }
 #endif

 /* --- x86 AVX-512 variants ---------------------------------------------- */

 #ifdef X86_AVX512
 /* AVX-512 variant with a cascading tail: broadcasts `b` into a 512-bit
  * vector, issues unaligned 64-byte stores, then narrows through
  * 32/16/8/4/2/1-byte stores for whatever is left.
  * Returns the address one past the last byte written. */
 BENCH_NOINLINE static uint8_t* dist1_avx512(uint8_t *out, uint8_t b, size_t len) {
    const __m512i fill = _mm512_set1_epi8((char)b);

    for (; len >= 64; len -= 64, out += 64)
        _mm512_storeu_si512(out, fill);

    /* Fewer than 64 bytes remain; each set bit of len selects one tail store. */
    if ((len & 32) != 0) {
        _mm256_storeu_si256((__m256i *)out, _mm512_castsi512_si256(fill));
        out += 32;
    }
    if ((len & 16) != 0) {
        _mm_storeu_si128((__m128i *)out, _mm512_castsi512_si128(fill));
        out += 16;
    }
    if ((len & 8) != 0) {
        _mm_storel_epi64((__m128i *)out, _mm512_castsi512_si128(fill));
        out += 8;
    }
    if ((len & 4) != 0) {
        uint32_t quad;
        memcpy(&quad, &fill, 4);
        memcpy(out, &quad, 4);
        out += 4;
    }
    if ((len & 2) != 0) {
        uint16_t pair;
        memcpy(&pair, &fill, 2);
        memcpy(out, &pair, 2);
        out += 2;
    }
    if ((len & 1) != 0) {
        *out++ = b;
    }
    return out;
 }

 /* AVX-512 variant with a masked tail: broadcasts `b` into a 512-bit vector,
  * issues unaligned 64-byte stores, then writes the remaining bytes with a
  * single byte-masked store.
  * Returns the address one past the last byte written. */
 BENCH_NOINLINE static uint8_t* dist1_avx512_mask(uint8_t *out, uint8_t b, size_t len) {
    const __m512i fill = _mm512_set1_epi8((char)b);

    for (; len >= 64; len -= 64, out += 64)
        _mm512_storeu_si512(out, fill);

    if (len == 0)
        return out;

    /* len is now in 1..63, so the shift stays below 64 bits; the mask has
     * its low `len` bits set, one per byte to store. */
    const __mmask64 tail = ((uint64_t)1 << len) - 1;
    _mm512_mask_storeu_epi8(out, tail, fill);
    return out + len;
 }
 #endif

 /* --- benchmark fixture -------------------------------------------------- */

 class dist1: public benchmark::Fixture {
 private:
    uint8_t *buf;

 public:
    void SetUp(::benchmark::State& state) {
        buf = (uint8_t *)zng_alloc_aligned(BUFSIZE, 64);
        if (buf == NULL) { state.SkipWithError("malloc failed"); return; }
        memset(buf, 0xAB, BUFSIZE);
    }
    void TearDown(const ::benchmark::State&) { zng_free_aligned(buf); }

    template<uint8_t* (*FN)(uint8_t*, uint8_t, size_t)>
    void Bench(benchmark::State& state) {
        size_t len = (size_t)state.range(0);
        uint8_t *out = buf + OFFSET;
        uint8_t b = 0x42;
        for (auto _ : state) {
            benchmark::DoNotOptimize(out);
            uint8_t *r = FN(out, b, len);
            benchmark::DoNotOptimize(r);
            benchmark::ClobberMemory();
        }
    }
 };

 /* Argument list appended to every registration; each value is read back
  * through state.range(0) in dist1::Bench and used as the fill length. */
 #define ARGS Arg(3)->Arg(8)->Arg(16)->Arg(32)->Arg(64)->Arg(128)->Arg(258)

 /* Define a fixture benchmark called `name` whose body runs dist1::Bench
  * instantiated with `fn`, then register it with the ARGS lengths. */
 #define BENCH(name, fn) \
    BENCHMARK_DEFINE_F(dist1, name)(benchmark::State& state) { Bench<fn>(state); } \
    BENCHMARK_REGISTER_F(dist1, name)->ARGS

 /* Portable variants, registered unconditionally. */
 BENCH(memset,    dist1_memset);
 BENCH(byteloop,  dist1_byteloop);
 BENCH(word,      dist1_word_widen);

 /* SIMD variants, registered under the same guards that define them. */
 #if defined(__ARM_NEON) || defined(__ARM_NEON__)
 BENCH(neon,      dist1_neon);
 #endif

 #ifdef X86_SSE2
 BENCH(sse2,      dist1_sse2);
 #endif
 #ifdef X86_AVX2
 BENCH(avx2,      dist1_avx2);
 #endif
 #ifdef X86_AVX512
 BENCH(avx512,       dist1_avx512);
 BENCH(avx512_mask,  dist1_avx512_mask);
 #endif
diff --git a/gist_body.md b/gist_body.md
	/* benchmark_dist1.cc -- compare strategies for the dist=1 path in CHUNKMEMSET */

	#include <benchmark/benchmark.h>

	extern "C" {
	# include "zbuild.h"
	# include "zutil.h"
	}

	#if defined(__ARM_NEON) || defined(__ARM_NEON__)
	# include <arm_neon.h>
	#endif

	#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
	# include <intrin.h>
	# include <immintrin.h>
	#endif

	#if !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__))
	# include <x86intrin.h>
	# include <immintrin.h>
	#endif

	#ifdef _MSC_VER
	# define BENCH_NOINLINE __declspec(noinline)
	#else
	# define BENCH_NOINLINE __attribute__((noinline))
	#endif

	#define BUFSIZE 8192
	#define OFFSET 64

	/* --- portable variants -------------------------------------------------- */

	/* Baseline variant: hands the entire fill to the C library memset.
	 * Writes `len` copies of `b` starting at `out` and returns the address one
	 * past the last byte written. */
	BENCH_NOINLINE static uint8_t* dist1_memset(uint8_t *out, uint8_t b, size_t len) {
	    uint8_t *const end = out + len;
	    memset(out, b, len);
	    return end;
	}

	/* Naive variant: stores `b` one byte at a time.
	 * Writes `len` bytes starting at `out` and returns the address one past the
	 * last byte written. */
	BENCH_NOINLINE static uint8_t* dist1_byteloop(uint8_t *out, uint8_t b, size_t len) {
	    uint8_t *const end = out + len;
	    while (out != end)
	        *out++ = b;
	    return end;
	}

	/* Word variant: replicates `b` into all eight bytes of a 64-bit pattern,
	 * stores it in 8-byte steps, then finishes with 4/2/1-byte tail stores.
	 * Returns the address one past the last byte written. */
	BENCH_NOINLINE static uint8_t* dist1_word_widen(uint8_t *out, uint8_t b, size_t len) {
	    /* Every byte of the pattern equals b, so copying any prefix of it is
	     * independent of byte order. */
	    const uint64_t pattern = b * 0x0101010101010101ULL;

	    for (; len >= 8; len -= 8, out += 8)
	        memcpy(out, &pattern, 8);

	    /* Fewer than 8 bytes remain; the low three bits of len select the tail stores. */
	    if ((len & 4) != 0) {
	        memcpy(out, &pattern, 4);
	        out += 4;
	    }
	    if ((len & 2) != 0) {
	        memcpy(out, &pattern, 2);
	        out += 2;
	    }
	    if ((len & 1) != 0) {
	        *out++ = (uint8_t)pattern;
	    }
	    return out;
	}

	/* --- aarch64 NEON variant ----------------------------------------------- */

	#if defined(__ARM_NEON) || defined(__ARM_NEON__)
	/* NEON variant: broadcasts `b` into a 16-byte vector, stores it in 16-byte
	 * steps, then finishes the tail with 8/4/2/1-byte copies.
	 * Returns the address one past the last byte written. */
	BENCH_NOINLINE static uint8_t* dist1_neon(uint8_t *out, uint8_t b, size_t len) {
	    uint8x16_t v = vdupq_n_u8(b);
	    while (len >= 16) { vst1q_u8(out, v); out += 16; len -= 16; }
	    uint8_t tmp[16];
	    if (len) {
	        /* Spill the vector to a local array so the tail can be served by memcpy. */
	        vst1q_u8(tmp, v);
	        if (len & 8) { memcpy(out, tmp, 8); out += 8; }
	        if (len & 4) { memcpy(out, tmp, 4); out += 4; }
	        if (len & 2) { memcpy(out, tmp, 2); out += 2; }
	        if (len & 1) { *out++ = b; }
	    }
	    return out;
	}
	#endif

	/* --- x86 SSE2 variant --------------------------------------------------- */

	#ifdef X86_SSE2
	/* SSE2 variant: broadcasts `b` into a 128-bit vector, issues unaligned
	 * 16-byte stores, then finishes the tail with 8/4/2/1-byte stores.
	 * Returns the address one past the last byte written. */
	BENCH_NOINLINE static uint8_t* dist1_sse2(uint8_t *out, uint8_t b, size_t len) {
	    const __m128i fill = _mm_set1_epi8((char)b);

	    for (; len >= 16; len -= 16, out += 16)
	        _mm_storeu_si128((__m128i *)out, fill);

	    /* Fewer than 16 bytes remain; each set bit of len selects one tail store. */
	    if ((len & 8) != 0) {
	        _mm_storel_epi64((__m128i *)out, fill);
	        out += 8;
	    }
	    if ((len & 4) != 0) {
	        memcpy(out, &fill, 4);
	        out += 4;
	    }
	    if ((len & 2) != 0) {
	        memcpy(out, &fill, 2);
	        out += 2;
	    }
	    if ((len & 1) != 0) {
	        *out++ = b;
	    }
	    return out;
	}
	#endif

	/* --- x86 AVX2 variant --------------------------------------------------- */

	#ifdef X86_AVX2
	/* AVX2 variant: broadcasts `b` into a 256-bit vector, issues unaligned
	 * 32-byte stores, then narrows through 16/8/4/2/1-byte stores for the tail.
	 * Returns the address one past the last byte written. */
	BENCH_NOINLINE static uint8_t* dist1_avx2(uint8_t *out, uint8_t b, size_t len) {
	    const __m256i fill = _mm256_set1_epi8((char)b);

	    for (; len >= 32; len -= 32, out += 32)
	        _mm256_storeu_si256((__m256i *)out, fill);

	    /* Fewer than 32 bytes remain; each set bit of len selects one tail store. */
	    if ((len & 16) != 0) {
	        _mm_storeu_si128((__m128i *)out, _mm256_castsi256_si128(fill));
	        out += 16;
	    }
	    if ((len & 8) != 0) {
	        _mm_storel_epi64((__m128i *)out, _mm256_castsi256_si128(fill));
	        out += 8;
	    }
	    if ((len & 4) != 0) {
	        uint32_t quad;
	        memcpy(&quad, &fill, 4);
	        memcpy(out, &quad, 4);
	        out += 4;
	    }
	    if ((len & 2) != 0) {
	        uint16_t pair;
	        memcpy(&pair, &fill, 2);
	        memcpy(out, &pair, 2);
	        out += 2;
	    }
	    if ((len & 1) != 0) {
	        *out++ = b;
	    }
	    return out;
	}
	#endif

	/* --- x86 AVX-512 variants ---------------------------------------------- */

	#ifdef X86_AVX512
	/* AVX-512 variant with a cascading tail: broadcasts `b` into a 512-bit
	 * vector, issues unaligned 64-byte stores, then narrows through
	 * 32/16/8/4/2/1-byte stores for whatever is left.
	 * Returns the address one past the last byte written. */
	BENCH_NOINLINE static uint8_t* dist1_avx512(uint8_t *out, uint8_t b, size_t len) {
	    const __m512i fill = _mm512_set1_epi8((char)b);

	    for (; len >= 64; len -= 64, out += 64)
	        _mm512_storeu_si512(out, fill);

	    /* Fewer than 64 bytes remain; each set bit of len selects one tail store. */
	    if ((len & 32) != 0) {
	        _mm256_storeu_si256((__m256i *)out, _mm512_castsi512_si256(fill));
	        out += 32;
	    }
	    if ((len & 16) != 0) {
	        _mm_storeu_si128((__m128i *)out, _mm512_castsi512_si128(fill));
	        out += 16;
	    }
	    if ((len & 8) != 0) {
	        _mm_storel_epi64((__m128i *)out, _mm512_castsi512_si128(fill));
	        out += 8;
	    }
	    if ((len & 4) != 0) {
	        uint32_t quad;
	        memcpy(&quad, &fill, 4);
	        memcpy(out, &quad, 4);
	        out += 4;
	    }
	    if ((len & 2) != 0) {
	        uint16_t pair;
	        memcpy(&pair, &fill, 2);
	        memcpy(out, &pair, 2);
	        out += 2;
	    }
	    if ((len & 1) != 0) {
	        *out++ = b;
	    }
	    return out;
	}

	/* AVX-512 variant with a masked tail: broadcasts `b` into a 512-bit vector,
	 * issues unaligned 64-byte stores, then writes the remaining bytes with a
	 * single byte-masked store.
	 * Returns the address one past the last byte written. */
	BENCH_NOINLINE static uint8_t* dist1_avx512_mask(uint8_t *out, uint8_t b, size_t len) {
	    const __m512i fill = _mm512_set1_epi8((char)b);

	    for (; len >= 64; len -= 64, out += 64)
	        _mm512_storeu_si512(out, fill);

	    if (len == 0)
	        return out;

	    /* len is now in 1..63, so the shift stays below 64 bits; the mask has
	     * its low `len` bits set, one per byte to store. */
	    const __mmask64 tail = ((uint64_t)1 << len) - 1;
	    _mm512_mask_storeu_epi8(out, tail, fill);
	    return out + len;
	}
	#endif

	/* --- benchmark fixture -------------------------------------------------- */

	/* Benchmark fixture shared by every dist1 variant. It owns one 64-byte
	 * aligned scratch buffer of BUFSIZE bytes and times a fill routine writing
	 * at a fixed OFFSET into that buffer. */
	class dist1: public benchmark::Fixture {
	private:
	    uint8_t *buf;

	public:
	    /* Allocate the scratch buffer and pre-fill it with 0xAB; if allocation
	     * fails the benchmark is skipped with an error. */
	    void SetUp(::benchmark::State& state) {
	        buf = (uint8_t *)zng_alloc_aligned(BUFSIZE, 64);
	        if (buf == NULL) { state.SkipWithError("malloc failed"); return; }
	        memset(buf, 0xAB, BUFSIZE);
	    }

	    /* Release the scratch buffer obtained in SetUp. */
	    void TearDown(const ::benchmark::State&) { zng_free_aligned(buf); }

	    /* Time FN filling state.range(0) bytes of 0x42 at buf + OFFSET.
	     * FN must be a pointer to a function taking (uint8_t *out, uint8_t b,
	     * size_t len) and returning uint8_t*; the pointer markers are required
	     * for FN(out, b, len) and the Bench<dist1_*> instantiations to compile. */
	    template<uint8_t* (*FN)(uint8_t*, uint8_t, size_t)>
	    void Bench(benchmark::State& state) {
	        size_t len = (size_t)state.range(0);
	        uint8_t *out = buf + OFFSET;
	        uint8_t b = 0x42;
	        for (auto _ : state) {
	            /* Keep the compiler from reasoning about out or the result
	             * across iterations, and force the stores to be treated as
	             * observable. */
	            benchmark::DoNotOptimize(out);
	            uint8_t *r = FN(out, b, len);
	            benchmark::DoNotOptimize(r);
	            benchmark::ClobberMemory();
	        }
	    }
	};

	/* Argument list appended to every registration; each value is read back
	 * through state.range(0) in dist1::Bench and used as the fill length. */
	#define ARGS Arg(3)->Arg(8)->Arg(16)->Arg(32)->Arg(64)->Arg(128)->Arg(258)

	/* Define a fixture benchmark called `name` whose body runs dist1::Bench
	 * instantiated with `fn`, then register it with the ARGS lengths. */
	#define BENCH(name, fn) \
	    BENCHMARK_DEFINE_F(dist1, name)(benchmark::State& state) { Bench<fn>(state); } \
	    BENCHMARK_REGISTER_F(dist1, name)->ARGS

	/* Portable variants, registered unconditionally. */
	BENCH(memset, dist1_memset);
	BENCH(byteloop, dist1_byteloop);
	BENCH(word, dist1_word_widen);

	/* SIMD variants, registered under the same guards that define them. */
	#if defined(__ARM_NEON) || defined(__ARM_NEON__)
	BENCH(neon, dist1_neon);
	#endif

	#ifdef X86_SSE2
	BENCH(sse2, dist1_sse2);
	#endif
	#ifdef X86_AVX2
	BENCH(avx2, dist1_avx2);
	#endif
	#ifdef X86_AVX512
	BENCH(avx512, dist1_avx512);
	BENCH(avx512_mask, dist1_avx512_mask);
	#endif
Variant	Strategy	Arch
`memset`	`memset(out, b, len)` -- current	all
`byteloop`	`for (i=0; i<len; i++) out[i] = b`	all
`word`	`uint64_t w = b * 0x0101...ULL`, then 8/4/2/1 widening copies	all
`neon`	`vdupq_n_u8(b)` + 16-byte `vst1q_u8` chunks + 8/4/2/1 tail	aarch64
`sse2`	`_mm_set1_epi8(b)` + 16-byte `_mm_storeu_si128` chunks + 8/4/2/1 tail	x86_64
`avx2`	`_mm256_set1_epi8(b)` + 32-byte `_mm256_storeu_si256` chunks + widening tail	x86_64
`avx512`	`_mm512_set1_epi8(b)` + 64-byte `_mm512_storeu_si512` chunks + cascading tail	x86_64
`avx512_mask`	`_mm512_set1_epi8(b)` + 64-byte chunks + single masked store for tail	x86_64
len	memset	byteloop	word	neon	winner	vs memset
3	4.08	4.22	2.26	2.71	word	-45%
8	4.07	4.25	4.51	2.71	neon	-33%
16	4.51	4.60	5.45	5.88	memset	0%
32	5.54	5.59	6.01	6.10	memset	0%
64	4.34	4.00	4.19	4.07	byteloop	-8%
128	4.34	4.06	4.06	4.36	word	-6%
258	7.86	7.89	7.06	7.82	word	-10%
len	memset	byteloop	word	sse2	avx2	avx512	avx512_mask	winner	vs memset
3	4.59	5.09	3.10	3.90	4.00	5.48	2.78	avx512_mask	-39%
8	4.64	5.15	10.9	4.17	4.09	4.74	3.29	avx512_mask	-29%
16	4.50	4.99	11.0	4.14	3.79	6.44	2.64	avx512_mask	-41%
32	4.58	4.84	11.7	4.63	3.76	6.97	2.64	avx512_mask	-42%
64	6.23	6.31	12.8	5.97	5.10	5.65	2.63	avx512_mask	-58%
128	5.86	6.33	14.8	7.43	5.94	5.19	4.00	avx512_mask	-32%
258	6.29	6.64	19.2	8.29	9.53	6.10	4.25	avx512_mask	-32%
Arch	Best overall	Best for len <= 8	Best for len >= 64
aarch64	`word`	`word` (-45%)	all within noise
x86_64 (AVX-512)	`avx512_mask`	`avx512_mask` (-39%)	`avx512_mask` (-58%)
x86_64 (no AVX-512)	`avx2`	`word`/`avx2`	`avx2` (-18%)