Skip to content

Instantly share code, notes, and snippets.

@nmoinvaz
Last active May 10, 2026 06:51
Show Gist options
  • Select an option

  • Save nmoinvaz/7c22772d399444b69f49655b7f181c3f to your computer and use it in GitHub Desktop.

Select an option

Save nmoinvaz/7c22772d399444b69f49655b7f181c3f to your computer and use it in GitHub Desktop.
zlib-ng PR #2286: microbenchmark of memset replacements for the dist=1 path in CHUNKMEMSET
/* benchmark_dist1.cc -- compare strategies for the dist=1 path in CHUNKMEMSET */
#include <benchmark/benchmark.h>
extern "C" {
# include "zbuild.h"
# include "zutil.h"
}
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
# include <arm_neon.h>
#endif
#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
# include <intrin.h>
# include <immintrin.h>
#endif
#if !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__))
# include <x86intrin.h>
# include <immintrin.h>
#endif
/* BENCH_NOINLINE: compiler-specific "never inline" marker applied to every
 * fill variant below, so each one is invoked through a real call. */
#ifdef _MSC_VER
# define BENCH_NOINLINE __declspec(noinline)
#else
# define BENCH_NOINLINE __attribute__((noinline))
#endif
/* Size in bytes of the scratch buffer the fixture allocates. */
#define BUFSIZE 8192
/* Byte offset into the scratch buffer at which every fill starts. */
#define OFFSET 64
/* --- portable variants -------------------------------------------------- */
/* Baseline variant: hand the whole fill to memset().
 *   out - first byte to write
 *   b   - byte value to replicate
 *   len - number of bytes to write
 * Returns the address one past the last byte written. */
BENCH_NOINLINE static uint8_t* dist1_memset(uint8_t *out, uint8_t b, size_t len) {
    uint8_t *const end = out + len;
    memset(out, b, len);
    return end;
}
/* Naive variant: store the byte one position at a time.
 *   out - first byte to write
 *   b   - byte value to replicate
 *   len - number of bytes to write
 * Returns the address one past the last byte written. */
BENCH_NOINLINE static uint8_t* dist1_byteloop(uint8_t *out, uint8_t b, size_t len) {
    uint8_t *const end = out + len;
    for (uint8_t *cursor = out; cursor != end; ++cursor)
        *cursor = b;
    return end;
}
/* Word variant: replicate the byte across a 64-bit word, emit whole 8-byte
 * words, then finish with 4-, 2- and 1-byte stores for the remainder.
 *   out - first byte to write
 *   b   - byte value to replicate
 *   len - number of bytes to write
 * Returns the address one past the last byte written. */
BENCH_NOINLINE static uint8_t* dist1_word_widen(uint8_t *out, uint8_t b, size_t len) {
    const uint64_t pattern = b * 0x0101010101010101ULL;

    for (size_t words = len >> 3; words != 0; --words) {
        memcpy(out, &pattern, 8);
        out += 8;
    }

    /* The low three bits of len describe what is left after the 8-byte loop. */
    if (len & 4) {
        memcpy(out, &pattern, 4);
        out += 4;
    }
    if (len & 2) {
        memcpy(out, &pattern, 2);
        out += 2;
    }
    if (len & 1)
        *out++ = (uint8_t)pattern;

    return out;
}
/* --- aarch64 NEON variant ----------------------------------------------- */
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
BENCH_NOINLINE static uint8_t* dist1_neon(uint8_t *out, uint8_t b, size_t len) {
uint8x16_t v = vdupq_n_u8(b);
while (len >= 16) { vst1q_u8(out, v); out += 16; len -= 16; }
uint8_t tmp[16];
if (len) {
vst1q_u8(tmp, v);
if (len & 8) { memcpy(out, tmp, 8); out += 8; }
if (len & 4) { memcpy(out, tmp, 4); out += 4; }
if (len & 2) { memcpy(out, tmp, 2); out += 2; }
if (len & 1) { *out++ = b; }
}
return out;
}
#endif
/* --- x86 SSE2 variant --------------------------------------------------- */
#ifdef X86_SSE2
BENCH_NOINLINE static uint8_t* dist1_sse2(uint8_t *out, uint8_t b, size_t len) {
__m128i v = _mm_set1_epi8((char)b);
while (len >= 16) {
_mm_storeu_si128((__m128i *)out, v);
out += 16;
len -= 16;
}
if (len & 8) { _mm_storel_epi64((__m128i *)out, v); out += 8; }
if (len & 4) { memcpy(out, &v, 4); out += 4; }
if (len & 2) { memcpy(out, &v, 2); out += 2; }
if (len & 1) { *out++ = b; }
return out;
}
#endif
/* --- x86 AVX2 variant --------------------------------------------------- */
#ifdef X86_AVX2
BENCH_NOINLINE static uint8_t* dist1_avx2(uint8_t *out, uint8_t b, size_t len) {
__m256i v = _mm256_set1_epi8((char)b);
while (len >= 32) {
_mm256_storeu_si256((__m256i *)out, v);
out += 32;
len -= 32;
}
if (len & 16) { _mm_storeu_si128((__m128i *)out, _mm256_castsi256_si128(v)); out += 16; }
if (len & 8) { _mm_storel_epi64((__m128i *)out, _mm256_castsi256_si128(v)); out += 8; }
if (len & 4) { uint32_t w4; memcpy(&w4, &v, 4); memcpy(out, &w4, 4); out += 4; }
if (len & 2) { uint16_t w2; memcpy(&w2, &v, 2); memcpy(out, &w2, 2); out += 2; }
if (len & 1) { *out++ = b; }
return out;
}
#endif
/* --- x86 AVX-512 variants ---------------------------------------------- */
#ifdef X86_AVX512
BENCH_NOINLINE static uint8_t* dist1_avx512(uint8_t *out, uint8_t b, size_t len) {
__m512i v = _mm512_set1_epi8((char)b);
while (len >= 64) {
_mm512_storeu_si512(out, v);
out += 64;
len -= 64;
}
if (len & 32) { _mm256_storeu_si256((__m256i *)out, _mm512_castsi512_si256(v)); out += 32; }
if (len & 16) { _mm_storeu_si128((__m128i *)out, _mm512_castsi512_si128(v)); out += 16; }
if (len & 8) { _mm_storel_epi64((__m128i *)out, _mm512_castsi512_si128(v)); out += 8; }
if (len & 4) { uint32_t w4; memcpy(&w4, &v, 4); memcpy(out, &w4, 4); out += 4; }
if (len & 2) { uint16_t w2; memcpy(&w2, &v, 2); memcpy(out, &w2, 2); out += 2; }
if (len & 1) { *out++ = b; }
return out;
}
BENCH_NOINLINE static uint8_t* dist1_avx512_mask(uint8_t *out, uint8_t b, size_t len) {
__m512i v = _mm512_set1_epi8((char)b);
while (len >= 64) {
_mm512_storeu_si512(out, v);
out += 64;
len -= 64;
}
if (len) {
__mmask64 mask = ((uint64_t)1 << len) - 1;
_mm512_mask_storeu_epi8(out, mask, v);
out += len;
}
return out;
}
#endif
/* --- benchmark fixture -------------------------------------------------- */
/* Fixture shared by every dist=1 fill variant. It owns one scratch buffer of
 * BUFSIZE bytes and times a fill function writing range(0) bytes of 0x42
 * starting OFFSET bytes into that buffer. */
class dist1: public benchmark::Fixture {
private:
    uint8_t *buf;

public:
    /* Allocate the scratch buffer and pre-fill it with 0xAB; the benchmark is
     * skipped with an error when the allocation fails. */
    void SetUp(::benchmark::State& state) {
        buf = (uint8_t *)zng_alloc_aligned(BUFSIZE, 64);
        if (buf != NULL) {
            memset(buf, 0xAB, BUFSIZE);
        } else {
            state.SkipWithError("malloc failed");
        }
    }

    /* Release the scratch buffer obtained in SetUp. */
    void TearDown(const ::benchmark::State&) {
        zng_free_aligned(buf);
    }

    /* Timed loop: call FN once per iteration on the same destination. */
    template<uint8_t* (*FN)(uint8_t*, uint8_t, size_t)>
    void Bench(benchmark::State& state) {
        const size_t fill_len = (size_t)state.range(0);
        const uint8_t fill_byte = 0x42;
        uint8_t *dst = buf + OFFSET;

        for (auto _ : state) {
            /* Keep the destination pointer, the returned pointer and the
             * written memory visible to the optimizer. */
            benchmark::DoNotOptimize(dst);
            uint8_t *end = FN(dst, fill_byte, fill_len);
            benchmark::DoNotOptimize(end);
            benchmark::ClobberMemory();
        }
    }
};
/* Fill lengths passed to every benchmark as range(0).
 * NOTE(review): 258 is presumably deflate's maximum match length -- confirm. */
#define ARGS Arg(3)->Arg(8)->Arg(16)->Arg(32)->Arg(64)->Arg(128)->Arg(258)
/* BENCH(name, fn): define a dist1 fixture benchmark called `name` whose body
 * runs Bench<fn>, then register it with the ARGS length list. The trailing
 * semicolon is supplied at the point of use. */
#define BENCH(name, fn) \
BENCHMARK_DEFINE_F(dist1, name)(benchmark::State& state) { Bench<fn>(state); } \
BENCHMARK_REGISTER_F(dist1, name)->ARGS
/* Portable variants: always registered. */
BENCH(memset, dist1_memset);
BENCH(byteloop, dist1_byteloop);
BENCH(word, dist1_word_widen);
/* Architecture-specific variants: each is registered under the same
 * preprocessor condition that guards its definition. */
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
BENCH(neon, dist1_neon);
#endif
#ifdef X86_SSE2
BENCH(sse2, dist1_sse2);
#endif
#ifdef X86_AVX2
BENCH(avx2, dist1_avx2);
#endif
#ifdef X86_AVX512
BENCH(avx512, dist1_avx512);
BENCH(avx512_mask, dist1_avx512_mask);
#endif

CHUNKMEMSET dist=1 path: benchmarking memset replacements

Background

CHUNKMEMSET in zlib-ng has an early bypass for dist == 1:

if (dist == 1) {
    memset(out, *from, len);
    return out + len;
}

This handles back-references where every output byte is the same byte (RLE-1). It's a hot path for repetitive input. Under zlib-ng/zlib-ng#2286 we found benchmarks showing the dist=1 case at ~6 ns on Apple M-series for short lens, dominated by the libc memset call overhead.

We want to know: is memset(out, b, len) actually optimal for the small-len cases that dominate inflate output, or would an inline alternative be faster?

Implementations tested

| Variant | Strategy | Arch |
|---|---|---|
| memset | `memset(out, b, len)` -- current | all |
| byteloop | `for (i=0; i<len; i++) out[i] = b` | all |
| word | `uint64_t w = b * 0x0101...ULL`, then 8/4/2/1 widening copies | all |
| neon | `vdupq_n_u8(b)` + 16-byte `vst1q_u8` chunks + 8/4/2/1 tail | aarch64 |
| sse2 | `_mm_set1_epi8(b)` + 16-byte `_mm_storeu_si128` chunks + 8/4/2/1 tail | x86_64 |
| avx2 | `_mm256_set1_epi8(b)` + 32-byte `_mm256_storeu_si256` chunks + widening tail | x86_64 |
| avx512 | `_mm512_set1_epi8(b)` + 64-byte `_mm512_storeu_si512` chunks + cascading tail | x86_64 |
| avx512_mask | `_mm512_set1_epi8(b)` + 64-byte chunks + single masked store for tail | x86_64 |

Each implementation is marked noinline to defeat the compiler folding all variants to identical instruction sequences. The benchmark loop uses benchmark::DoNotOptimize(out) and benchmark::ClobberMemory() so the writes can't be DCE'd.

Benchmark source

See attached benchmark_dist1.cc. It compiles on both aarch64 (NEON) and x86_64 (SSE2/AVX2/AVX-512) and integrates into the zlib-ng benchmark harness.

Run with:

benchmark_zlib --benchmark_filter='dist1/' --benchmark_min_time=0.5s --benchmark_repetitions=20 --benchmark_report_aggregates_only=true

Results -- Apple M-series (aarch64)

Machine: Apple M-series, macOS, Clang
20 reps, 5s cooldown between cases. All times in nanoseconds (median).

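| len | memset | byteloop | word | neon | winner | vs memset |
|---:|---:|---:|---:|---:|---|---:|
| 3 | 4.08 | 4.22 | 2.26 | 2.71 | word | -45% |
| 8 | 4.07 | 4.25 | 4.51 | 2.71 | neon | -33% |
| 16 | 4.51 | 4.60 | 5.45 | 5.88 | memset | 0% |
| 32 | 5.54 | 5.59 | 6.01 | 6.10 | memset | 0% |
| 64 | 4.34 | 4.00 | 4.19 | 4.07 | byteloop | -8% |
| 128 | 4.34 | 4.06 | 4.06 | 4.36 | word | -6% |
| 258 | 7.86 | 7.89 | 7.06 | 7.82 | word | -10% |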

Observations (aarch64)

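  • len <= 8: the best inline variant beats libc memset by 33-45% (word at len=3, neon at len=8); note that word itself is ~11% slower than memset at len=8 (4.51 ns vs 4.07 ns). The libc call overhead (~3-4 ns) dominates at these sizes.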
  • len = 16-32: libc memset wins. The call overhead amortizes and libc has well-tuned SIMD for these sizes.
  • len >= 64: all four are within noise of each other (~4-8 ns). The data movement, not the dispatch, dominates.

The word variant looks like the best general replacement on aarch64: never worse than memset by more than ~21% (len=16: 5.45 ns vs 4.51 ns), and faster at the very-short and very-long ends (-45% at len=3, -10% at len=258). The deficit at len=8-32 is small in absolute terms (under ~1 ns).


Results -- Intel Tiger Lake (x86_64)

Machine: 11th Gen Intel Core i7-1185G7 @ 3.00GHz (4C/8T), Windows 11, MSVC 19.50
20 reps, 0.5s min_time. All times in nanoseconds (median).

| len | memset | byteloop | word | sse2 | avx2 | avx512 | avx512_mask | winner | vs memset |
|---:|---:|---:|---:|---:|---:|---:|---:|---|---:|
| 3 | 4.59 | 5.09 | 3.10 | 3.90 | 4.00 | 5.48 | 2.78 | avx512_mask | -39% |
| 8 | 4.64 | 5.15 | 10.9 | 4.17 | 4.09 | 4.74 | 3.29 | avx512_mask | -29% |
| 16 | 4.50 | 4.99 | 11.0 | 4.14 | 3.79 | 6.44 | 2.64 | avx512_mask | -41% |
| 32 | 4.58 | 4.84 | 11.7 | 4.63 | 3.76 | 6.97 | 2.64 | avx512_mask | -42% |
| 64 | 6.23 | 6.31 | 12.8 | 5.97 | 5.10 | 5.65 | 2.63 | avx512_mask | -58% |
| 128 | 5.86 | 6.33 | 14.8 | 7.43 | 5.94 | 5.19 | 4.00 | avx512_mask | -32% |
| 258 | 6.29 | 6.64 | 19.2 | 8.29 | 9.53 | 6.10 | 4.25 | avx512_mask | -32% |

Observations (x86_64)

  • avx512_mask dominates every length (2.6-4.3 ns). The single masked tail store eliminates all branch/conditional logic, and on Tiger Lake the masked store itself is extremely cheap.
  • word is only competitive at len=3 (3.10 ns) but regresses badly at len>=8 (10.9+ ns). MSVC doesn't fold the 8-byte memcpy in a loop as well as Clang does on aarch64.
  • avx512 (cascading tail) has high CV (13-17%) at medium lengths, likely AVX-512 frequency throttle penalties on Tiger Lake. The masked variant avoids this by completing faster.
  • avx2 is competitive with memset at medium lengths (3.76 ns at len=32 vs memset's 4.58 ns, -18%) and is the best non-AVX512 option.
  • sse2 tracks memset closely at small lengths but falls behind at len>=128 due to more loop iterations.
  • memset is never the worst but never the fastest -- consistent ~4.5-6.3 ns, as expected from MSVC's memset which calls the CRT for variable-length fills.

Cross-platform summary

| Arch | Best overall | Best for len <= 8 | Best for len >= 64 |
|---|---|---|---|
| aarch64 | word | word (-45%) | all within noise |
| x86_64 (AVX-512) | avx512_mask | avx512_mask (-39%) | avx512_mask (-58%) |
| x86_64 (no AVX-512) | avx2 | word/avx2 | avx2 (-18%) |

The optimal replacement for memset in the dist=1 path depends on the architecture:

  • On aarch64, a word-widening fill is a safe drop-in that never regresses more than ~1 ns and wins 45% at len=3 (at len=8 it is ~0.4 ns slower than memset; the 33% win there belongs to neon).
  • On x86_64 with AVX-512, the masked store variant is a clear winner at every length. This is already available on the AVX-512 chunkset path.
  • On x86_64 without AVX-512, avx2 is the best general replacement: it beats memset at len <= 64 (by up to 18%) and is on par at len=128, but regresses at len=258 (9.53 ns vs 6.29 ns).

TODO

  • Inline these variants inside actual CHUNKMEMSET (not noinline) to measure the realistic hybrid.
  • Profile real inflate workloads to see how much of dist=1 traffic falls in the len <= 8 bucket.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment