Skip to content

Instantly share code, notes, and snippets.

@nmoinvaz
Last active May 8, 2026 07:55
Show Gist options
  • Select an option

  • Save nmoinvaz/d5042bf022e9a54d2ea17c621f8a5d4f to your computer and use it in GitHub Desktop.

Select an option

Save nmoinvaz/d5042bf022e9a54d2ea17c621f8a5d4f to your computer and use it in GitHub Desktop.
zlib-ng: hoist strstart and lookahead in deflate_quick (~12% level 1)
diff --git a/deflate_quick.c b/deflate_quick.c
index 6b84388e..41086525 100644
--- a/deflate_quick.c
+++ b/deflate_quick.c
@@ -49,13 +49,19 @@ Z_INTERNAL block_state deflate_quick(deflate_state *s, int flush) {
unsigned char *window;
unsigned last = (flush == Z_FINISH) ? 1 : 0;
+ /* Hold strstart and lookahead in registers across the inner loop.
+ * Sync to s before fill_window (which mutates them) and quick_*_block
+ * (which read s->strstart for s->block_start). */
+ unsigned int strstart = s->strstart;
+ unsigned int lookahead = s->lookahead;
+
if (UNLIKELY(last && s->block_open != 2)) {
/* Emit end of previous block */
if (quick_end_block(s, 0))
return need_more;
/* Emit start of last block */
quick_start_block(s, last);
- } else if (UNLIKELY(s->block_open == 0 && s->lookahead > 0)) {
+ } else if (UNLIKELY(s->block_open == 0 && lookahead > 0)) {
/* Start new block only when we have lookahead data, so that if no
input data is given an empty block will not be written */
quick_start_block(s, last);
@@ -64,21 +70,25 @@ Z_INTERNAL block_state deflate_quick(deflate_state *s, int flush) {
window = s->window;
for (;;) {
- uint8_t lc;
-
if (UNLIKELY(s->pending + ((BIT_BUF_SIZE + 7) >> 3) >= s->pending_buf_size)) {
PREFIX(flush_pending)(s->strm);
if (s->strm->avail_out == 0) {
+ s->strstart = strstart;
+ s->lookahead = lookahead;
return (last && s->strm->avail_in == 0 && s->bi_valid == 0 && s->block_open == 0) ? finish_started : need_more;
}
}
- if (UNLIKELY(s->lookahead < MIN_LOOKAHEAD)) {
+ if (UNLIKELY(lookahead < MIN_LOOKAHEAD)) {
+ s->strstart = strstart;
+ s->lookahead = lookahead;
PREFIX(fill_window)(s);
- if (UNLIKELY(s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH)) {
+ strstart = s->strstart;
+ lookahead = s->lookahead;
+ if (UNLIKELY(lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH)) {
return need_more;
}
- if (UNLIKELY(s->lookahead == 0))
+ if (UNLIKELY(lookahead == 0))
break;
if (UNLIKELY(s->block_open == 0)) {
@@ -88,44 +98,45 @@ Z_INTERNAL block_state deflate_quick(deflate_state *s, int flush) {
}
}
- if (LIKELY(s->lookahead >= WANT_MIN_MATCH)) {
- uint32_t str_val = Z_U32_FROM_LE(zng_memread_4(window + s->strstart));
- uint32_t hash_head = quick_insert_value(s, s->strstart, str_val);
- int64_t dist = (int64_t)s->strstart - hash_head;
- lc = (uint8_t)str_val;
+ /* Always safe: window has high_water padding past lookahead. */
+ uint32_t scan_val = Z_U32_FROM_LE(zng_memread_4(window + strstart));
+
+ if (LIKELY(lookahead >= WANT_MIN_MATCH)) {
+ uint32_t hash_head = quick_insert_value(s, strstart, scan_val);
+ int64_t dist = (int64_t)strstart - hash_head;
if (dist <= MAX_DIST(s) && dist > 0) {
const uint8_t *match_start = window + hash_head;
uint32_t match_val = Z_U32_FROM_LE(zng_memread_4(match_start));
- if (str_val == match_val) {
- const uint8_t *str_start = window + s->strstart;
- uint32_t match_len = FUNCTABLE_CALL(compare256)(str_start+2, match_start+2) + 2;
+ if (scan_val == match_val) {
+ const uint8_t *scan_start = window + strstart;
+ uint32_t match_len = FUNCTABLE_CALL(compare256)(scan_start+2, match_start+2) + 2;
if (match_len >= WANT_MIN_MATCH) {
- if (UNLIKELY(match_len > s->lookahead))
- match_len = s->lookahead;
+ if (UNLIKELY(match_len > lookahead))
+ match_len = lookahead;
Assert(match_len <= STD_MAX_MATCH, "match too long");
- Assert(s->strstart <= UINT16_MAX, "strstart should fit in uint16_t");
- check_match(s, s->strstart, hash_head, (int)match_len);
+ Assert(strstart <= UINT16_MAX, "strstart should fit in uint16_t");
+ check_match(s, strstart, hash_head, (int)match_len);
zng_tr_emit_dist(s, static_ltree, static_dtree, match_len - STD_MIN_MATCH, (uint32_t)dist);
- s->lookahead -= match_len;
- s->strstart += match_len;
+ lookahead -= match_len;
+ strstart += match_len;
continue;
}
}
}
- } else {
- lc = window[s->strstart];
}
- zng_tr_emit_lit(s, static_ltree, lc);
- s->strstart++;
- s->lookahead--;
+ zng_tr_emit_lit(s, static_ltree, (uint8_t)scan_val);
+ strstart++;
+ lookahead--;
}
- s->insert = s->strstart < (STD_MIN_MATCH - 1) ? s->strstart : (STD_MIN_MATCH - 1);
+ s->strstart = strstart;
+ s->lookahead = lookahead;
+ s->insert = strstart < (STD_MIN_MATCH - 1) ? strstart : (STD_MIN_MATCH - 1);
if (UNLIKELY(last)) {
if (quick_end_block(s, 1))
return finish_started;

zlib-ng: hoist strstart and lookahead in deflate_quick

Background

The level-1 deflate strategy deflate_quick runs a tight per-byte loop that reads and writes s->strstart and s->lookahead on every iteration:

  • the lookahead-bound check at the top of the loop (s->lookahead < MIN_LOOKAHEAD, s->lookahead >= WANT_MIN_MATCH)
  • the literal/match fast path reads window + s->strstart, passes s->strstart to quick_insert_value, and accesses s->lookahead for the match cap
  • on a literal: s->strstart++; s->lookahead--
  • on a match: s->lookahead -= match_len; s->strstart += match_len

These are simple unsigned ints inside the deflate state struct, but on AArch64 with Apple clang they get reloaded from / stored to memory on every iteration of the inner loop. Aliasing analysis can't prove that the writes the loop performs through other pointers (notably the s->pending_buf[] writes done by the inlined zng_tr_emit_* helpers) don't touch the addresses of these fields, so the compiler keeps them addressable.

Fix

Lift strstart and lookahead to local lvalues at function entry. Use the locals throughout the inner loop. Sync back to s before the two callouts that observe / mutate them:

  • fill_window (mutates s->strstart and s->lookahead when sliding the window) — sync, call, reload.
  • quick_start_block / quick_end_block (read s->strstart to set s->block_start) — sync before, no reload needed (those helpers don't modify the locals).

flush_pending doesn't touch either field, so the only sync needed around that call is for the early-return path (where we have to commit local state before exiting).

The locals never have their address taken, which means the compiler has a hard guarantee no other pointer in scope can touch them. They can live in registers across the entire inner loop.

Further cleanup: unify the per-iteration scan_val read

Originally the inner loop had two separate code paths for setting the literal byte lc:

if (LIKELY(s->lookahead >= WANT_MIN_MATCH)) {
    uint32_t str_val = Z_U32_FROM_LE(zng_memread_4(window + s->strstart));
    ...
    lc = (uint8_t)str_val;
    if (match path) { emit_dist; continue; }
} else {
    lc = window[s->strstart];   // dedicated 1-byte load
}
zng_tr_emit_lit(s, static_ltree, lc);

The two assignments compute the same byte. zlib-ng's window has high_water padding past lookahead, so a 4-byte read at window + strstart is always safe regardless of how small lookahead is. That lets the load be hoisted above the if and the lc variable retired entirely:

uint32_t scan_val = Z_U32_FROM_LE(zng_memread_4(window + strstart));

if (LIKELY(lookahead >= WANT_MIN_MATCH)) {
    ...
    if (match path) { emit_dist; continue; }
}
zng_tr_emit_lit(s, static_ltree, (uint8_t)scan_val);

The variable rename str_valscan_val (and str_startscan_start elsewhere in the body) parallels the existing match_val / match_start pair on the candidate side of the match comparison.

The inner loop now does a single 4-byte load of scan_val and reuses it for: hash multiplier input, scan_val == match_val equality check, and the final (uint8_t)scan_val for the literal-emit byte. The dedicated ldrb that the prior lc = window[s->strstart] path emitted is gone — the rare lookahead < 4 path now falls through into the same and w_, w_, #0xff the common path uses.

Common-path codegen is unchanged. Function size shrinks by one branch target. C is one variable (and one branch arm) lighter.

Performance

Apple M5, macOS Darwin 25.4.0 arm64, Apple clang 17, Release build, Google Benchmark, 10 reps, aggregates only.

benchmark Δ time
level 1 / 131072 −13.44%
level 1 / 1048576 −12.69%

Compression ratios bit-identical (Δ ratio = 0.00, Δ bytes = 0) at both sizes. Levels 3/6/9 are unaffected because they do not use deflate_quick.

Files

  • deflate_quick.patch — the change.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment