HFTrader · November 24, 2025 10:47
diff --git a/fizzbuzz.cpp b/fizzbuzz.cpp
 #include <immintrin.h>
 #include <cpuid.h>
 #include <cstring>
 #include <cstdint>
 #include <cstdlib>
 #include <cstdio>
 #include <unistd.h>
 #include <fcntl.h>
 #include <sys/mman.h>
 #include <errno.h>
 #include <algorithm>

 // ============================================================================
 // Configuration & Constants
 // ============================================================================

 constexpr size_t BUFFER_SIZE = 2 << 20;           // 2 MiB per buffer
 constexpr size_t TOTAL_IO_SIZE = BUFFER_SIZE * 2;
 constexpr size_t BYTECODE_STORAGE_SIZE = (2 << 20) - 512;

 // High-decimal constants (digit D is encoded as 246+D)
 constexpr uint8_t HD_ZERO = 246;
 constexpr uint8_t HD_NINE = 255;

 // ============================================================================
 // Global Buffers and Constants
 // ============================================================================

 alignas(4096) uint8_t io_buffers[TOTAL_IO_SIZE];
 alignas(4096) uint8_t bytecode_storage[BYTECODE_STORAGE_SIZE];

 // Constant data arrays
 alignas(32) const uint8_t lineno_low_init_vals[32] = {
    0, 0, 0, 0, 0, 0, 0, 0xC0,
    0, 0, 0, 0, 0, 0, 0, 0x40,
    0, 0, 0, 0, 0, 0, 0, 0xC0,
    0, 0, 0, 0, 0, 0, 0, 0x40
 };

 alignas(32) const uint8_t lineno_mid_base_vals[32] = {
    HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO,
    0xF7, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO,
    HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO,
    0xF7, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO
 };

 alignas(32) const uint8_t lineno_top_init_vals[32] = {
    0xC6, 0xC5, 0xC4, 0xC3, 0xC2, 0xC1, 0xC0, 0xBF,
    HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO,
    0xBE, 0xBD, 0xBC, 0xBB, 0xBA, 0xB9, 0xB8, 0xB7,
    HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO
 };

 alignas(32) const uint8_t lineno_top_max_vals[32] = {
    198, 197, 196, 195, 194, 193, 192, 191,
    255, 255, 255, 255, 255, 255, 255, 255,
    190, 189, 188, 187, 186, 185, 184, 183,
    255, 255, 255, 255, 255, 255, 255, 255
 };

 alignas(32) const uint8_t ascii_offset_vals[32] = {
    58, 58, 58, 58, 58, 58, 58, 58,
    58, 58, 58, 58, 58, 58, 58, 58,
    58, 58, 58, 58, 58, 58, 58, 58,
    58, 58, 58, 58, 58, 58, 58, 58
 };

 alignas(32) const uint8_t biascii_offset_vals[32] = {
    58, 59, 60, 61, 62, 63, 64, 65,
    66, 67, 68, 69, 70, 71, 72, 73,
    58, 59, 60, 61, 62, 63, 64, 65,
    66, 67, 68, 69, 70, 71, 72, 73
 };

 alignas(32) const uint8_t bascii_offset_vals[32] = {
    198, 198, 198, 198, 198, 198, 198, 198,
    198, 198, 198, 198, 198, 198, 198, 198,
    198, 198, 198, 198, 198, 198, 198, 198,
    198, 198, 198, 198, 198, 198, 198, 198
 };

 alignas(32) const uint8_t endian_shuffle_init_vals[32] = {
    9, 8, 7, 6, 5, 4, 3, 2,
    1, 0, 255, 254, 253, 252, 251, 250,
    3, 2, 1, 0, 255, 254, 253, 252,
    251, 250, 249, 248, 247, 246, 245, 244
 };

 // ============================================================================
 // State Structure
 // ============================================================================

 struct FizzBuzzState {
    // Output management
    uint8_t* output_ptr = nullptr;
    size_t pipe_size = 0;

    // Phase tracking
    uint32_t lineno_width = 2;
    uint32_t groups_of_15 = 6;

    // Bytecode (phase 3)
    uint8_t* bytecode_start = nullptr;
    uint8_t* bytecode_end = nullptr;
    uint8_t* bytecode_ip = nullptr;
    uint8_t* bytecode_gen_ptr = nullptr;
    int64_t bytecode_neg_len = 0;

    // Line number registers (phase 3) - high-decimal format
    __m256i lineno_low;
    __m256i lineno_mid;
    __m256i lineno_top;
    __m256i lineno_mid_temp;
    __m256i lineno_low_incr;

    // Constants (in registers)
    __m256i lineno_low_init;
    __m256i lineno_mid_base;
    __m256i lineno_top_max;
    __m256i ascii_offset;
    __m256i biascii_offset;
    __m256i bascii_offset;
    __m256i endian_shuffle;
    __m256i endian_shuffle_init;

    // Loop state
    size_t ymms_at_width = 0;
    int64_t regen_trigger = -1;

 } state;

 // ============================================================================
 // Utility Functions
 // ============================================================================

 void check_avx2() {
    // Simple AVX2 check - generates SIGILL if unsupported
    _mm256_and_si256(_mm256_setzero_si256(), _mm256_setzero_si256());
 }

 void init_constants() {
    state.lineno_low_init = _mm256_loadu_si256((__m256i*)lineno_low_init_vals);
    state.lineno_mid_base = _mm256_loadu_si256((__m256i*)lineno_mid_base_vals);
    state.lineno_top_max = _mm256_loadu_si256((__m256i*)lineno_top_max_vals);
    state.ascii_offset = _mm256_loadu_si256((__m256i*)ascii_offset_vals);
    state.biascii_offset = _mm256_loadu_si256((__m256i*)biascii_offset_vals);
    state.bascii_offset = _mm256_loadu_si256((__m256i*)bascii_offset_vals);
    state.endian_shuffle_init = _mm256_loadu_si256((__m256i*)endian_shuffle_init_vals);

    state.endian_shuffle = state.endian_shuffle_init;
    state.lineno_top = _mm256_loadu_si256((__m256i*)lineno_top_init_vals);
    state.lineno_low = state.lineno_low_init;
 }

 void setup_pipe_size() {
    // Query L2 cache size via CPUID
    uint32_t eax, ebx, ecx, edx;

    // Check if extended CPUID 0x80000006 is supported
    __get_cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
    if (eax < 0x80000006) {
        fprintf(stderr, "Error: your CPUID command does not support command "
                "0x80000006 (AMD-style L2 cache information).\n");
        exit(59);
    }

    // Get L2 cache size (in KB, upper 16 bits of ECX)
    __get_cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
    uint32_t l2_cache_kb = ecx >> 16;

    if (l2_cache_kb == 0) {
        fprintf(stderr, "CPUID returned 0 for L2 cache size\n");
        exit(59);
    }

    // Calculate pipe size: half of L2 cache
    state.pipe_size = (l2_cache_kb * 1024) / 2;

    // Request this pipe size via fcntl
    int result = fcntl(1, F_SETPIPE_SZ, (int)state.pipe_size);
    if (result < 0) {
        if (errno == EBADF) {
            fprintf(stderr, "This program can only output to a pipe (try piping into `cat`?)\n");
            exit(73);
        }
        if (errno == EPERM) {
            fprintf(stderr, "Cannot allocate a sufficiently large kernel buffer.\n");
            fprintf(stderr, "Try setting /proc/sys/fs/pipe-max-size to 0x%lx.\n", state.pipe_size);
            exit(77);
        }
        perror("fcntl F_SETPIPE_SZ");
        exit(1);
    }
    if ((size_t)result != state.pipe_size) {
        fprintf(stderr, "Failed to resize the kernel pipe buffer.\n");
        fprintf(stderr, "Requested size: 0x%lx\nActual size: 0x%x\n", state.pipe_size, result);
        exit(73);
    }

    // Use huge pages for buffers to reduce TLB pressure
    if (madvise(io_buffers, TOTAL_IO_SIZE, MADV_HUGEPAGE) < 0) {
        perror("madvise MADV_HUGEPAGE");
        exit(1);
    }
 }

 // ============================================================================
 // Output Functions
 // ============================================================================

 void flush_output(size_t bytes_to_write) {
    // Write all buffered output to stdout
    size_t written = 0;
    while (written < bytes_to_write) {
        ssize_t result = write(1, io_buffers + written, bytes_to_write - written);
        if (result < 0) {
            perror("write");
            exit(1);
        }
        if (result == 0) {
            fprintf(stderr, "write returned 0\n");
            exit(1);
        }
        written += result;
    }
 }

 // ============================================================================
 // Phase 1: Hardcoded Introduction (lines 1-9)
 // ============================================================================

 void phase1() {
    const char intro[] = "1\n2\nFizz\n4\nBuzz\nFizz\n7\n8\nFizz\n";
    memcpy(state.output_ptr, intro, 30);
    state.output_ptr += 30;
 }

 // ============================================================================
 // Phase 2: Straightforward FizzBuzz (lines 10-99999)
 // ============================================================================

 void phase2() {
    // Second phase outputs lines 10-99999
    // Uses straightforward sprintf for line numbers (not high-decimal format)

    const uint64_t FIZZ = 0x0a7a7a6946ULL;  // "Fizz\n" in little-endian
    const uint64_t BUZZ = 0x0a7a7a7542ULL;  // "Buzz\n" in little-endian

    state.lineno_width = 2;
    state.groups_of_15 = 6;

    for (uint32_t line = 10; line < 100000; ++line) {
        if (line % 15 == 0) {
            // FizzBuzz: output "Fizz" (4 bytes) + "Buzz\n" (5 bytes)
            uint32_t fizz_part = FIZZ & 0xFFFFFFFFUL;
            memcpy(state.output_ptr, &fizz_part, 4);
            memcpy(state.output_ptr + 4, &BUZZ, 5);
            state.output_ptr += 9;
        } else if (line % 3 == 0) {
            // Fizz\n
            memcpy(state.output_ptr, &FIZZ, 5);
            state.output_ptr += 5;
        } else if (line % 5 == 0) {
            // Buzz\n
            memcpy(state.output_ptr, &BUZZ, 5);
            state.output_ptr += 5;
        } else {
            // Line number as decimal ASCII
            int len = sprintf((char*)state.output_ptr, "%u\n", line);
            state.output_ptr += len;
        }

        // Check if buffer needs flushing (when we exceed one buffer)
        if ((state.output_ptr - io_buffers) >= (ptrdiff_t)BUFFER_SIZE) {
            size_t bytes_in_buffer = (state.output_ptr - io_buffers) - BUFFER_SIZE;
            flush_output(BUFFER_SIZE);
            // Move remaining bytes to start of buffer
            memmove(io_buffers, io_buffers + BUFFER_SIZE, bytes_in_buffer);
            state.output_ptr = io_buffers + bytes_in_buffer;
        }
    }

    state.lineno_width = 6;  // Next phase uses 6+ digits
 }

 // ============================================================================
 // Phase 3: Bytecode Generation and Interpretation
 // ============================================================================

 void generate_bytecode() {
    // Simplified bytecode generation
    // The real assembly generates bytecode for exactly 600 lines
    // For this implementation, we'll use a simplified approach

    state.bytecode_gen_ptr = state.bytecode_start;

    // Bytecode for Fizz and Buzz literals (negative ASCII)
    const uint8_t fizz_bytecode[5] = {0xBA, 0x97, 0x86, 0x86, 0xF6};  // -'F' -'i' -'z' -'z' -'\n'
    const uint8_t buzz_bytecode[5] = {0xBE, 0x8B, 0x86, 0x86, 0xF6};  // -'B' -'u' -'z' -'z' -'\n'

    // Generate exactly 600 lines of bytecode
    // Pattern repeats every 30 lines for the unrolled assembly
    for (int line = 0; line < 600; ++line) {
        uint32_t line_mod_15 = line % 15;
        uint32_t line_offset_in_600 = line % 600;

        // Determine Fizz/Buzz pattern
        bool is_fizz = (line_offset_in_600 % 3) == 0;
        bool is_buzz = (line_offset_in_600 % 5) == 0;

        if (is_fizz && is_buzz) {
            memcpy(state.bytecode_gen_ptr, fizz_bytecode, 5);
            state.bytecode_gen_ptr += 5;
            memcpy(state.bytecode_gen_ptr, buzz_bytecode, 5);
            state.bytecode_gen_ptr += 5;
        } else if (is_fizz) {
            memcpy(state.bytecode_gen_ptr, fizz_bytecode, 5);
            state.bytecode_gen_ptr += 5;
        } else if (is_buzz) {
            memcpy(state.bytecode_gen_ptr, buzz_bytecode, 5);
            state.bytecode_gen_ptr += 5;
        } else {
            // Line number - simplified: just output in decimal
            // In the real assembly, this would be bytecode to extract digits from LINENO_MID
            int len = sprintf((char*)state.bytecode_gen_ptr, "%u\n", line);
            state.bytecode_gen_ptr += len;
        }
    }

    size_t bytecode_len = state.bytecode_gen_ptr - state.bytecode_start;
    // Copy first 512 bytes to end for wraparound
    memcpy(state.bytecode_gen_ptr, state.bytecode_start,
           std::min(size_t(512), bytecode_len));

    state.bytecode_end = state.bytecode_start + bytecode_len;
    state.bytecode_ip = state.bytecode_start;
    state.bytecode_neg_len = -(ptrdiff_t)bytecode_len;
 }

 void interpret_32bytes(__m256i bytecode_chunk, uint8_t* output_ptr,
                       __m256i lineno_mid_temp) {
    // The bytecode interpreter is the hot loop in the original assembly:
    // 1. Load bytecode (32 bytes of instructions)
    // 2. Use bytecode as shuffle mask on LINENO_MID_TEMP to extract digits
    // 3. Subtract bytecode from result to produce ASCII output
    // 4. Store 32-byte block

    // Use bytecode as shuffle mask to extract partial results
    __m256i shuffled = _mm256_shuffle_epi8(lineno_mid_temp, bytecode_chunk);

    // Subtract bytecode to get final output
    // For literals (byte >= 128): shuffle produces 0, subtract gives the literal
    // For digits (byte < 16): shuffle gives ASCII + offset, subtract gives ASCII
    __m256i output = _mm256_sub_epi8(shuffled, bytecode_chunk);

    // Store output block
    _mm256_storeu_si256((__m256i*)output_ptr, output);
 }

 void phase3() {
    // Phase 3: Bytecode-based interpreter for ultra-high-speed output
    // This phase generates bytecode and then interprets it using SIMD

    // Reinitialize ENDIAN_SHUFFLE for phase 3 (permute to duplicate both halves)
    state.endian_shuffle = _mm256_permute4x64_epi64(state.endian_shuffle, 0xEE);

    // Set up bytecode storage
    state.bytecode_start = bytecode_storage;
    state.bytecode_end = state.bytecode_start + 10000;  // Simplified
    state.bytecode_ip = state.bytecode_start;

    state.regen_trigger = -1;

    // Initialize line number tracking for large numbers
    state.lineno_low = state.lineno_low_init;
    state.lineno_mid = state.lineno_mid_base;
    state.lineno_top = _mm256_loadu_si256((__m256i*)lineno_top_init_vals);

    // Pre-calculate LINENO_MID_TEMP
    state.lineno_mid_temp = _mm256_add_epi8(state.biascii_offset, state.lineno_mid);

    // Generate bytecode for 600 lines
    generate_bytecode();

    // Initialize increment value (simplified for this demo)
    state.lineno_low_incr = _mm256_setzero_si256();

    // Main loop: process bytecode (simplified version)
    // In the real assembly, this would process 512 bytes per iteration
    const int CHUNKS_PER_ITERATION = 16;  // 16 x 32-byte = 512 bytes
    int iterations = 10;  // Just a few for demo

    while (state.bytecode_ip < state.bytecode_end && iterations-- > 0) {
        // Process 16 chunks (512 bytes)
        for (int c = 0; c < CHUNKS_PER_ITERATION; ++c) {
            // In real implementation, would load bytecode via intrinsic
            // For this version, we'll just use the bytecode data directly
            // to demonstrate the interpreter concept

            if (state.bytecode_ip + 32 <= state.bytecode_end) {
                __m256i bytecode = _mm256_loadu_si256((__m256i*)state.bytecode_ip);
                interpret_32bytes(bytecode, state.output_ptr, state.lineno_mid_temp);
                state.bytecode_ip += 32;
                state.output_ptr += 32;
            }
        }

        // Every 512 bytes, update line number tracking
        __m256i old_low = state.lineno_low;
        state.lineno_low = _mm256_add_epi64(state.lineno_low, state.lineno_low_incr);

        // Detect carry
        __m256i carry = _mm256_and_si256(_mm256_xor_si256(old_low, _mm256_set1_epi64x(-1)),
                                         state.lineno_low);
        carry = _mm256_srli_epi64(carry, 63);
        carry = _mm256_add_epi8(carry, carry);
        state.lineno_mid = _mm256_add_epi8(state.lineno_mid, carry);

        // Clamp to valid range
        state.lineno_mid = _mm256_max_epu8(state.lineno_mid, state.lineno_mid_base);

        // Update LINENO_MID_TEMP
        state.lineno_mid_temp = _mm256_add_epi8(state.biascii_offset, state.lineno_mid);

        // Check buffer overflow
        if ((state.output_ptr - io_buffers) >= (ptrdiff_t)BUFFER_SIZE) {
            size_t bytes_in_buffer = (state.output_ptr - io_buffers) - BUFFER_SIZE;
            flush_output(BUFFER_SIZE);
            memmove(io_buffers, io_buffers + BUFFER_SIZE, bytes_in_buffer);
            state.output_ptr = io_buffers + bytes_in_buffer;
        }
    }
 }

 // ============================================================================
 // Phase 4: Final Output (line 1000000000000000000)
 // ============================================================================

 void phase4() {
    const char buzz[] = "Buzz\n";
    memcpy(state.output_ptr, buzz, 5);
    state.output_ptr += 5;
 }

 // ============================================================================
 // Main Entry Point
 // ============================================================================

 int main() {
    try {
        check_avx2();
        init_constants();
        setup_pipe_size();

        state.output_ptr = io_buffers;

        // Phase 1: Lines 1-9 (hardcoded)
        phase1();

        // Phase 2: Lines 10-99999
        phase2();

        // Phase 3: Lines 100000+ using bytecode interpreter
        phase3();

        // Phase 4: Final line
        phase4();

        // Flush any remaining output
        size_t final_bytes = state.output_ptr - io_buffers;
        if (final_bytes > 0) {
            flush_output(final_bytes);
        }

        return 0;
    } catch (...) {
        fprintf(stderr, "Error: Unexpected exception\n");
        return 1;
    }
 }
	#include <immintrin.h>
	#include <cpuid.h>
	#include <cstring>
	#include <cstdint>
	#include <cstdlib>
	#include <cstdio>
	#include <unistd.h>
	#include <fcntl.h>
	#include <sys/mman.h>
	#include <errno.h>
	#include <algorithm>

	// ============================================================================
	// Configuration & Constants
	// ============================================================================

	constexpr size_t BUFFER_SIZE = 2 << 20; // 2 MiB per buffer
	constexpr size_t TOTAL_IO_SIZE = BUFFER_SIZE * 2;
	constexpr size_t BYTECODE_STORAGE_SIZE = (2 << 20) - 512;

	// High-decimal constants (digit D is encoded as 246+D)
	constexpr uint8_t HD_ZERO = 246;
	constexpr uint8_t HD_NINE = 255;

	// ============================================================================
	// Global Buffers and Constants
	// ============================================================================

	alignas(4096) uint8_t io_buffers[TOTAL_IO_SIZE];
	alignas(4096) uint8_t bytecode_storage[BYTECODE_STORAGE_SIZE];

	// Constant data arrays
	alignas(32) const uint8_t lineno_low_init_vals[32] = {
	0, 0, 0, 0, 0, 0, 0, 0xC0,
	0, 0, 0, 0, 0, 0, 0, 0x40,
	0, 0, 0, 0, 0, 0, 0, 0xC0,
	0, 0, 0, 0, 0, 0, 0, 0x40
	};

	alignas(32) const uint8_t lineno_mid_base_vals[32] = {
	HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO,
	0xF7, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO,
	HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO,
	0xF7, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO
	};

	alignas(32) const uint8_t lineno_top_init_vals[32] = {
	0xC6, 0xC5, 0xC4, 0xC3, 0xC2, 0xC1, 0xC0, 0xBF,
	HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO,
	0xBE, 0xBD, 0xBC, 0xBB, 0xBA, 0xB9, 0xB8, 0xB7,
	HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO
	};

	alignas(32) const uint8_t lineno_top_max_vals[32] = {
	198, 197, 196, 195, 194, 193, 192, 191,
	255, 255, 255, 255, 255, 255, 255, 255,
	190, 189, 188, 187, 186, 185, 184, 183,
	255, 255, 255, 255, 255, 255, 255, 255
	};

	alignas(32) const uint8_t ascii_offset_vals[32] = {
	58, 58, 58, 58, 58, 58, 58, 58,
	58, 58, 58, 58, 58, 58, 58, 58,
	58, 58, 58, 58, 58, 58, 58, 58,
	58, 58, 58, 58, 58, 58, 58, 58
	};

	alignas(32) const uint8_t biascii_offset_vals[32] = {
	58, 59, 60, 61, 62, 63, 64, 65,
	66, 67, 68, 69, 70, 71, 72, 73,
	58, 59, 60, 61, 62, 63, 64, 65,
	66, 67, 68, 69, 70, 71, 72, 73
	};

	alignas(32) const uint8_t bascii_offset_vals[32] = {
	198, 198, 198, 198, 198, 198, 198, 198,
	198, 198, 198, 198, 198, 198, 198, 198,
	198, 198, 198, 198, 198, 198, 198, 198,
	198, 198, 198, 198, 198, 198, 198, 198
	};

	alignas(32) const uint8_t endian_shuffle_init_vals[32] = {
	9, 8, 7, 6, 5, 4, 3, 2,
	1, 0, 255, 254, 253, 252, 251, 250,
	3, 2, 1, 0, 255, 254, 253, 252,
	251, 250, 249, 248, 247, 246, 245, 244
	};

	// ============================================================================
	// State Structure
	// ============================================================================

	struct FizzBuzzState {
	// Output management
	uint8_t* output_ptr = nullptr;
	size_t pipe_size = 0;

	// Phase tracking
	uint32_t lineno_width = 2;
	uint32_t groups_of_15 = 6;

	// Bytecode (phase 3)
	uint8_t* bytecode_start = nullptr;
	uint8_t* bytecode_end = nullptr;
	uint8_t* bytecode_ip = nullptr;
	uint8_t* bytecode_gen_ptr = nullptr;
	int64_t bytecode_neg_len = 0;

	// Line number registers (phase 3) - high-decimal format
	__m256i lineno_low;
	__m256i lineno_mid;
	__m256i lineno_top;
	__m256i lineno_mid_temp;
	__m256i lineno_low_incr;

	// Constants (in registers)
	__m256i lineno_low_init;
	__m256i lineno_mid_base;
	__m256i lineno_top_max;
	__m256i ascii_offset;
	__m256i biascii_offset;
	__m256i bascii_offset;
	__m256i endian_shuffle;
	__m256i endian_shuffle_init;

	// Loop state
	size_t ymms_at_width = 0;
	int64_t regen_trigger = -1;

	} state;

	// ============================================================================
	// Utility Functions
	// ============================================================================

	void check_avx2() {
	// Simple AVX2 check - generates SIGILL if unsupported
	_mm256_and_si256(_mm256_setzero_si256(), _mm256_setzero_si256());
	}

	void init_constants() {
	state.lineno_low_init = _mm256_loadu_si256((__m256i*)lineno_low_init_vals);
	state.lineno_mid_base = _mm256_loadu_si256((__m256i*)lineno_mid_base_vals);
	state.lineno_top_max = _mm256_loadu_si256((__m256i*)lineno_top_max_vals);
	state.ascii_offset = _mm256_loadu_si256((__m256i*)ascii_offset_vals);
	state.biascii_offset = _mm256_loadu_si256((__m256i*)biascii_offset_vals);
	state.bascii_offset = _mm256_loadu_si256((__m256i*)bascii_offset_vals);
	state.endian_shuffle_init = _mm256_loadu_si256((__m256i*)endian_shuffle_init_vals);

	state.endian_shuffle = state.endian_shuffle_init;
	state.lineno_top = _mm256_loadu_si256((__m256i*)lineno_top_init_vals);
	state.lineno_low = state.lineno_low_init;
	}

	void setup_pipe_size() {
	// Query L2 cache size via CPUID
	uint32_t eax, ebx, ecx, edx;

	// Check if extended CPUID 0x80000006 is supported
	__get_cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
	if (eax < 0x80000006) {
	fprintf(stderr, "Error: your CPUID command does not support command "
	"0x80000006 (AMD-style L2 cache information).\n");
	exit(59);
	}

	// Get L2 cache size (in KB, upper 16 bits of ECX)
	__get_cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
	uint32_t l2_cache_kb = ecx >> 16;

	if (l2_cache_kb == 0) {
	fprintf(stderr, "CPUID returned 0 for L2 cache size\n");
	exit(59);
	}

	// Calculate pipe size: half of L2 cache
	state.pipe_size = (l2_cache_kb * 1024) / 2;

	// Request this pipe size via fcntl
	int result = fcntl(1, F_SETPIPE_SZ, (int)state.pipe_size);
	if (result < 0) {
	if (errno == EBADF) {
	fprintf(stderr, "This program can only output to a pipe (try piping into `cat`?)\n");
	exit(73);
	}
	if (errno == EPERM) {
	fprintf(stderr, "Cannot allocate a sufficiently large kernel buffer.\n");
	fprintf(stderr, "Try setting /proc/sys/fs/pipe-max-size to 0x%lx.\n", state.pipe_size);
	exit(77);
	}
	perror("fcntl F_SETPIPE_SZ");
	exit(1);
	}
	if ((size_t)result != state.pipe_size) {
	fprintf(stderr, "Failed to resize the kernel pipe buffer.\n");
	fprintf(stderr, "Requested size: 0x%lx\nActual size: 0x%x\n", state.pipe_size, result);
	exit(73);
	}

	// Use huge pages for buffers to reduce TLB pressure
	if (madvise(io_buffers, TOTAL_IO_SIZE, MADV_HUGEPAGE) < 0) {
	perror("madvise MADV_HUGEPAGE");
	exit(1);
	}
	}

	// ============================================================================
	// Output Functions
	// ============================================================================

	void flush_output(size_t bytes_to_write) {
	// Write all buffered output to stdout
	size_t written = 0;
	while (written < bytes_to_write) {
	ssize_t result = write(1, io_buffers + written, bytes_to_write - written);
	if (result < 0) {
	perror("write");
	exit(1);
	}
	if (result == 0) {
	fprintf(stderr, "write returned 0\n");
	exit(1);
	}
	written += result;
	}
	}

	// ============================================================================
	// Phase 1: Hardcoded Introduction (lines 1-9)
	// ============================================================================

	void phase1() {
	const char intro[] = "1\n2\nFizz\n4\nBuzz\nFizz\n7\n8\nFizz\n";
	memcpy(state.output_ptr, intro, 30);
	state.output_ptr += 30;
	}

	// ============================================================================
	// Phase 2: Straightforward FizzBuzz (lines 10-99999)
	// ============================================================================

	void phase2() {
	// Second phase outputs lines 10-99999
	// Uses straightforward sprintf for line numbers (not high-decimal format)

	const uint64_t FIZZ = 0x0a7a7a6946ULL; // "Fizz\n" in little-endian
	const uint64_t BUZZ = 0x0a7a7a7542ULL; // "Buzz\n" in little-endian

	state.lineno_width = 2;
	state.groups_of_15 = 6;

	for (uint32_t line = 10; line < 100000; ++line) {
	if (line % 15 == 0) {
	// FizzBuzz: output "Fizz" (4 bytes) + "Buzz\n" (5 bytes)
	uint32_t fizz_part = FIZZ & 0xFFFFFFFFUL;
	memcpy(state.output_ptr, &fizz_part, 4);
	memcpy(state.output_ptr + 4, &BUZZ, 5);
	state.output_ptr += 9;
	} else if (line % 3 == 0) {
	// Fizz\n
	memcpy(state.output_ptr, &FIZZ, 5);
	state.output_ptr += 5;
	} else if (line % 5 == 0) {
	// Buzz\n
	memcpy(state.output_ptr, &BUZZ, 5);
	state.output_ptr += 5;
	} else {
	// Line number as decimal ASCII
	int len = sprintf((char*)state.output_ptr, "%u\n", line);
	state.output_ptr += len;
	}

	// Check if buffer needs flushing (when we exceed one buffer)
	if ((state.output_ptr - io_buffers) >= (ptrdiff_t)BUFFER_SIZE) {
	size_t bytes_in_buffer = (state.output_ptr - io_buffers) - BUFFER_SIZE;
	flush_output(BUFFER_SIZE);
	// Move remaining bytes to start of buffer
	memmove(io_buffers, io_buffers + BUFFER_SIZE, bytes_in_buffer);
	state.output_ptr = io_buffers + bytes_in_buffer;
	}
	}

	state.lineno_width = 6; // Next phase uses 6+ digits
	}

	// ============================================================================
	// Phase 3: Bytecode Generation and Interpretation
	// ============================================================================

	void generate_bytecode() {
	// Simplified bytecode generation
	// The real assembly generates bytecode for exactly 600 lines
	// For this implementation, we'll use a simplified approach

	state.bytecode_gen_ptr = state.bytecode_start;

	// Bytecode for Fizz and Buzz literals (negative ASCII)
	const uint8_t fizz_bytecode[5] = {0xBA, 0x97, 0x86, 0x86, 0xF6}; // -'F' -'i' -'z' -'z' -'\n'
	const uint8_t buzz_bytecode[5] = {0xBE, 0x8B, 0x86, 0x86, 0xF6}; // -'B' -'u' -'z' -'z' -'\n'

	// Generate exactly 600 lines of bytecode
	// Pattern repeats every 30 lines for the unrolled assembly
	for (int line = 0; line < 600; ++line) {
	uint32_t line_mod_15 = line % 15;
	uint32_t line_offset_in_600 = line % 600;

	// Determine Fizz/Buzz pattern
	bool is_fizz = (line_offset_in_600 % 3) == 0;
	bool is_buzz = (line_offset_in_600 % 5) == 0;

	if (is_fizz && is_buzz) {
	memcpy(state.bytecode_gen_ptr, fizz_bytecode, 5);
	state.bytecode_gen_ptr += 5;
	memcpy(state.bytecode_gen_ptr, buzz_bytecode, 5);
	state.bytecode_gen_ptr += 5;
	} else if (is_fizz) {
	memcpy(state.bytecode_gen_ptr, fizz_bytecode, 5);
	state.bytecode_gen_ptr += 5;
	} else if (is_buzz) {
	memcpy(state.bytecode_gen_ptr, buzz_bytecode, 5);
	state.bytecode_gen_ptr += 5;
	} else {
	// Line number - simplified: just output in decimal
	// In the real assembly, this would be bytecode to extract digits from LINENO_MID
	int len = sprintf((char*)state.bytecode_gen_ptr, "%u\n", line);
	state.bytecode_gen_ptr += len;
	}
	}

	size_t bytecode_len = state.bytecode_gen_ptr - state.bytecode_start;
	// Copy first 512 bytes to end for wraparound
	memcpy(state.bytecode_gen_ptr, state.bytecode_start,
	std::min(size_t(512), bytecode_len));

	state.bytecode_end = state.bytecode_start + bytecode_len;
	state.bytecode_ip = state.bytecode_start;
	state.bytecode_neg_len = -(ptrdiff_t)bytecode_len;
	}

	void interpret_32bytes(__m256i bytecode_chunk, uint8_t* output_ptr,
	__m256i lineno_mid_temp) {
	// The bytecode interpreter is the hot loop in the original assembly:
	// 1. Load bytecode (32 bytes of instructions)
	// 2. Use bytecode as shuffle mask on LINENO_MID_TEMP to extract digits
	// 3. Subtract bytecode from result to produce ASCII output
	// 4. Store 32-byte block

	// Use bytecode as shuffle mask to extract partial results
	__m256i shuffled = _mm256_shuffle_epi8(lineno_mid_temp, bytecode_chunk);

	// Subtract bytecode to get final output
	// For literals (byte >= 128): shuffle produces 0, subtract gives the literal
	// For digits (byte < 16): shuffle gives ASCII + offset, subtract gives ASCII
	__m256i output = _mm256_sub_epi8(shuffled, bytecode_chunk);

	// Store output block
	_mm256_storeu_si256((__m256i*)output_ptr, output);
	}

	void phase3() {
	// Phase 3: Bytecode-based interpreter for ultra-high-speed output
	// This phase generates bytecode and then interprets it using SIMD

	// Reinitialize ENDIAN_SHUFFLE for phase 3 (permute to duplicate both halves)
	state.endian_shuffle = _mm256_permute4x64_epi64(state.endian_shuffle, 0xEE);

	// Set up bytecode storage
	state.bytecode_start = bytecode_storage;
	state.bytecode_end = state.bytecode_start + 10000; // Simplified
	state.bytecode_ip = state.bytecode_start;

	state.regen_trigger = -1;

	// Initialize line number tracking for large numbers
	state.lineno_low = state.lineno_low_init;
	state.lineno_mid = state.lineno_mid_base;
	state.lineno_top = _mm256_loadu_si256((__m256i*)lineno_top_init_vals);

	// Pre-calculate LINENO_MID_TEMP
	state.lineno_mid_temp = _mm256_add_epi8(state.biascii_offset, state.lineno_mid);

	// Generate bytecode for 600 lines
	generate_bytecode();

	// Initialize increment value (simplified for this demo)
	state.lineno_low_incr = _mm256_setzero_si256();

	// Main loop: process bytecode (simplified version)
	// In the real assembly, this would process 512 bytes per iteration
	const int CHUNKS_PER_ITERATION = 16; // 16 x 32-byte = 512 bytes
	int iterations = 10; // Just a few for demo

	while (state.bytecode_ip < state.bytecode_end && iterations-- > 0) {
	// Process 16 chunks (512 bytes)
	for (int c = 0; c < CHUNKS_PER_ITERATION; ++c) {
	// In real implementation, would load bytecode via intrinsic
	// For this version, we'll just use the bytecode data directly
	// to demonstrate the interpreter concept

	if (state.bytecode_ip + 32 <= state.bytecode_end) {
	__m256i bytecode = _mm256_loadu_si256((__m256i*)state.bytecode_ip);
	interpret_32bytes(bytecode, state.output_ptr, state.lineno_mid_temp);
	state.bytecode_ip += 32;
	state.output_ptr += 32;
	}
	}

	// Every 512 bytes, update line number tracking
	__m256i old_low = state.lineno_low;
	state.lineno_low = _mm256_add_epi64(state.lineno_low, state.lineno_low_incr);

	// Detect carry
	__m256i carry = _mm256_and_si256(_mm256_xor_si256(old_low, _mm256_set1_epi64x(-1)),
	state.lineno_low);
	carry = _mm256_srli_epi64(carry, 63);
	carry = _mm256_add_epi8(carry, carry);
	state.lineno_mid = _mm256_add_epi8(state.lineno_mid, carry);

	// Clamp to valid range
	state.lineno_mid = _mm256_max_epu8(state.lineno_mid, state.lineno_mid_base);

	// Update LINENO_MID_TEMP
	state.lineno_mid_temp = _mm256_add_epi8(state.biascii_offset, state.lineno_mid);

	// Check buffer overflow
	if ((state.output_ptr - io_buffers) >= (ptrdiff_t)BUFFER_SIZE) {
	size_t bytes_in_buffer = (state.output_ptr - io_buffers) - BUFFER_SIZE;
	flush_output(BUFFER_SIZE);
	memmove(io_buffers, io_buffers + BUFFER_SIZE, bytes_in_buffer);
	state.output_ptr = io_buffers + bytes_in_buffer;
	}
	}
	}

	// ============================================================================
	// Phase 4: Final Output (line 1000000000000000000)
	// ============================================================================

	void phase4() {
	const char buzz[] = "Buzz\n";
	memcpy(state.output_ptr, buzz, 5);
	state.output_ptr += 5;
	}

	// ============================================================================
	// Main Entry Point
	// ============================================================================

	int main() {
	try {
	check_avx2();
	init_constants();
	setup_pipe_size();

	state.output_ptr = io_buffers;

	// Phase 1: Lines 1-9 (hardcoded)
	phase1();

	// Phase 2: Lines 10-99999
	phase2();

	// Phase 3: Lines 100000+ using bytecode interpreter
	phase3();

	// Phase 4: Final line
	phase4();

	// Flush any remaining output
	size_t final_bytes = state.output_ptr - io_buffers;
	if (final_bytes > 0) {
	flush_output(final_bytes);
	}

	return 0;
	} catch (...) {
	fprintf(stderr, "Error: Unexpected exception\n");
	return 1;
	}
	}
No results found