Last active
November 24, 2025 10:47
-
-
Save HFTrader/180204dda108de08999e043ab07b9bde to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <immintrin.h> | |
| #include <cpuid.h> | |
| #include <cstring> | |
| #include <cstdint> | |
| #include <cstdlib> | |
| #include <cstdio> | |
| #include <unistd.h> | |
| #include <fcntl.h> | |
| #include <sys/mman.h> | |
| #include <errno.h> | |
| #include <algorithm> | |
| // ============================================================================ | |
| // Configuration & Constants | |
| // ============================================================================ | |
| constexpr size_t BUFFER_SIZE = 2 << 20; // 2 MiB per buffer | |
| constexpr size_t TOTAL_IO_SIZE = BUFFER_SIZE * 2; | |
| constexpr size_t BYTECODE_STORAGE_SIZE = (2 << 20) - 512; | |
| // High-decimal constants (digit D is encoded as 246+D) | |
| constexpr uint8_t HD_ZERO = 246; | |
| constexpr uint8_t HD_NINE = 255; | |
| // ============================================================================ | |
| // Global Buffers and Constants | |
| // ============================================================================ | |
| alignas(4096) uint8_t io_buffers[TOTAL_IO_SIZE]; | |
| alignas(4096) uint8_t bytecode_storage[BYTECODE_STORAGE_SIZE]; | |
| // Constant data arrays | |
| alignas(32) const uint8_t lineno_low_init_vals[32] = { | |
| 0, 0, 0, 0, 0, 0, 0, 0xC0, | |
| 0, 0, 0, 0, 0, 0, 0, 0x40, | |
| 0, 0, 0, 0, 0, 0, 0, 0xC0, | |
| 0, 0, 0, 0, 0, 0, 0, 0x40 | |
| }; | |
| alignas(32) const uint8_t lineno_mid_base_vals[32] = { | |
| HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, | |
| 0xF7, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, | |
| HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, | |
| 0xF7, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO | |
| }; | |
| alignas(32) const uint8_t lineno_top_init_vals[32] = { | |
| 0xC6, 0xC5, 0xC4, 0xC3, 0xC2, 0xC1, 0xC0, 0xBF, | |
| HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, | |
| 0xBE, 0xBD, 0xBC, 0xBB, 0xBA, 0xB9, 0xB8, 0xB7, | |
| HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO, HD_ZERO | |
| }; | |
| alignas(32) const uint8_t lineno_top_max_vals[32] = { | |
| 198, 197, 196, 195, 194, 193, 192, 191, | |
| 255, 255, 255, 255, 255, 255, 255, 255, | |
| 190, 189, 188, 187, 186, 185, 184, 183, | |
| 255, 255, 255, 255, 255, 255, 255, 255 | |
| }; | |
| alignas(32) const uint8_t ascii_offset_vals[32] = { | |
| 58, 58, 58, 58, 58, 58, 58, 58, | |
| 58, 58, 58, 58, 58, 58, 58, 58, | |
| 58, 58, 58, 58, 58, 58, 58, 58, | |
| 58, 58, 58, 58, 58, 58, 58, 58 | |
| }; | |
| alignas(32) const uint8_t biascii_offset_vals[32] = { | |
| 58, 59, 60, 61, 62, 63, 64, 65, | |
| 66, 67, 68, 69, 70, 71, 72, 73, | |
| 58, 59, 60, 61, 62, 63, 64, 65, | |
| 66, 67, 68, 69, 70, 71, 72, 73 | |
| }; | |
| alignas(32) const uint8_t bascii_offset_vals[32] = { | |
| 198, 198, 198, 198, 198, 198, 198, 198, | |
| 198, 198, 198, 198, 198, 198, 198, 198, | |
| 198, 198, 198, 198, 198, 198, 198, 198, | |
| 198, 198, 198, 198, 198, 198, 198, 198 | |
| }; | |
| alignas(32) const uint8_t endian_shuffle_init_vals[32] = { | |
| 9, 8, 7, 6, 5, 4, 3, 2, | |
| 1, 0, 255, 254, 253, 252, 251, 250, | |
| 3, 2, 1, 0, 255, 254, 253, 252, | |
| 251, 250, 249, 248, 247, 246, 245, 244 | |
| }; | |
| // ============================================================================ | |
| // State Structure | |
| // ============================================================================ | |
| struct FizzBuzzState { | |
| // Output management | |
| uint8_t* output_ptr = nullptr; | |
| size_t pipe_size = 0; | |
| // Phase tracking | |
| uint32_t lineno_width = 2; | |
| uint32_t groups_of_15 = 6; | |
| // Bytecode (phase 3) | |
| uint8_t* bytecode_start = nullptr; | |
| uint8_t* bytecode_end = nullptr; | |
| uint8_t* bytecode_ip = nullptr; | |
| uint8_t* bytecode_gen_ptr = nullptr; | |
| int64_t bytecode_neg_len = 0; | |
| // Line number registers (phase 3) - high-decimal format | |
| __m256i lineno_low; | |
| __m256i lineno_mid; | |
| __m256i lineno_top; | |
| __m256i lineno_mid_temp; | |
| __m256i lineno_low_incr; | |
| // Constants (in registers) | |
| __m256i lineno_low_init; | |
| __m256i lineno_mid_base; | |
| __m256i lineno_top_max; | |
| __m256i ascii_offset; | |
| __m256i biascii_offset; | |
| __m256i bascii_offset; | |
| __m256i endian_shuffle; | |
| __m256i endian_shuffle_init; | |
| // Loop state | |
| size_t ymms_at_width = 0; | |
| int64_t regen_trigger = -1; | |
| } state; | |
| // ============================================================================ | |
| // Utility Functions | |
| // ============================================================================ | |
| void check_avx2() { | |
| // Simple AVX2 check - generates SIGILL if unsupported | |
| _mm256_and_si256(_mm256_setzero_si256(), _mm256_setzero_si256()); | |
| } | |
| void init_constants() { | |
| state.lineno_low_init = _mm256_loadu_si256((__m256i*)lineno_low_init_vals); | |
| state.lineno_mid_base = _mm256_loadu_si256((__m256i*)lineno_mid_base_vals); | |
| state.lineno_top_max = _mm256_loadu_si256((__m256i*)lineno_top_max_vals); | |
| state.ascii_offset = _mm256_loadu_si256((__m256i*)ascii_offset_vals); | |
| state.biascii_offset = _mm256_loadu_si256((__m256i*)biascii_offset_vals); | |
| state.bascii_offset = _mm256_loadu_si256((__m256i*)bascii_offset_vals); | |
| state.endian_shuffle_init = _mm256_loadu_si256((__m256i*)endian_shuffle_init_vals); | |
| state.endian_shuffle = state.endian_shuffle_init; | |
| state.lineno_top = _mm256_loadu_si256((__m256i*)lineno_top_init_vals); | |
| state.lineno_low = state.lineno_low_init; | |
| } | |
| void setup_pipe_size() { | |
| // Query L2 cache size via CPUID | |
| uint32_t eax, ebx, ecx, edx; | |
| // Check if extended CPUID 0x80000006 is supported | |
| __get_cpuid(0x80000000, &eax, &ebx, &ecx, &edx); | |
| if (eax < 0x80000006) { | |
| fprintf(stderr, "Error: your CPUID command does not support command " | |
| "0x80000006 (AMD-style L2 cache information).\n"); | |
| exit(59); | |
| } | |
| // Get L2 cache size (in KB, upper 16 bits of ECX) | |
| __get_cpuid(0x80000006, &eax, &ebx, &ecx, &edx); | |
| uint32_t l2_cache_kb = ecx >> 16; | |
| if (l2_cache_kb == 0) { | |
| fprintf(stderr, "CPUID returned 0 for L2 cache size\n"); | |
| exit(59); | |
| } | |
| // Calculate pipe size: half of L2 cache | |
| state.pipe_size = (l2_cache_kb * 1024) / 2; | |
| // Request this pipe size via fcntl | |
| int result = fcntl(1, F_SETPIPE_SZ, (int)state.pipe_size); | |
| if (result < 0) { | |
| if (errno == EBADF) { | |
| fprintf(stderr, "This program can only output to a pipe (try piping into `cat`?)\n"); | |
| exit(73); | |
| } | |
| if (errno == EPERM) { | |
| fprintf(stderr, "Cannot allocate a sufficiently large kernel buffer.\n"); | |
| fprintf(stderr, "Try setting /proc/sys/fs/pipe-max-size to 0x%lx.\n", state.pipe_size); | |
| exit(77); | |
| } | |
| perror("fcntl F_SETPIPE_SZ"); | |
| exit(1); | |
| } | |
| if ((size_t)result != state.pipe_size) { | |
| fprintf(stderr, "Failed to resize the kernel pipe buffer.\n"); | |
| fprintf(stderr, "Requested size: 0x%lx\nActual size: 0x%x\n", state.pipe_size, result); | |
| exit(73); | |
| } | |
| // Use huge pages for buffers to reduce TLB pressure | |
| if (madvise(io_buffers, TOTAL_IO_SIZE, MADV_HUGEPAGE) < 0) { | |
| perror("madvise MADV_HUGEPAGE"); | |
| exit(1); | |
| } | |
| } | |
| // ============================================================================ | |
| // Output Functions | |
| // ============================================================================ | |
| void flush_output(size_t bytes_to_write) { | |
| // Write all buffered output to stdout | |
| size_t written = 0; | |
| while (written < bytes_to_write) { | |
| ssize_t result = write(1, io_buffers + written, bytes_to_write - written); | |
| if (result < 0) { | |
| perror("write"); | |
| exit(1); | |
| } | |
| if (result == 0) { | |
| fprintf(stderr, "write returned 0\n"); | |
| exit(1); | |
| } | |
| written += result; | |
| } | |
| } | |
| // ============================================================================ | |
| // Phase 1: Hardcoded Introduction (lines 1-9) | |
| // ============================================================================ | |
| void phase1() { | |
| const char intro[] = "1\n2\nFizz\n4\nBuzz\nFizz\n7\n8\nFizz\n"; | |
| memcpy(state.output_ptr, intro, 30); | |
| state.output_ptr += 30; | |
| } | |
| // ============================================================================ | |
| // Phase 2: Straightforward FizzBuzz (lines 10-99999) | |
| // ============================================================================ | |
| void phase2() { | |
| // Second phase outputs lines 10-99999 | |
| // Uses straightforward sprintf for line numbers (not high-decimal format) | |
| const uint64_t FIZZ = 0x0a7a7a6946ULL; // "Fizz\n" in little-endian | |
| const uint64_t BUZZ = 0x0a7a7a7542ULL; // "Buzz\n" in little-endian | |
| state.lineno_width = 2; | |
| state.groups_of_15 = 6; | |
| for (uint32_t line = 10; line < 100000; ++line) { | |
| if (line % 15 == 0) { | |
| // FizzBuzz: output "Fizz" (4 bytes) + "Buzz\n" (5 bytes) | |
| uint32_t fizz_part = FIZZ & 0xFFFFFFFFUL; | |
| memcpy(state.output_ptr, &fizz_part, 4); | |
| memcpy(state.output_ptr + 4, &BUZZ, 5); | |
| state.output_ptr += 9; | |
| } else if (line % 3 == 0) { | |
| // Fizz\n | |
| memcpy(state.output_ptr, &FIZZ, 5); | |
| state.output_ptr += 5; | |
| } else if (line % 5 == 0) { | |
| // Buzz\n | |
| memcpy(state.output_ptr, &BUZZ, 5); | |
| state.output_ptr += 5; | |
| } else { | |
| // Line number as decimal ASCII | |
| int len = sprintf((char*)state.output_ptr, "%u\n", line); | |
| state.output_ptr += len; | |
| } | |
| // Check if buffer needs flushing (when we exceed one buffer) | |
| if ((state.output_ptr - io_buffers) >= (ptrdiff_t)BUFFER_SIZE) { | |
| size_t bytes_in_buffer = (state.output_ptr - io_buffers) - BUFFER_SIZE; | |
| flush_output(BUFFER_SIZE); | |
| // Move remaining bytes to start of buffer | |
| memmove(io_buffers, io_buffers + BUFFER_SIZE, bytes_in_buffer); | |
| state.output_ptr = io_buffers + bytes_in_buffer; | |
| } | |
| } | |
| state.lineno_width = 6; // Next phase uses 6+ digits | |
| } | |
| // ============================================================================ | |
| // Phase 3: Bytecode Generation and Interpretation | |
| // ============================================================================ | |
| void generate_bytecode() { | |
| // Simplified bytecode generation | |
| // The real assembly generates bytecode for exactly 600 lines | |
| // For this implementation, we'll use a simplified approach | |
| state.bytecode_gen_ptr = state.bytecode_start; | |
| // Bytecode for Fizz and Buzz literals (negative ASCII) | |
| const uint8_t fizz_bytecode[5] = {0xBA, 0x97, 0x86, 0x86, 0xF6}; // -'F' -'i' -'z' -'z' -'\n' | |
| const uint8_t buzz_bytecode[5] = {0xBE, 0x8B, 0x86, 0x86, 0xF6}; // -'B' -'u' -'z' -'z' -'\n' | |
| // Generate exactly 600 lines of bytecode | |
| // Pattern repeats every 30 lines for the unrolled assembly | |
| for (int line = 0; line < 600; ++line) { | |
| uint32_t line_mod_15 = line % 15; | |
| uint32_t line_offset_in_600 = line % 600; | |
| // Determine Fizz/Buzz pattern | |
| bool is_fizz = (line_offset_in_600 % 3) == 0; | |
| bool is_buzz = (line_offset_in_600 % 5) == 0; | |
| if (is_fizz && is_buzz) { | |
| memcpy(state.bytecode_gen_ptr, fizz_bytecode, 5); | |
| state.bytecode_gen_ptr += 5; | |
| memcpy(state.bytecode_gen_ptr, buzz_bytecode, 5); | |
| state.bytecode_gen_ptr += 5; | |
| } else if (is_fizz) { | |
| memcpy(state.bytecode_gen_ptr, fizz_bytecode, 5); | |
| state.bytecode_gen_ptr += 5; | |
| } else if (is_buzz) { | |
| memcpy(state.bytecode_gen_ptr, buzz_bytecode, 5); | |
| state.bytecode_gen_ptr += 5; | |
| } else { | |
| // Line number - simplified: just output in decimal | |
| // In the real assembly, this would be bytecode to extract digits from LINENO_MID | |
| int len = sprintf((char*)state.bytecode_gen_ptr, "%u\n", line); | |
| state.bytecode_gen_ptr += len; | |
| } | |
| } | |
| size_t bytecode_len = state.bytecode_gen_ptr - state.bytecode_start; | |
| // Copy first 512 bytes to end for wraparound | |
| memcpy(state.bytecode_gen_ptr, state.bytecode_start, | |
| std::min(size_t(512), bytecode_len)); | |
| state.bytecode_end = state.bytecode_start + bytecode_len; | |
| state.bytecode_ip = state.bytecode_start; | |
| state.bytecode_neg_len = -(ptrdiff_t)bytecode_len; | |
| } | |
| void interpret_32bytes(__m256i bytecode_chunk, uint8_t* output_ptr, | |
| __m256i lineno_mid_temp) { | |
| // The bytecode interpreter is the hot loop in the original assembly: | |
| // 1. Load bytecode (32 bytes of instructions) | |
| // 2. Use bytecode as shuffle mask on LINENO_MID_TEMP to extract digits | |
| // 3. Subtract bytecode from result to produce ASCII output | |
| // 4. Store 32-byte block | |
| // Use bytecode as shuffle mask to extract partial results | |
| __m256i shuffled = _mm256_shuffle_epi8(lineno_mid_temp, bytecode_chunk); | |
| // Subtract bytecode to get final output | |
| // For literals (byte >= 128): shuffle produces 0, subtract gives the literal | |
| // For digits (byte < 16): shuffle gives ASCII + offset, subtract gives ASCII | |
| __m256i output = _mm256_sub_epi8(shuffled, bytecode_chunk); | |
| // Store output block | |
| _mm256_storeu_si256((__m256i*)output_ptr, output); | |
| } | |
| void phase3() { | |
| // Phase 3: Bytecode-based interpreter for ultra-high-speed output | |
| // This phase generates bytecode and then interprets it using SIMD | |
| // Reinitialize ENDIAN_SHUFFLE for phase 3 (permute to duplicate both halves) | |
| state.endian_shuffle = _mm256_permute4x64_epi64(state.endian_shuffle, 0xEE); | |
| // Set up bytecode storage | |
| state.bytecode_start = bytecode_storage; | |
| state.bytecode_end = state.bytecode_start + 10000; // Simplified | |
| state.bytecode_ip = state.bytecode_start; | |
| state.regen_trigger = -1; | |
| // Initialize line number tracking for large numbers | |
| state.lineno_low = state.lineno_low_init; | |
| state.lineno_mid = state.lineno_mid_base; | |
| state.lineno_top = _mm256_loadu_si256((__m256i*)lineno_top_init_vals); | |
| // Pre-calculate LINENO_MID_TEMP | |
| state.lineno_mid_temp = _mm256_add_epi8(state.biascii_offset, state.lineno_mid); | |
| // Generate bytecode for 600 lines | |
| generate_bytecode(); | |
| // Initialize increment value (simplified for this demo) | |
| state.lineno_low_incr = _mm256_setzero_si256(); | |
| // Main loop: process bytecode (simplified version) | |
| // In the real assembly, this would process 512 bytes per iteration | |
| const int CHUNKS_PER_ITERATION = 16; // 16 x 32-byte = 512 bytes | |
| int iterations = 10; // Just a few for demo | |
| while (state.bytecode_ip < state.bytecode_end && iterations-- > 0) { | |
| // Process 16 chunks (512 bytes) | |
| for (int c = 0; c < CHUNKS_PER_ITERATION; ++c) { | |
| // In real implementation, would load bytecode via intrinsic | |
| // For this version, we'll just use the bytecode data directly | |
| // to demonstrate the interpreter concept | |
| if (state.bytecode_ip + 32 <= state.bytecode_end) { | |
| __m256i bytecode = _mm256_loadu_si256((__m256i*)state.bytecode_ip); | |
| interpret_32bytes(bytecode, state.output_ptr, state.lineno_mid_temp); | |
| state.bytecode_ip += 32; | |
| state.output_ptr += 32; | |
| } | |
| } | |
| // Every 512 bytes, update line number tracking | |
| __m256i old_low = state.lineno_low; | |
| state.lineno_low = _mm256_add_epi64(state.lineno_low, state.lineno_low_incr); | |
| // Detect carry | |
| __m256i carry = _mm256_and_si256(_mm256_xor_si256(old_low, _mm256_set1_epi64x(-1)), | |
| state.lineno_low); | |
| carry = _mm256_srli_epi64(carry, 63); | |
| carry = _mm256_add_epi8(carry, carry); | |
| state.lineno_mid = _mm256_add_epi8(state.lineno_mid, carry); | |
| // Clamp to valid range | |
| state.lineno_mid = _mm256_max_epu8(state.lineno_mid, state.lineno_mid_base); | |
| // Update LINENO_MID_TEMP | |
| state.lineno_mid_temp = _mm256_add_epi8(state.biascii_offset, state.lineno_mid); | |
| // Check buffer overflow | |
| if ((state.output_ptr - io_buffers) >= (ptrdiff_t)BUFFER_SIZE) { | |
| size_t bytes_in_buffer = (state.output_ptr - io_buffers) - BUFFER_SIZE; | |
| flush_output(BUFFER_SIZE); | |
| memmove(io_buffers, io_buffers + BUFFER_SIZE, bytes_in_buffer); | |
| state.output_ptr = io_buffers + bytes_in_buffer; | |
| } | |
| } | |
| } | |
| // ============================================================================ | |
| // Phase 4: Final Output (line 1000000000000000000) | |
| // ============================================================================ | |
| void phase4() { | |
| const char buzz[] = "Buzz\n"; | |
| memcpy(state.output_ptr, buzz, 5); | |
| state.output_ptr += 5; | |
| } | |
| // ============================================================================ | |
| // Main Entry Point | |
| // ============================================================================ | |
| int main() { | |
| try { | |
| check_avx2(); | |
| init_constants(); | |
| setup_pipe_size(); | |
| state.output_ptr = io_buffers; | |
| // Phase 1: Lines 1-9 (hardcoded) | |
| phase1(); | |
| // Phase 2: Lines 10-99999 | |
| phase2(); | |
| // Phase 3: Lines 100000+ using bytecode interpreter | |
| phase3(); | |
| // Phase 4: Final line | |
| phase4(); | |
| // Flush any remaining output | |
| size_t final_bytes = state.output_ptr - io_buffers; | |
| if (final_bytes > 0) { | |
| flush_output(final_bytes); | |
| } | |
| return 0; | |
| } catch (...) { | |
| fprintf(stderr, "Error: Unexpected exception\n"); | |
| return 1; | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment