Highly Performant Matrix Multiplication in Zig (with a Grid Search to Find Optimal Parameters)
// To run tests, do:
// zig test -OReleaseSafe -mcpu=native matmul.zig
// To run main, do:
// zig build-exe -OReleaseFast -mcpu=native matmul.zig
// ./matmul
//
// ███╗   ███╗ █████╗ ████████╗███╗   ███╗██╗   ██╗██╗     ░░░░░░███████╗██╗ ██████╗
// ████╗ ████║██╔══██╗╚══██╔══╝████╗ ████║██║   ██║██║     ░░░░░░╚══███╔╝██║██╔════╝
// ██╔████╔██║███████║   ██║   ██╔████╔██║██║   ██║██║     ──────  ███╔╝ ██║██║  ███╗
// ██║╚██╔╝██║██╔══██║   ██║   ██║╚██╔╝██║██║   ██║██║     ░░░░░░ ███╔╝  ██║██║   ██║
// ██║ ╚═╝ ██║██║  ██║   ██║   ██║ ╚═╝ ██║╚██████╔╝███████╗░░░░░░███████╗██║╚██████╔╝
// ╚═╝     ╚═╝╚═╝  ╚═╝   ╚═╝   ╚═╝     ╚═╝ ╚═════╝ ╚══════╝░░░░░░╚══════╝╚═╝ ╚═════╝
//
// High Performance Matrix Multiplication Implementation
// Optimized for Modern Intel Processors
//
////////////////////////////////////////////////////////////////////////////////
const std = @import("std");
const builtin = @import("builtin");
const atomic = std.atomic;
///////////////////////////////////////////////////////////////////////////////
// Configuration Constants
///////////////////////////////////////////////////////////////////////////////
/// Tile size for matrix blocking, chosen to balance L1/L2 cache utilization.
/// A value of 160 is tuned for a 13th Gen Intel Core i7's cache hierarchy:
/// - L1D cache: 48KB (holds the active rows of a tile)
/// - L2 cache: 2MB (holds the full tile working set, allowing good tile reuse)
const T: usize = 160;
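// Illustrative sanity check (not in the original gist): the per-thread
// working set is three T×T f32 tiles (A_local, B_local, local_C), i.e.
// 3 * 160 * 160 * 4 ≈ 300KB, which fits the 2MB L2 but not the 48KB L1D.
comptime {
    const tile_bytes = T * T * @sizeOf(f32);
    if (3 * tile_bytes > 2 * 1024 * 1024) @compileError("tile working set exceeds assumed 2MB L2");
}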
/// Vector width for SIMD operations. Matches AVX2 register size (256 bits = 8 floats)
const V: usize = 8;
/// Size of CPU cache line in bytes. Used for alignment and padding.
const CACHE_LINE_SIZE: usize = atomic.cache_line;
/// Number of tiles processed per thread per atomic operation.
/// Small value (1) chosen for better load balancing between threads.
const CHUNK_SIZE: usize = 1;
/// Alignment requirement for AVX2 operations: 32 bytes matches the 256-bit
/// AVX2 register width used for @Vector(8, f32) loads and stores
const AVX2_ALIGNMENT = 32;
/// Size of micro-kernel processing block, automatically determined based on
/// available SIMD capabilities, falling back to 8 if no SIMD available
const MICRO_KERNEL_SIZE: usize = std.simd.suggestVectorLength(f32) orelse 8;
/// Vector type for 8 32-bit floats, matching AVX2 register size
const Vec8f = @Vector(8, f32);
///////////////////////////////////////////////////////////////////////////////
// Thread Management Structures
///////////////////////////////////////////////////////////////////////////////
/// Thread-local data structure for work distribution
/// Aligned and padded to prevent false sharing between threads
const ThreadLocalData = struct {
    /// Atomic counter for distributing work among threads
    current_index: atomic.Value(usize) align(CACHE_LINE_SIZE),
    /// Padding to ensure structure fills entire cache line
    _padding: [CACHE_LINE_SIZE - @sizeOf(atomic.Value(usize))]u8 = undefined,
};
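// Illustrative test (added): the padding math should make the struct exactly
// one cache line, so adjacent counters can never share a line.
test "ThreadLocalData fills one cache line" {
    try std.testing.expectEqual(CACHE_LINE_SIZE, @sizeOf(ThreadLocalData));
}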
/// Context structure passed to each worker thread
const ThreadContext = struct {
    /// Input matrix A (M x K)
    A: []const f32,
    /// Input matrix B (K x N)
    B: []const f32,
    /// Output matrix C (M x N)
    C: []f32,
    /// Number of rows in matrix A and C
    M: usize,
    /// Number of columns in matrix B and C
    N: usize,
    /// Number of columns in A / rows in B
    K: usize,
    /// Number of tiles in M dimension
    tiles_M: usize,
    /// Number of tiles in N dimension
    tiles_N: usize,
    /// Total number of tiles to process
    total_tiles: usize,
    /// Shared atomic counter for work distribution
    shared_counter: *ThreadLocalData,
};
///////////////////////////////////////////////////////////////////////////////
// Main Matrix Multiplication Function
///////////////////////////////////////////////////////////////////////////////
/// Performs parallel tiled matrix multiplication: C = A * B
///
/// Algorithm overview:
/// 1. Divides matrices into tiles of size T x T
/// 2. Creates thread pool based on available CPU cores
/// 3. Distributes tiles to threads using atomic counter
/// 4. Each thread:
///    - Processes assigned tiles using micro-kernels
///    - Uses local accumulation buffers
///    - Writes results back to main memory
///
/// Parameters:
/// - allocator: Memory allocator for thread management
/// - A: Input matrix A (M x K)
/// - B: Input matrix B (K x N)
/// - C: Output matrix C (M x N)
/// - M: Number of rows in A and C
/// - N: Number of columns in B and C
/// - K: Number of columns in A / rows in B
///
/// Returns: Error if thread creation fails
pub fn tiledMatMul(allocator: std.mem.Allocator, A: []const f32, B: []const f32, C: []f32, M: usize, N: usize, K: usize) !void {
    // Calculate tile grid dimensions
    const tiles_M = (M + T - 1) / T; // Ceiling division for M dimension
    const tiles_N = (N + T - 1) / T; // Ceiling division for N dimension
    const total_tiles = tiles_M * tiles_N;
    // Initialize shared atomic counter for work distribution
    var shared_data = ThreadLocalData{ .current_index = atomic.Value(usize).init(0) };
    // Get number of available CPU cores
    const num_threads = try std.Thread.getCpuCount();
    // Create thread pool
    var thread_pool = try std.ArrayList(std.Thread).initCapacity(allocator, num_threads);
    defer thread_pool.deinit();
    // Spawn worker threads
    for (0..num_threads) |_| {
        const context = ThreadContext{
            .A = A,
            .B = B,
            .C = C,
            .M = M,
            .N = N,
            .K = K,
            .tiles_M = tiles_M,
            .tiles_N = tiles_N,
            .total_tiles = total_tiles,
            .shared_counter = &shared_data,
        };
        try thread_pool.append(try std.Thread.spawn(.{}, workerThread, .{context}));
    }
    // Wait for all threads to complete
    for (thread_pool.items) |thread| thread.join();
}
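// Usage sketch (mirrors the tests at the bottom of this file): allocate
// row-major M*K, K*N, and M*N slices, zero C, then call:
//   try tiledMatMul(allocator, A, B, C, M, N, K);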
///////////////////////////////////////////////////////////////////////////////
// Worker Thread Implementation
///////////////////////////////////////////////////////////////////////////////
/// Worker thread function that processes matrix tiles in parallel
///
/// Algorithm overview:
/// 1. Initializes local accumulator buffer aligned for AVX2
/// 2. Fetches work chunks using atomic counter
/// 3. For each tile:
///    - Calculates tile boundaries
///    - Processes tile using micro-kernel
///    - Accumulates results locally
///    - Writes results back to global memory
///
/// Work distribution:
/// - Uses atomic fetch-add for load balancing
/// - Processes CHUNK_SIZE tiles per fetch
/// - Continues until all tiles are processed
///
/// Memory access pattern:
/// - Uses local accumulator to minimize global memory traffic
/// - Performs vectorized read-modify-write for result writeback
/// - Aligns data for efficient SIMD operations
///
/// Parameters:
/// - ctx: ThreadContext containing matrices and dimensions
fn workerThread(ctx: ThreadContext) void {
    // Initialize local accumulator buffer, aligned for AVX2 operations
    var local_C: [T][T]f32 align(AVX2_ALIGNMENT) = undefined;
    while (true) {
        // Atomically fetch next chunk of work
        const start_idx = ctx.shared_counter.current_index.fetchAdd(CHUNK_SIZE, .seq_cst);
        if (start_idx >= ctx.total_tiles) break;
        // Calculate end of current chunk
        const end_idx = @min(start_idx + CHUNK_SIZE, ctx.total_tiles);
        var idx = start_idx;
        // Process all tiles in current chunk
        while (idx < end_idx) : (idx += 1) {
            // Convert linear index to 2D tile coordinates
            const i = idx / ctx.tiles_N; // Tile row index
            const j = idx % ctx.tiles_N; // Tile column index
            // Calculate tile boundaries in original matrix
            const i_start = i * T;
            const j_start = j * T;
            const i_end = @min(i_start + T, ctx.M); // Handle edge tiles
            const j_end = @min(j_start + T, ctx.N); // Handle edge tiles
            // Clear local accumulator buffer
            for (0..T) |x| {
                @memset(&local_C[x], 0);
            }
            // Process tile using T×T blocks along K dimension
            var k: usize = 0;
            while (k < ctx.K) : (k += T) {
                const k_end = @min(k + T, ctx.K);
                microKernelAVX2(ctx, &local_C, i_start, j_start, k, i_end, j_end, k_end);
            }
            // Add accumulated results back into global memory
            for (i_start..i_end) |ii| {
                const row_offset = ii * ctx.N;
                const local_row = ii - i_start;
                var j_idx = j_start;
                // Vector write-back for aligned portions using AVX2
                while (j_idx + 8 <= j_end) : (j_idx += 8) {
                    const vec_idx = j_idx - j_start;
                    // Load 8 elements into vector
                    const c_vec = Vec8f{
                        local_C[local_row][vec_idx],
                        local_C[local_row][vec_idx + 1],
                        local_C[local_row][vec_idx + 2],
                        local_C[local_row][vec_idx + 3],
                        local_C[local_row][vec_idx + 4],
                        local_C[local_row][vec_idx + 5],
                        local_C[local_row][vec_idx + 6],
                        local_C[local_row][vec_idx + 7],
                    };
                    // Load destination elements
                    const dest_vec = Vec8f{
                        ctx.C[row_offset + j_idx],
                        ctx.C[row_offset + j_idx + 1],
                        ctx.C[row_offset + j_idx + 2],
                        ctx.C[row_offset + j_idx + 3],
                        ctx.C[row_offset + j_idx + 4],
                        ctx.C[row_offset + j_idx + 5],
                        ctx.C[row_offset + j_idx + 6],
                        ctx.C[row_offset + j_idx + 7],
                    };
                    // Add vectors and write back results
                    const result = dest_vec + c_vec;
                    for (0..8) |offset| {
                        ctx.C[row_offset + j_idx + offset] = result[offset];
                    }
                }
                // Handle remaining unaligned elements with scalar code
                while (j_idx < j_end) : (j_idx += 1) {
                    ctx.C[row_offset + j_idx] += local_C[local_row][j_idx - j_start];
                }
            }
        }
    }
}
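// Illustrative test (added): workerThread maps a linear tile index to 2D
// tile coordinates with division and modulo by tiles_N.
test "linear tile index maps to (row, col)" {
    const tiles_N_example: usize = 4; // hypothetical 4-tile-wide grid
    const idx: usize = 6;
    try std.testing.expectEqual(@as(usize, 1), idx / tiles_N_example); // tile row
    try std.testing.expectEqual(@as(usize, 2), idx % tiles_N_example); // tile column
}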
///////////////////////////////////////////////////////////////////////////////
// Micro-Kernel Implementation
///////////////////////////////////////////////////////////////////////////////
/// High-performance micro-kernel for matrix multiplication using AVX2 SIMD instructions
///
/// Architecture-specific optimizations:
/// - Uses AVX2 SIMD instructions for 8-wide float operations
/// - Leverages FMA (Fused Multiply-Add) instructions
/// - Implements register blocking for maximum reuse
/// - Aligns local buffers for 32-byte AVX2 access
///
/// Algorithm breakdown:
/// 1. Loads tile data into aligned local buffers
/// 2. Processes 8x8 blocks using SIMD operations
/// 3. Handles edge cases with scalar operations
/// 4. Accumulates results with FMA, avoiding intermediate rounding
///
/// Memory optimization:
/// - Uses aligned local buffers for A and B matrices
/// - Copies tiles into contiguous storage before the inner loops
/// - Maintains cache-friendly access patterns
/// - Minimizes cache pollution through blocking
///
/// Parameters:
/// - ctx: Thread context with matrix data and dimensions
/// - local_C: Local accumulator buffer for results
/// - i_start: Starting row index in global matrix
/// - j_start: Starting column index in global matrix
/// - k_start: Starting index in reduction dimension
/// - i_end: Ending row index (exclusive)
/// - j_end: Ending column index (exclusive)
/// - k_end: Ending reduction index (exclusive)
fn microKernelAVX2(
    ctx: ThreadContext,
    local_C: *[T][T]f32,
    i_start: usize,
    j_start: usize,
    k_start: usize,
    i_end: usize,
    j_end: usize,
    k_end: usize,
) void {
    // Local tile buffers aligned for AVX2 operations
    var A_local: [T][T]f32 align(32) = undefined;
    var B_local: [T][T]f32 align(32) = undefined;
    // Calculate tile dimensions
    const k_size = k_end - k_start;
    const i_size = i_end - i_start;
    const j_size = j_end - j_start;
    // Copy the A tile into a contiguous local buffer
    // Stride-1 access pattern for optimal cache line utilization
    for (0..i_size) |i| {
        const src_idx = (i_start + i) * ctx.K + k_start;
        for (0..k_size) |k| {
            A_local[i][k] = ctx.A[src_idx + k];
        }
    }
    // Copy the B tile into a contiguous local buffer
    // Keeps the inner j loop stride-1 for better cache access
    for (0..k_size) |k| {
        const src_idx = (k_start + k) * ctx.N + j_start;
        for (0..j_size) |j| {
            B_local[k][j] = ctx.B[src_idx + j];
        }
    }
    // Process full 8x8 blocks using AVX2 SIMD
    var i: usize = 0;
    while (i + MICRO_KERNEL_SIZE <= i_size) : (i += MICRO_KERNEL_SIZE) {
        var j: usize = 0;
        while (j + MICRO_KERNEL_SIZE <= j_size) : (j += MICRO_KERNEL_SIZE) {
            // Initialize accumulator for 8x8 block
            var acc: [8][8]f32 align(32) = [_][8]f32{[_]f32{0} ** 8} ** 8;
            // Inner loop - dot product with SIMD
            var k: usize = 0;
            while (k < k_size) : (k += 1) {
                // Load 8 elements from A matrix into vector
                const a_vec = Vec8f{
                    A_local[i][k],     A_local[i + 1][k],
                    A_local[i + 2][k], A_local[i + 3][k],
                    A_local[i + 4][k], A_local[i + 5][k],
                    A_local[i + 6][k], A_local[i + 7][k],
                };
                // Load 8 elements from B matrix into vector
                const b_vec = Vec8f{
                    B_local[k][j],     B_local[k][j + 1],
                    B_local[k][j + 2], B_local[k][j + 3],
                    B_local[k][j + 4], B_local[k][j + 5],
                    B_local[k][j + 6], B_local[k][j + 7],
                };
                // Vectorized outer product using FMA
                inline for (0..8) |bi| {
                    // Broadcast single A element to all lanes
                    const a_broadcast: Vec8f = @splat(a_vec[bi]);
                    // Load current accumulator values
                    const c_vec = Vec8f{
                        acc[bi][0], acc[bi][1], acc[bi][2], acc[bi][3],
                        acc[bi][4], acc[bi][5], acc[bi][6], acc[bi][7],
                    };
                    // Fused multiply-add: acc += a * b
                    const prod = @mulAdd(Vec8f, a_broadcast, b_vec, c_vec);
                    // Store results back to accumulator
                    inline for (0..8) |bj| {
                        acc[bi][bj] = prod[bj];
                    }
                }
            }
            // Accumulate 8x8 block results to main buffer
            for (0..8) |bi| {
                for (0..8) |bj| {
                    local_C[i + bi][j + bj] += acc[bi][bj];
                }
            }
        }
        // Handle edge case columns (< 8 columns remaining)
        while (j < j_size) : (j += 1) {
            for (0..8) |bi| {
                var sum: f32 = 0;
                // Scalar operations for edge cases
                for (0..k_size) |k| {
                    sum = @mulAdd(f32, A_local[i + bi][k], B_local[k][j], sum);
                }
                local_C[i + bi][j] += sum;
            }
        }
    }
    // Handle edge case rows (< 8 rows remaining)
    while (i < i_size) : (i += 1) {
        for (0..j_size) |j| {
            var sum: f32 = 0;
            // Scalar operations for edge cases
            for (0..k_size) |k| {
                sum = @mulAdd(f32, A_local[i][k], B_local[k][j], sum);
            }
            local_C[i][j] += sum;
        }
    }
}
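// Illustrative test (added): @mulAdd on a Vec8f is a per-lane fused
// multiply-add, the primitive the 8x8 micro-kernel is built from.
test "vector fused multiply-add" {
    const a: Vec8f = @splat(2.0);
    const b = Vec8f{ 1, 2, 3, 4, 5, 6, 7, 8 };
    const c: Vec8f = @splat(1.0);
    const r = @mulAdd(Vec8f, a, b, c);
    try std.testing.expectEqual(@as(f32, 5.0), r[1]); // 2 * 2 + 1
}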
/// ------ BENCHMARKING AND PERFORMANCE ANALYSIS ------ ///
const CpuInfo = struct {
    p_freq: ?u64 = null,
    e_freq: ?u64 = null,
    p_cores: ?u32 = null,
    e_cores: ?u32 = null,
    simd_width: u32,
    total_cores: u32,
    supports_peak_calc: bool,
};
fn getSIMDWidth() u32 {
    if (std.Target.x86.featureSetHas(builtin.cpu.features, .avx512f)) {
        return 16;
    } else if (std.Target.x86.featureSetHas(builtin.cpu.features, .avx2)) {
        return 8;
    } else if (std.Target.x86.featureSetHas(builtin.cpu.features, .sse2)) {
        return 4;
    } else {
        return 1;
    }
}
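// Note (added): featureSetHas inspects the compile-time target's features,
// so build with -mcpu=native for this to reflect the host CPU.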
fn getCpuFrequencies() !CpuInfo {
    var info = CpuInfo{
        .simd_width = getSIMDWidth(),
        .total_cores = @intCast(try std.Thread.getCpuCount()),
        .supports_peak_calc = false,
    };
    switch (builtin.os.tag) {
        .linux => {
            // Try reading hybrid architecture core mapping
            if (std.fs.openFileAbsolute("/sys/devices/cpu_core/cpus", .{})) |p_core_file| {
                defer p_core_file.close();
                var buf: [32]u8 = undefined;
                if (p_core_file.reader().readAll(&buf)) |bytes_read| {
                    const p_cores_str = buf[0..bytes_read];
                    // Count cores from range (e.g. "0-7" means 8 threads)
                    if (std.mem.indexOf(u8, p_cores_str, "-")) |dash_idx| {
                        const end_thread = try std.fmt.parseInt(u32, std.mem.trim(u8, p_cores_str[dash_idx + 1 ..], " \n\r"), 10);
                        const start_thread = try std.fmt.parseInt(u32, p_cores_str[0..dash_idx], 10);
                        info.p_cores = @divFloor(end_thread - start_thread + 1, 2); // Divide by 2 for hyperthreading
                    }
                } else |_| {}
            } else |_| {}
            if (std.fs.openFileAbsolute("/sys/devices/cpu_atom/cpus", .{})) |e_core_file| {
                defer e_core_file.close();
                var buf: [32]u8 = undefined;
                if (e_core_file.reader().readAll(&buf)) |bytes_read| {
                    const e_cores_str = buf[0..bytes_read];
                    // Count cores from range (e.g. "8-15" means 8 threads)
                    if (std.mem.indexOf(u8, e_cores_str, "-")) |dash_idx| {
                        const end_thread = try std.fmt.parseInt(u32, std.mem.trim(u8, e_cores_str[dash_idx + 1 ..], " \n\r"), 10);
                        const start_thread = try std.fmt.parseInt(u32, e_cores_str[0..dash_idx], 10);
                        info.e_cores = end_thread - start_thread + 1; // E-cores don't have hyperthreading
                    }
                } else |_| {}
            } else |_| {}
            // Try reading frequencies for P-cores
            if (info.p_cores != null) {
                if (std.fs.openFileAbsolute("/sys/devices/cpu_core/frequency", .{})) |freq_file| {
                    defer freq_file.close();
                    var buf: [32]u8 = undefined;
                    if (freq_file.reader().readAll(&buf)) |bytes_read| {
                        if (std.fmt.parseInt(u64, std.mem.trim(u8, buf[0..bytes_read], " \n\r"), 10)) |freq| {
                            info.p_freq = freq * 1000; // Convert kHz to Hz
                        } else |_| {
                            info.p_freq = 4_500_000_000; // Default 4.5 GHz for P-cores
                        }
                    } else |_| {}
                } else |_| {
                    info.p_freq = 4_500_000_000; // Default 4.5 GHz for P-cores
                }
            }
            // Try reading frequencies for E-cores
            if (info.e_cores != null) {
                if (std.fs.openFileAbsolute("/sys/devices/cpu_atom/frequency", .{})) |freq_file| {
                    defer freq_file.close();
                    var buf: [32]u8 = undefined;
                    if (freq_file.reader().readAll(&buf)) |bytes_read| {
                        if (std.fmt.parseInt(u64, std.mem.trim(u8, buf[0..bytes_read], " \n\r"), 10)) |freq| {
                            info.e_freq = freq * 1000; // Convert kHz to Hz
                        } else |_| {
                            info.e_freq = 3_500_000_000; // Default 3.5 GHz for E-cores
                        }
                    } else |_| {}
                } else |_| {
                    info.e_freq = 3_500_000_000; // Default 3.5 GHz for E-cores
                }
            }
            // If we detected any cores, we can do peak calculation
            info.supports_peak_calc = (info.p_cores != null and info.p_freq != null) or
                (info.e_cores != null and info.e_freq != null);
            // If we couldn't detect anything, fall back to treating all cores as P-cores
            if (!info.supports_peak_calc) {
                info.p_cores = info.total_cores;
                info.e_cores = 0;
                info.p_freq = 3_500_000_000; // Conservative 3.5 GHz default
                info.supports_peak_calc = true;
                std.debug.print("Warning: Could not detect hybrid architecture. Assuming all P-cores at 3.5 GHz\n", .{});
            }
        },
        .windows, .macos => {
            info.p_cores = info.total_cores;
            info.e_cores = 0;
            info.supports_peak_calc = false;
        },
        else => {
            @compileError("Unsupported operating system");
        },
    }
    return info;
}
fn calculateTheoreticalPeak(info: CpuInfo) ?f64 {
    if (!info.supports_peak_calc) {
        return null;
    }
    const p_core_gflops = if (info.p_freq != null and info.p_cores != null)
        (@as(f64, @floatFromInt(info.p_freq.?)) / 1e9) *
            @as(f64, @floatFromInt(info.p_cores.? * info.simd_width * 2))
    else
        0;
    const e_core_gflops = if (info.e_freq != null and info.e_cores != null)
        (@as(f64, @floatFromInt(info.e_freq.?)) / 1e9) *
            @as(f64, @floatFromInt(info.e_cores.? * info.simd_width * 2))
    else
        0;
    return p_core_gflops + e_core_gflops;
}
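// Worked example (illustrative numbers, not a measurement): 8 P-cores at
// 4.5 GHz with 8-wide FMA issue 8 * 8 * 2 = 128 FLOPs per cycle in total,
// giving 4.5 * 128 = 576 GFLOPS of theoretical peak.
test "theoretical peak formula" {
    const info = CpuInfo{
        .p_freq = 4_500_000_000,
        .p_cores = 8,
        .simd_width = 8,
        .total_cores = 8,
        .supports_peak_calc = true,
    };
    try std.testing.expectApproxEqAbs(@as(f64, 576.0), calculateTheoreticalPeak(info).?, 1e-9);
}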
pub fn calculatePreciseGflops(allocator: std.mem.Allocator, M: usize, N: usize, K: usize, iterations: usize) !void {
    const A = try allocator.alloc(f32, M * K);
    defer allocator.free(A);
    const B = try allocator.alloc(f32, K * N);
    defer allocator.free(B);
    const C = try allocator.alloc(f32, M * N);
    defer allocator.free(C);
    // Initialize matrices
    var rng = std.rand.DefaultPrng.init(@intCast(std.time.timestamp()));
    const random = rng.random();
    for (A) |*val| {
        val.* = random.float(f32) * 2.0 - 1.0;
    }
    for (B) |*val| {
        val.* = random.float(f32) * 2.0 - 1.0;
    }
    @memset(C, 0);
    // Warmup run
    try tiledMatMul(allocator, A, B, C, M, N, K);
    // Track performance metrics
    const ops_per_element = 2 * K - 1;
    const total_elements = M * N;
    const flops_per_iteration = total_elements * ops_per_element;
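    // Example (added note): each output element takes K multiplies and K - 1
    // adds, so for M = N = K = 2048 one iteration performs
    // (2 * 2048 - 1) * 2048^2 ≈ 1.72e10 FLOPs, i.e. roughly 17.2 GFLOP.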
    var metrics = try allocator.alloc(struct {
        gflops: f64,
        time_ms: f64,
    }, iterations);
    defer allocator.free(metrics);
    // Main measurement loop
    var timer = try std.time.Timer.start();
    const total_start = timer.read();
    for (0..iterations) |i| {
        @memset(C, 0);
        const iter_start = timer.read();
        try tiledMatMul(allocator, A, B, C, M, N, K);
        const iter_end = timer.read();
        const iter_time_ns = iter_end - iter_start;
        const iter_time_ms = @as(f64, @floatFromInt(iter_time_ns)) / 1e6;
        const iter_gflops = @as(f64, @floatFromInt(flops_per_iteration)) / (@as(f64, @floatFromInt(iter_time_ns)) / 1e9) / 1e9;
        metrics[i] = .{
            .gflops = iter_gflops,
            .time_ms = iter_time_ms,
        };
    }
    // Calculate statistics
    const total_time_ns = timer.read() - total_start;
    const total_time_sec = @as(f64, @floatFromInt(total_time_ns)) / 1e9;
    var min_gflops: f64 = metrics[0].gflops;
    var max_gflops: f64 = metrics[0].gflops;
    var total_gflops: f64 = 0;
    var first_quarter_avg: f64 = 0;
    var last_quarter_avg: f64 = 0;
    const quarter_size = iterations / 4;
    for (metrics, 0..) |m, i| {
        min_gflops = @min(min_gflops, m.gflops);
        max_gflops = @max(max_gflops, m.gflops);
        total_gflops += m.gflops;
        if (i < quarter_size) {
            first_quarter_avg += m.gflops;
        } else if (i >= iterations - quarter_size) {
            last_quarter_avg += m.gflops;
        }
    }
    first_quarter_avg /= @as(f64, @floatFromInt(quarter_size));
    last_quarter_avg /= @as(f64, @floatFromInt(quarter_size));
    const avg_gflops = total_gflops / @as(f64, @floatFromInt(iterations));
    const perf_change_pct = ((last_quarter_avg - first_quarter_avg) / first_quarter_avg) * 100.0;
    // Get CPU info and theoretical peak
    const cpu_info = try getCpuFrequencies();
    // Print detailed performance report
    std.debug.print("\nPerformance Analysis for {d}x{d}x{d} Matrix:\n", .{ M, N, K });
    std.debug.print("----------------------------------------\n", .{});
    std.debug.print("Peak Performance: {d:.2} GFLOPS\n", .{max_gflops});
    std.debug.print("Lowest Performance: {d:.2} GFLOPS\n", .{min_gflops});
    std.debug.print("Average Performance: {d:.2} GFLOPS\n", .{avg_gflops});
    std.debug.print("First Quarter Avg: {d:.2} GFLOPS\n", .{first_quarter_avg});
    std.debug.print("Last Quarter Avg: {d:.2} GFLOPS\n", .{last_quarter_avg});
    if (perf_change_pct >= 0) {
        std.debug.print("Performance Change: {d:.1}% (improvement)\n", .{perf_change_pct});
    } else {
        std.debug.print("Performance Change: {d:.1}% (degradation)\n", .{perf_change_pct});
    }
    std.debug.print("Total Runtime: {d:.3} seconds\n", .{total_time_sec});
    // CPU information and theoretical peak
    std.debug.print("\nCPU Configuration:\n", .{});
    std.debug.print("Total Cores: {d}\n", .{cpu_info.total_cores});
    if (cpu_info.p_cores != null) {
        std.debug.print("P-Cores: {d}\n", .{cpu_info.p_cores.?});
    }
    if (cpu_info.e_cores != null) {
        std.debug.print("E-Cores: {d}\n", .{cpu_info.e_cores.?});
    }
    std.debug.print("SIMD Width: {d} floats\n", .{cpu_info.simd_width});
    if (cpu_info.supports_peak_calc) {
        if (cpu_info.p_freq != null) {
            std.debug.print("P-Core Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.p_freq.?)) / 1e9});
        }
        if (cpu_info.e_freq != null) {
            std.debug.print("E-Core Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.e_freq.?)) / 1e9});
        }
        if (calculateTheoreticalPeak(cpu_info)) |peak| {
            std.debug.print("\nTheoretical Peak: {d:.2} GFLOPS\n", .{peak});
            if (peak > 0) {
                std.debug.print("Peak Efficiency: {d:.1}% of theoretical\n", .{(max_gflops / peak) * 100.0});
            }
        }
    } else {
        std.debug.print("\nNote: Theoretical peak calculation not supported on this OS\n", .{});
    }
    // Print performance distribution
    std.debug.print("\nPerformance Distribution:\n", .{});
    const distribution_width = 60;
    const gflops_range = max_gflops - min_gflops;
    for (metrics) |m| {
        // Guard against a zero range when every iteration performs identically
        const normalized = if (gflops_range > 0) (m.gflops - min_gflops) / gflops_range else 0.5;
        const stars = @as(usize, @intFromFloat(normalized * @as(f64, @floatFromInt(distribution_width))));
        std.debug.print("{d:.2} GFLOPS |", .{m.gflops});
        for (0..stars) |_| {
            std.debug.print("*", .{});
        }
        std.debug.print("\n", .{});
    }
}
pub fn main() !void {
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();
    const allocator = arena.allocator();
    const sizes = [_][3]usize{
        // Uncomment additional sizes as needed
        // .{ 256, 256, 256 },
        // .{ 512, 512, 512 },
        // .{ 1024, 1024, 1024 },
        // .{ 1024, 2048, 1024 },
        .{ 2048, 2048, 2048 },
    };
    const iterations = 100;
    // Print system configuration
    const cpu_info = try getCpuFrequencies();
    std.debug.print("\nSystem Configuration:\n", .{});
    std.debug.print("----------------------------------------\n", .{});
    std.debug.print("Tile size (T): {}\n", .{T});
    std.debug.print("Vector size (V): {}\n", .{V});
    std.debug.print("CPU Cores: {}\n", .{cpu_info.total_cores});
    if (cpu_info.supports_peak_calc) {
        if (cpu_info.p_freq != null) {
            std.debug.print("P-Core Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.p_freq.?)) / 1e9});
        }
        if (cpu_info.e_freq != null) {
            std.debug.print("E-Core Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.e_freq.?)) / 1e9});
        }
    }
    std.debug.print("AVX2 Vector Width: {}\n", .{@sizeOf(Vec8f) / @sizeOf(f32)});
    std.debug.print("Iterations per test: {}\n", .{iterations});
    std.debug.print("----------------------------------------\n\n", .{});
    // Run benchmarks with increasing matrix sizes
    for (sizes) |size| {
        const M = size[0];
        const N = size[1];
        const K = size[2];
        std.debug.print("\n========================================\n", .{});
        std.debug.print("Running benchmark for {d}x{d}x{d} matrices\n", .{ M, N, K });
        std.debug.print("========================================\n", .{});
        try calculatePreciseGflops(allocator, M, N, K, iterations);
        // Add a small delay between tests to let the CPU cool down
        std.time.sleep(1 * std.time.ns_per_s);
    }
    // Print summary
    std.debug.print("\n========================================\n", .{});
    std.debug.print("Benchmark Summary:\n", .{});
    std.debug.print("----------------------------------------\n", .{});
    std.debug.print("Total matrix sizes tested: {d}\n", .{sizes.len});
    std.debug.print("Total iterations per size: {d}\n", .{iterations});
    std.debug.print("AVX2 SIMD utilized: Yes\n", .{});
    std.debug.print("Tiled algorithm: Yes ({}x{})\n", .{ T, T });
    std.debug.print("========================================\n\n", .{});
}
// Additional test for edge cases
test "tiledMatMul edge cases" {
    const allocator = std.testing.allocator;
    // Test with sizes that are not multiples of tile size
    const edge_sizes = [_][3]usize{
        .{ T + 1, T - 1, T + 2 }, // Around tile size
        .{ T * 2 - 1, T * 2 + 1, T * 2 }, // Around double tile size
        .{ 1, T, T }, // Single row
        .{ T, 1, T }, // Single column
        .{ T, T, 1 }, // Single depth
    };
    for (edge_sizes) |size| {
        const M = size[0];
        const N = size[1];
        const K = size[2];
        const A = try allocator.alloc(f32, M * K);
        defer allocator.free(A);
        const B = try allocator.alloc(f32, K * N);
        defer allocator.free(B);
        const C = try allocator.alloc(f32, M * N);
        defer allocator.free(C);
        const C_ref = try allocator.alloc(f32, M * N);
        defer allocator.free(C_ref);
        // Initialize matrices
        for (A, 0..) |*val, i| {
            val.* = @floatFromInt(i % 7);
        }
        for (B, 0..) |*val, i| {
            val.* = @floatFromInt((i + 1) % 5);
        }
        @memset(C, 0);
        @memset(C_ref, 0);
        // Perform multiplications
        try tiledMatMul(allocator, A, B, C, M, N, K);
        // Reference calculation
        for (0..M) |i| {
            for (0..N) |j| {
                var sum: f32 = 0;
                for (0..K) |k| {
                    sum += A[i * K + k] * B[k * N + j];
                }
                C_ref[i * N + j] = sum;
            }
        }
        // Compare results with higher tolerance for edge cases
        for (C, C_ref) |c, c_ref| {
            try std.testing.expectApproxEqAbs(c, c_ref, 1e-4);
        }
        std.debug.print("Edge case test passed for size: M={}, N={}, K={}\n", .{ M, N, K });
    }
}
// Example usage
test "matrix multiplication" {
    const allocator = std.testing.allocator;
    const M: usize = 1024;
    const N: usize = 1024;
    const K: usize = 1024;
    const A = try allocator.alloc(f32, M * K);
    defer allocator.free(A);
    const B = try allocator.alloc(f32, K * N);
    defer allocator.free(B);
    const C = try allocator.alloc(f32, M * N);
    defer allocator.free(C);
    // Initialize matrices
    for (0..M * K) |i| {
        A[i] = @floatFromInt(i % 7);
    }
    for (0..K * N) |i| {
        B[i] = @floatFromInt(i % 5);
    }
    @memset(C, 0);
    try tiledMatMul(allocator, A, B, C, M, N, K);
}
// Test function
test "tiledMatMul correctness" {
    const allocator = std.testing.allocator;
    const test_sizes = [_][3]usize{
        .{ 128, 128, 128 },
        .{ 100, 100, 100 },
        .{ 200, 150, 175 },
        .{ 32, 64, 48 },
        .{ 47, 34, 45 },
    };
    for (test_sizes) |size| {
        const M = size[0];
        const N = size[1];
        const K = size[2];
        const A = try allocator.alloc(f32, M * K);
        defer allocator.free(A);
        const B = try allocator.alloc(f32, K * N);
        defer allocator.free(B);
        const C = try allocator.alloc(f32, M * N);
        defer allocator.free(C);
        const C_ref = try allocator.alloc(f32, M * N);
        defer allocator.free(C_ref);
        // Initialize matrices
        for (A, 0..) |*val, i| {
            val.* = @floatFromInt(i % 10);
        }
        for (B, 0..) |*val, i| {
            val.* = @floatFromInt((i + 1) % 10);
        }
        @memset(C, 0);
        @memset(C_ref, 0);
        // Perform tiled matrix multiplication
        try tiledMatMul(allocator, A, B, C, M, N, K);
        // Perform reference matrix multiplication
        for (0..M) |i| {
            for (0..N) |j| {
                var sum: f32 = 0;
                for (0..K) |k| {
                    sum += A[i * K + k] * B[k * N + j];
                }
                C_ref[i * N + j] = sum;
            }
        }
        // Compare results
        for (C, C_ref) |c, c_ref| {
            try std.testing.expectApproxEqAbs(c, c_ref, 1e-5);
        }
        std.debug.print("Test passed for size: M={}, N={}, K={}\n", .{ M, N, K });
    }
}
////////////////////////////////////////////////////////////////////////////////
// Second file: grid-search variant of the same kernel. Tile size (T) and
// vector size (V) are runtime parameters here, so the search code below can
// benchmark many (T, V) configurations without recompiling.
////////////////////////////////////////////////////////////////////////////////
const std = @import("std");
const builtin = @import("builtin");
const atomic = std.atomic;
const CACHE_LINE_SIZE: usize = atomic.cache_line;
const CHUNK_SIZE: usize = 1;
const AVX2_ALIGNMENT = 32;
const Vec8f = @Vector(8, f32);
const ThreadLocalData = struct {
    current_index: atomic.Value(usize) align(CACHE_LINE_SIZE),
    _padding: [CACHE_LINE_SIZE - @sizeOf(atomic.Value(usize))]u8 = undefined,
};
pub fn tiledMatMul(
    allocator: std.mem.Allocator,
    A: []const f32,
    B: []const f32,
    C: []f32,
    M: usize,
    N: usize,
    K: usize,
    T: usize,
    V: usize,
) !void {
    const tiles_M = (M + T - 1) / T;
    const tiles_N = (N + T - 1) / T;
    const total_tiles = tiles_M * tiles_N;
    var shared_data = ThreadLocalData{ .current_index = atomic.Value(usize).init(0) };
    const num_threads = try std.Thread.getCpuCount();
    // Allocate thread pool
    var thread_pool = try std.ArrayList(std.Thread).initCapacity(allocator, num_threads);
    defer thread_pool.deinit();
    // Allocate local C buffers for each thread
    const buffer_size = T * T;
    const local_C_buffers = try allocator.alloc(f32, buffer_size * num_threads);
    defer allocator.free(local_C_buffers);
    // Create and spawn threads
    for (0..num_threads) |thread_id| {
        const thread_buffer = local_C_buffers[thread_id * buffer_size .. (thread_id + 1) * buffer_size];
        const context = ThreadContext{
            .A = A,
            .B = B,
            .C = C,
            .M = M,
            .N = N,
            .K = K,
            .T = T,
            .V = V,
            .tiles_M = tiles_M,
            .tiles_N = tiles_N,
            .total_tiles = total_tiles,
            .shared_counter = &shared_data,
            .local_C = thread_buffer,
        };
        try thread_pool.append(try std.Thread.spawn(.{}, workerThread, .{context}));
    }
    // Wait for all threads to complete
    for (thread_pool.items) |thread| thread.join();
}
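// Smoke test (added sketch): exercises the runtime T/V parameters with a
// small, arbitrary configuration (T = 32, V = 8).
test "runtime-parameterized matmul smoke test" {
    const allocator = std.testing.allocator;
    const n: usize = 64;
    const A = try allocator.alloc(f32, n * n);
    defer allocator.free(A);
    const B = try allocator.alloc(f32, n * n);
    defer allocator.free(B);
    const C = try allocator.alloc(f32, n * n);
    defer allocator.free(C);
    for (A, 0..) |*val, i| val.* = @floatFromInt(i % 3);
    for (B, 0..) |*val, i| val.* = @floatFromInt(i % 5);
    @memset(C, 0);
    try tiledMatMul(allocator, A, B, C, n, n, n, 32, 8);
}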
const ThreadContext = struct {
    A: []const f32,
    B: []const f32,
    C: []f32,
    M: usize,
    N: usize,
    K: usize,
    T: usize,
    V: usize,
    tiles_M: usize,
    tiles_N: usize,
    total_tiles: usize,
    shared_counter: *ThreadLocalData,
    local_C: []f32,
};
fn workerThread(ctx: ThreadContext) void {
    const max_tile_size = 512; // Maximum supported tile size
    var A_local: [max_tile_size * max_tile_size]f32 align(32) = undefined;
    var B_local: [max_tile_size * max_tile_size]f32 align(32) = undefined;
    if (ctx.T > max_tile_size) {
        @panic("Tile size exceeds maximum supported size");
    }
    while (true) {
        const start_idx = ctx.shared_counter.current_index.fetchAdd(CHUNK_SIZE, .seq_cst);
        if (start_idx >= ctx.total_tiles) break;
        const end_idx = @min(start_idx + CHUNK_SIZE, ctx.total_tiles);
        var idx = start_idx;
        while (idx < end_idx) : (idx += 1) {
            const i = idx / ctx.tiles_N;
            const j = idx % ctx.tiles_N;
            const i_start = i * ctx.T;
            const j_start = j * ctx.T;
            const i_end = @min(i_start + ctx.T, ctx.M);
            const j_end = @min(j_start + ctx.T, ctx.N);
            // Reset local_C
            @memset(ctx.local_C, 0);
            var k: usize = 0;
            while (k < ctx.K) : (k += ctx.T) {
                const k_end = @min(k + ctx.T, ctx.K);
                microKernelAVX2(ctx, &A_local, &B_local, i_start, j_start, k, i_end, j_end, k_end);
            }
            // Update global C
            for (i_start..i_end) |ii| {
                const row_offset = ii * ctx.N;
                const local_row = (ii - i_start) * ctx.T;
                var j_idx = j_start;
                while (j_idx + ctx.V <= j_end) : (j_idx += ctx.V) {
                    if (ctx.V == 8) {
                        const vec_idx = j_idx - j_start;
                        const c_vec = Vec8f{
                            ctx.local_C[local_row + vec_idx],
                            ctx.local_C[local_row + vec_idx + 1],
                            ctx.local_C[local_row + vec_idx + 2],
                            ctx.local_C[local_row + vec_idx + 3],
                            ctx.local_C[local_row + vec_idx + 4],
                            ctx.local_C[local_row + vec_idx + 5],
                            ctx.local_C[local_row + vec_idx + 6],
                            ctx.local_C[local_row + vec_idx + 7],
                        };
                        const dest_vec = Vec8f{
                            ctx.C[row_offset + j_idx],
                            ctx.C[row_offset + j_idx + 1],
                            ctx.C[row_offset + j_idx + 2],
                            ctx.C[row_offset + j_idx + 3],
                            ctx.C[row_offset + j_idx + 4],
                            ctx.C[row_offset + j_idx + 5],
                            ctx.C[row_offset + j_idx + 6],
                            ctx.C[row_offset + j_idx + 7],
                        };
                        const result = dest_vec + c_vec;
                        for (0..8) |offset| {
                            ctx.C[row_offset + j_idx + offset] = result[offset];
                        }
                    } else {
                        // Handle other vector sizes
                        for (0..ctx.V) |offset| {
                            ctx.C[row_offset + j_idx + offset] +=
                                ctx.local_C[local_row + (j_idx - j_start) + offset];
                        }
                    }
                }
                // Handle remaining elements
                while (j_idx < j_end) : (j_idx += 1) {
                    ctx.C[row_offset + j_idx] += ctx.local_C[local_row + (j_idx - j_start)];
                }
            }
        }
    }
}
fn microKernelAVX2(
    ctx: ThreadContext,
    A_local: []f32,
    B_local: []f32,
    i_start: usize,
    j_start: usize,
    k_start: usize,
    i_end: usize,
    j_end: usize,
    k_end: usize,
) void {
    const k_size = k_end - k_start;
    const i_size = i_end - i_start;
    const j_size = j_end - j_start;
    // Load A and B tiles
    for (0..i_size) |i| {
        const src_idx = (i_start + i) * ctx.K + k_start;
        const local_idx = i * ctx.T;
        for (0..k_size) |k| {
            A_local[local_idx + k] = ctx.A[src_idx + k];
        }
    }
    for (0..k_size) |k| {
        const src_idx = (k_start + k) * ctx.N + j_start;
        const local_idx = k * ctx.T;
        for (0..j_size) |j| {
            B_local[local_idx + j] = ctx.B[src_idx + j];
        }
    }
    var i: usize = 0;
    while (i + ctx.V <= i_size) : (i += ctx.V) {
        var j: usize = 0;
        while (j + ctx.V <= j_size) : (j += ctx.V) {
            if (ctx.V == 8) {
                var acc: [8][8]f32 align(32) = [_][8]f32{[_]f32{0} ** 8} ** 8;
                for (0..k_size) |k| {
                    const a_vec = Vec8f{
                        A_local[i * ctx.T + k],
                        A_local[(i + 1) * ctx.T + k],
                        A_local[(i + 2) * ctx.T + k],
                        A_local[(i + 3) * ctx.T + k],
                        A_local[(i + 4) * ctx.T + k],
                        A_local[(i + 5) * ctx.T + k],
                        A_local[(i + 6) * ctx.T + k],
                        A_local[(i + 7) * ctx.T + k],
                    };
                    const b_vec = Vec8f{
                        B_local[k * ctx.T + j],
                        B_local[k * ctx.T + j + 1],
                        B_local[k * ctx.T + j + 2],
                        B_local[k * ctx.T + j + 3],
                        B_local[k * ctx.T + j + 4],
                        B_local[k * ctx.T + j + 5],
                        B_local[k * ctx.T + j + 6],
                        B_local[k * ctx.T + j + 7],
                    };
                    inline for (0..8) |bi| {
                        const a_broadcast: Vec8f = @splat(a_vec[bi]);
                        const c_vec = Vec8f{
                            acc[bi][0], acc[bi][1], acc[bi][2], acc[bi][3],
                            acc[bi][4], acc[bi][5], acc[bi][6], acc[bi][7],
                        };
                        const prod = @mulAdd(Vec8f, a_broadcast, b_vec, c_vec);
                        inline for (0..8) |bj| {
                            acc[bi][bj] = prod[bj];
                        }
                    }
                }
                // Store results
                for (0..8) |bi| {
                    for (0..8) |bj| {
                        const local_idx = (i + bi) * ctx.T + (j + bj);
                        ctx.local_C[local_idx] += acc[bi][bj];
                    }
                }
            } else {
                // Handle other vector sizes with scalar operations
                for (0..ctx.V) |bi| {
                    for (0..ctx.V) |bj| {
                        var sum: f32 = 0;
                        for (0..k_size) |k| {
                            sum = @mulAdd(f32, A_local[(i + bi) * ctx.T + k], B_local[k * ctx.T + j + bj], sum);
                        }
                        ctx.local_C[(i + bi) * ctx.T + j + bj] += sum;
                    }
                }
            }
        }
        // Handle remaining columns
        while (j < j_size) : (j += 1) {
            for (0..ctx.V) |bi| {
                var sum: f32 = 0;
                for (0..k_size) |k| {
                    sum = @mulAdd(f32, A_local[(i + bi) * ctx.T + k], B_local[k * ctx.T + j], sum);
                }
                ctx.local_C[(i + bi) * ctx.T + j] += sum;
            }
        }
    }
    // Handle remaining rows
    while (i < i_size) : (i += 1) {
        for (0..j_size) |j| {
            var sum: f32 = 0;
            for (0..k_size) |k| {
                sum = @mulAdd(f32, A_local[i * ctx.T + k], B_local[k * ctx.T + j], sum);
            }
            ctx.local_C[i * ctx.T + j] += sum;
        }
    }
}
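// Indexing note (added): with a runtime T the local tiles are flat slices,
// so element (i, k) lives at A_local[i * ctx.T + k] rather than A_local[i][k].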
const time = std.time;
const testing = std.testing;
pub const SearchResult = struct {
    tile_size: usize,
    vector_size: usize,
    gflops: f64,
};
const SearchRange = struct {
    t_min: usize,
    t_max: usize,
    t_step: usize,
    v_sizes: []const usize,
};
pub fn adaptiveGridSearch(
    allocator: std.mem.Allocator,
    M: usize,
    N: usize,
    K: usize,
    initial_range: SearchRange,
    refinement_levels: usize,
) !SearchResult {
    var best_result = SearchResult{
        .tile_size = 0,
        .vector_size = 0,
        .gflops = 0,
    };
    var current_range = initial_range;
    for (0..refinement_levels) |level| {
        std.debug.print("\nLevel {d} Search (T: {d}-{d}, step: {d})\n", .{ level, current_range.t_min, current_range.t_max, current_range.t_step });
        std.debug.print("----------------------------------------\n", .{});
        // Search current range
        var results = std.ArrayList(SearchResult).init(allocator);
        defer results.deinit();
        var t = current_range.t_min;
        while (t <= current_range.t_max) : (t += current_range.t_step) {
            // Skip invalid tile sizes
            if (t > M or t > N or t > K) continue;
            for (current_range.v_sizes) |v| {
                if (t % v != 0) continue; // Skip if tile size is not divisible by vector size
                // Run multiple trials and keep the best
                var best_trial = try benchmarkMatMul(allocator, t, v, M, N, K);
                for (0..2) |_| {
                    const trial = try benchmarkMatMul(allocator, t, v, M, N, K);
                    if (trial.gflops > best_trial.gflops) {
                        best_trial.gflops = trial.gflops;
                    }
                }
                try results.append(.{
                    .tile_size = t,
                    .vector_size = v,
                    .gflops = best_trial.gflops,
                });
                std.debug.print("T={d:<4} V={d:<3}: {d:.2} GFLOPS\n", .{ t, v, best_trial.gflops });
            }
        }
        // Find top performers
        const top_k = 3; // Number of top results to consider
        const top_results = try allocator.alloc(SearchResult, @min(top_k, results.items.len));
        defer allocator.free(top_results);
        // Sort results by GFLOPS (descending)
        std.sort.pdq(SearchResult, results.items, {}, struct {
            fn lessThan(_: void, a: SearchResult, b: SearchResult) bool {
                return a.gflops > b.gflops;
            }
        }.lessThan);
        // Copy top results
        @memcpy(top_results, results.items[0..top_results.len]);
        // Update best overall result
        if (results.items[0].gflops > best_result.gflops) {
            best_result = results.items[0];
        }
        // Print top performers for this level
        std.debug.print("\nTop performers at level {d}:\n", .{level});
        for (top_results, 0..) |result, i| {
            std.debug.print("{d}. T={d:<4} V={d:<3}: {d:.2} GFLOPS\n", .{ i + 1, result.tile_size, result.vector_size, result.gflops });
        }
        // Refine search range around best result
        if (level + 1 < refinement_levels) {
            const best_T = top_results[0].tile_size;
            const step = current_range.t_step / 2;
            current_range = SearchRange{
                .t_min = if (best_T > step * 2) best_T - step * 2 else current_range.t_min,
                .t_max = @min(best_T + step * 2, current_range.t_max),
                .t_step = step,
                .v_sizes = current_range.v_sizes,
            };
        }
    }
    return best_result;
}
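// Refinement sketch (added): with t_min = 32, t_max = 512, t_step = 64,
// level 0 probes T in {32, 96, 160, ..., 480}; each later level halves the
// step and re-centers a +/- 2-step window on the best T found so far.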
pub fn treeSearch(
    allocator: std.mem.Allocator,
    M: usize,
    N: usize,
    K: usize,
) !SearchResult {
    // Initial coarse-grained search
    const initial_range = SearchRange{
        .t_min = 32,
        .t_max = 512,
        .t_step = 64,
        .v_sizes = &[_]usize{ 4, 8, 16 },
    };
    std.debug.print("Starting tree search for {d}x{d}x{d} matrix\n", .{ M, N, K });
    std.debug.print("----------------------------------------\n", .{});
    const result = try adaptiveGridSearch(allocator, M, N, K, initial_range, 4); // 4 refinement levels
    std.debug.print("\nFinal optimal configuration:\n", .{});
    std.debug.print("Tile Size (T): {d}\n", .{result.tile_size});
    std.debug.print("Vector Size (V): {d}\n", .{result.vector_size});
    std.debug.print("Performance: {d:.2} GFLOPS\n", .{result.gflops});
    return result;
}
pub fn benchmarkMatMul(
    allocator: std.mem.Allocator,
    tile_size: usize,
    vector_size: usize,
    M: usize,
    N: usize,
    K: usize,
) !SearchResult {
    const A = try allocator.alloc(f32, M * K);
    defer allocator.free(A);
    const B = try allocator.alloc(f32, K * N);
    defer allocator.free(B);
    const C = try allocator.alloc(f32, M * N);
    defer allocator.free(C);
    // Initialize matrices with random data
    var rng = std.rand.DefaultPrng.init(0);
    const random = rng.random();
    for (A) |*val| val.* = random.float(f32);
    for (B) |*val| val.* = random.float(f32);
    @memset(C, 0);
    // Warm-up run
    try tiledMatMul(allocator, A, B, C, M, N, K, tile_size, vector_size);
    // Actual benchmark
    const start_time = time.nanoTimestamp();
    try tiledMatMul(allocator, A, B, C, M, N, K, tile_size, vector_size);
    const end_time = time.nanoTimestamp();
    const elapsed_ns: u64 = @intCast(end_time - start_time);
    // Calculate GFLOPS
    const operations = @as(f64, @floatFromInt(2 * M * N * K)); // 2 operations per multiply-add
    const seconds = @as(f64, @floatFromInt(elapsed_ns)) / 1e9;
    const gflops = operations / (seconds * 1e9);
    return SearchResult{
        .tile_size = tile_size,
        .vector_size = vector_size,
        .gflops = gflops,
    };
}
const CpuInfo = struct {
    p_freq: ?u64 = null,
    e_freq: ?u64 = null,
    p_cores: ?u32 = null,
    e_cores: ?u32 = null,
    simd_width: u32,
    total_cores: u32,
    supports_peak_calc: bool,
};
fn getSIMDWidth() u32 {
    if (std.Target.x86.featureSetHas(builtin.cpu.features, .avx512f)) {
        return 16;
    } else if (std.Target.x86.featureSetHas(builtin.cpu.features, .avx2)) {
        return 8;
    } else if (std.Target.x86.featureSetHas(builtin.cpu.features, .sse2)) {
        return 4;
    } else {
        return 1;
    }
}
fn getCpuFrequencies() !CpuInfo {
    var info = CpuInfo{
        .simd_width = getSIMDWidth(),
        .total_cores = @intCast(try std.Thread.getCpuCount()),
        .supports_peak_calc = false,
    };
    switch (builtin.os.tag) {
        .linux => {
            // Try reading hybrid architecture core mapping
            if (std.fs.openFileAbsolute("/sys/devices/cpu_core/cpus", .{})) |p_core_file| {
                defer p_core_file.close();
                var buf: [32]u8 = undefined;
                if (p_core_file.reader().readAll(&buf)) |bytes_read| {
                    const p_cores_str = buf[0..bytes_read];
                    // Count cores from range (e.g. "0-7" means 8 threads)
                    if (std.mem.indexOf(u8, p_cores_str, "-")) |dash_idx| {
                        const end_thread = try std.fmt.parseInt(u32, std.mem.trim(u8, p_cores_str[dash_idx + 1 ..], " \n\r"), 10);
                        const start_thread = try std.fmt.parseInt(u32, p_cores_str[0..dash_idx], 10);
                        info.p_cores = @divFloor(end_thread - start_thread + 1, 2); // Divide by 2 for hyperthreading
                    }
                } else |_| {}
            } else |_| {}
            if (std.fs.openFileAbsolute("/sys/devices/cpu_atom/cpus", .{})) |e_core_file| {
                defer e_core_file.close();
                var buf: [32]u8 = undefined;
                if (e_core_file.reader().readAll(&buf)) |bytes_read| {
                    const e_cores_str = buf[0..bytes_read];
                    // Count cores from range (e.g. "8-15" means 8 threads)
                    if (std.mem.indexOf(u8, e_cores_str, "-")) |dash_idx| {
                        const end_thread = try std.fmt.parseInt(u32, std.mem.trim(u8, e_cores_str[dash_idx + 1 ..], " \n\r"), 10);
                        const start_thread = try std.fmt.parseInt(u32, e_cores_str[0..dash_idx], 10);
                        info.e_cores = end_thread - start_thread + 1; // E-cores don't have hyperthreading
                    }
                } else |_| {}
            } else |_| {}
            // Try reading frequencies for P-cores
            if (info.p_cores != null) {
                if (std.fs.openFileAbsolute("/sys/devices/cpu_core/frequency", .{})) |freq_file| {
                    defer freq_file.close();
                    var buf: [32]u8 = undefined;
                    if (freq_file.reader().readAll(&buf)) |bytes_read| {
                        if (std.fmt.parseInt(u64, std.mem.trim(u8, buf[0..bytes_read], " \n\r"), 10)) |freq| {
                            info.p_freq = freq * 1000; // Convert kHz to Hz
                        } else |_| {
                            info.p_freq = 4_500_000_000; // Default 4.5 GHz for P-cores
                        }
                    } else |_| {}
                } else |_| {
                    info.p_freq = 4_500_000_000; // Default 4.5 GHz for P-cores
                }
            }
            // Try reading frequencies for E-cores
            if (info.e_cores != null) {
                if (std.fs.openFileAbsolute("/sys/devices/cpu_atom/frequency", .{})) |freq_file| {
                    defer freq_file.close();
                    var buf: [32]u8 = undefined;
                    if (freq_file.reader().readAll(&buf)) |bytes_read| {
                        if (std.fmt.parseInt(u64, std.mem.trim(u8, buf[0..bytes_read], " \n\r"), 10)) |freq| {
                            info.e_freq = freq * 1000; // Convert kHz to Hz
                        } else |_| {
                            info.e_freq = 3_500_000_000; // Default 3.5 GHz for E-cores
                        }
                    } else |_| {}
                } else |_| {
                    info.e_freq = 3_500_000_000; // Default 3.5 GHz for E-cores
                }
            }
            // If we detected any cores, we can do peak calculation
            info.supports_peak_calc = (info.p_cores != null and info.p_freq != null) or
                (info.e_cores != null and info.e_freq != null);
            // If we couldn't detect anything, fall back to treating all cores as P-cores
            if (!info.supports_peak_calc) {
                info.p_cores = info.total_cores;
                info.e_cores = 0;
                info.p_freq = 3_500_000_000; // Conservative 3.5 GHz default
                info.supports_peak_calc = true;
                std.debug.print("Warning: Could not detect hybrid architecture. Assuming all P-cores at 3.5 GHz\n", .{});
            }
        },
        .windows, .macos => {
            info.p_cores = info.total_cores;
            info.e_cores = 0;
            info.supports_peak_calc = false;
        },
        else => {
            @compileError("Unsupported operating system");
        },
    }
    return info;
}
pub fn main() !void {
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();
    const allocator = arena.allocator();
    const sizes = [_][3]usize{
        // Uncomment additional sizes as needed
        .{ 1024, 1024, 1024 },
        // .{ 2048, 2048, 2048 },
        // .{ 4096, 4096, 4096 },
    };
    const iterations = 5; // Reduced for the tree-search phase; increase for a final benchmark
    // Print system configuration
    const cpu_info = try getCpuFrequencies();
    std.debug.print("\nSystem Configuration:\n", .{});
    std.debug.print("----------------------------------------\n", .{});
    std.debug.print("CPU Cores: {}\n", .{cpu_info.total_cores});
    if (cpu_info.supports_peak_calc) {
        if (cpu_info.p_freq != null) {
            std.debug.print("P-Core Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.p_freq.?)) / 1e9});
        }
        if (cpu_info.e_freq != null) {
            std.debug.print("E-Core Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.e_freq.?)) / 1e9});
        }
    }
    std.debug.print("AVX2 Vector Width: {}\n", .{@sizeOf(Vec8f) / @sizeOf(f32)});
    std.debug.print("----------------------------------------\n\n", .{});
    for (sizes) |size| {
        const M = size[0];
        const N = size[1];
        const K = size[2];
        std.debug.print("\n========================================\n", .{});
        std.debug.print("Matrix Size: {d}x{d}x{d}\n", .{ M, N, K });
        std.debug.print("========================================\n", .{});
        // Phase 1: Tree Search
        std.debug.print("\nAdaptive Parameter Search\n", .{});
        std.debug.print("----------------------------------------\n", .{});
        const best_config = try treeSearch(allocator, M, N, K);
        std.debug.print("\nBest configuration found:\n", .{});
        std.debug.print("Tile Size (T): {d}\n", .{best_config.tile_size});
        std.debug.print("Vector Size (V): {d}\n", .{best_config.vector_size});
        std.debug.print("Initial Performance: {d:.2} GFLOPS\n\n", .{best_config.gflops});
    }
    // Print summary
    std.debug.print("\n========================================\n", .{});
    std.debug.print("Benchmark Summary:\n", .{});
    std.debug.print("----------------------------------------\n", .{});
    std.debug.print("Total matrix sizes tested: {d}\n", .{sizes.len});
    std.debug.print("Tree search levels: 4\n", .{});
    std.debug.print("Detailed analysis iterations: {d}\n", .{iterations});
    std.debug.print("AVX2 SIMD utilized: Yes\n", .{});
    std.debug.print("========================================\n\n", .{});
}