matmul macos
const std = @import("std");
const builtin = @import("builtin");
const time = std.time;
const atomic = std.atomic;
const CACHE_LINE_SIZE: usize = atomic.cache_line;
const CHUNK_SIZE: usize = 1;
const NEON_ALIGNMENT = 16;
// For ARM NEON, we use 4-wide vectors as that's the native SIMD width
const Vec4f = @Vector(4, f32);
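// Work distribution: threads claim tile indices from a shared atomic counter.
// The counter is padded out to a full cache line so concurrent fetchAdd calls
// from different cores do not false-share with neighboring data.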
const ThreadLocalData = struct {
current_index: atomic.Value(usize) align(CACHE_LINE_SIZE),
_padding: [CACHE_LINE_SIZE - @sizeOf(atomic.Value(usize))]u8 = undefined,
};
// Main matrix multiplication logic remains similar, but with NEON-specific microkernel
pub fn tiledMatMul(
allocator: std.mem.Allocator,
A: []const f32,
B: []const f32,
C: []f32,
M: usize,
N: usize,
K: usize,
T: usize,
V: usize,
) !void {
const tiles_M = (M + T - 1) / T;
const tiles_N = (N + T - 1) / T;
const total_tiles = tiles_M * tiles_N;
var shared_data = ThreadLocalData{ .current_index = atomic.Value(usize).init(0) };
const num_threads = try std.Thread.getCpuCount();
// Allocate thread pool
var thread_pool = try std.ArrayList(std.Thread).initCapacity(allocator, num_threads);
defer thread_pool.deinit();
// Allocate local C buffers for each thread
const buffer_size = T * T;
var local_C_buffers = try allocator.alloc(f32, buffer_size * num_threads);
defer allocator.free(local_C_buffers);
// For NEON alignment
const aligned_T = (T + 3) & ~@as(usize, 3); // Align to 4-float boundary
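// e.g. T = 48 stays 48 and T = 50 rounds up to 52. With the search grid used in
// main() (tile sizes that are multiples of 4 by construction), aligned_T always
// equals T, so the T*T local_C buffer stays large enough for the aligned_T-strided
// indexing in the microkernels below.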
// Create and spawn threads
for (0..num_threads) |thread_id| {
const thread_buffer = local_C_buffers[thread_id * buffer_size .. (thread_id + 1) * buffer_size];
const context = ThreadContext{
.A = A,
.B = B,
.C = C,
.M = M,
.N = N,
.K = K,
.T = T,
.V = V,
.aligned_T = aligned_T,
.tiles_M = tiles_M,
.tiles_N = tiles_N,
.total_tiles = total_tiles,
.shared_counter = &shared_data,
.local_C = thread_buffer,
};
try thread_pool.append(try std.Thread.spawn(.{}, workerThread, .{context}));
}
// Wait for all threads to complete
for (thread_pool.items) |thread| thread.join();
}
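// A minimal usage sketch (added for illustration, not part of the original gist):
// tiledMatMul is called with an explicit tile size T and vector width V. Note that
// V is carried through ThreadContext but the NEON path always uses 4-wide vectors,
// so V = 4 is the only meaningful choice here. The 64x64x64 shape and T = 32 below
// are illustrative assumptions, not tuned values.
test "tiledMatMul usage sketch" {
    const allocator = std.testing.allocator;
    const M: usize = 64;
    const N: usize = 64;
    const K: usize = 64;
    const A = try allocator.alloc(f32, M * K);
    defer allocator.free(A);
    const B = try allocator.alloc(f32, K * N);
    defer allocator.free(B);
    const C = try allocator.alloc(f32, M * N);
    defer allocator.free(C);
    for (A) |*v| v.* = 1.0; // A is all ones
    for (B) |*v| v.* = 2.0; // B is all twos
    @memset(C, 0);
    try tiledMatMul(allocator, A, B, C, M, N, K, 32, 4);
    // Each output element is a K-term dot product of ones and twos: K * 2 = 128.
    for (C) |c| try std.testing.expectApproxEqAbs(@as(f32, 128.0), c, 1e-3);
}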
const ThreadContext = struct {
A: []const f32,
B: []const f32,
C: []f32,
M: usize,
N: usize,
K: usize,
T: usize,
V: usize,
aligned_T: usize,
tiles_M: usize,
tiles_N: usize,
total_tiles: usize,
shared_counter: *ThreadLocalData,
local_C: []f32,
};
fn workerThread(ctx: ThreadContext) void {
const max_tile_size = 256; // Maximum supported tile size for ARM
var A_local: [max_tile_size * max_tile_size]f32 align(16) = undefined;
var B_local: [max_tile_size * max_tile_size]f32 align(16) = undefined;
if (ctx.T > max_tile_size) {
@panic("Tile size exceeds maximum supported size");
}
while (true) {
const start_idx = ctx.shared_counter.current_index.fetchAdd(CHUNK_SIZE, .seq_cst);
if (start_idx >= ctx.total_tiles) break;
const end_idx = @min(start_idx + CHUNK_SIZE, ctx.total_tiles);
var idx = start_idx;
while (idx < end_idx) : (idx += 1) {
const i = idx / ctx.tiles_N;
const j = idx % ctx.tiles_N;
const i_start = i * ctx.T;
const j_start = j * ctx.T;
const i_end = @min(i_start + ctx.T, ctx.M);
const j_end = @min(j_start + ctx.T, ctx.N);
// Reset local_C
@memset(ctx.local_C, 0);
var k: usize = 0;
while (k < ctx.K) : (k += ctx.T) {
const k_end = @min(k + ctx.T, ctx.K);
if (builtin.cpu.arch == .aarch64) {
microKernelNEON(ctx, &A_local, &B_local, i_start, j_start, k, i_end, j_end, k_end);
} else {
microKernelScalar(ctx, &A_local, &B_local, i_start, j_start, k, i_end, j_end, k_end);
}
}
// Update global C using NEON vectorized accumulation when possible
for (i_start..i_end) |ii| {
const row_offset = ii * ctx.N;
const local_row = (ii - i_start) * ctx.aligned_T;
var j_idx = j_start;
while (j_idx + 4 <= j_end) : (j_idx += 4) {
if (builtin.cpu.arch == .aarch64) {
// Load 4 elements at once using NEON
const vec_idx = j_idx - j_start;
const c_vec = Vec4f{
ctx.local_C[local_row + vec_idx],
ctx.local_C[local_row + vec_idx + 1],
ctx.local_C[local_row + vec_idx + 2],
ctx.local_C[local_row + vec_idx + 3],
};
const dest_vec = Vec4f{
ctx.C[row_offset + j_idx],
ctx.C[row_offset + j_idx + 1],
ctx.C[row_offset + j_idx + 2],
ctx.C[row_offset + j_idx + 3],
};
const result = dest_vec + c_vec;
// Store result back
for (0..4) |offset| {
ctx.C[row_offset + j_idx + offset] = result[offset];
}
} else {
// Fallback scalar path
for (0..4) |offset| {
ctx.C[row_offset + j_idx + offset] +=
ctx.local_C[local_row + (j_idx - j_start) + offset];
}
}
}
// Handle remaining elements
while (j_idx < j_end) : (j_idx += 1) {
ctx.C[row_offset + j_idx] += ctx.local_C[local_row + (j_idx - j_start)];
}
}
}
}
}
fn microKernelScalar(
ctx: ThreadContext,
A_local: []f32,
B_local: []f32,
i_start: usize,
j_start: usize,
k_start: usize,
i_end: usize,
j_end: usize,
k_end: usize,
) void {
const k_size = k_end - k_start;
const i_size = i_end - i_start;
const j_size = j_end - j_start;
// Load tiles
for (0..i_size) |i| {
const src_idx = (i_start + i) * ctx.K + k_start;
const local_idx = i * ctx.aligned_T;
for (0..k_size) |k| {
A_local[local_idx + k] = ctx.A[src_idx + k];
}
}
for (0..k_size) |k| {
const src_idx = (k_start + k) * ctx.N + j_start;
const local_idx = k * ctx.aligned_T;
for (0..j_size) |j| {
B_local[local_idx + j] = ctx.B[src_idx + j];
}
}
// Compute using scalar operations
for (0..i_size) |i| {
for (0..j_size) |j| {
var sum: f32 = 0;
for (0..k_size) |k| {
sum = @mulAdd(f32, A_local[i * ctx.aligned_T + k], B_local[k * ctx.aligned_T + j], sum);
}
ctx.local_C[i * ctx.aligned_T + j] += sum;
}
}
}
fn microKernelNEON(
ctx: ThreadContext,
A_local: []f32,
B_local: []f32,
i_start: usize,
j_start: usize,
k_start: usize,
i_end: usize,
j_end: usize,
k_end: usize,
) void {
const k_size = k_end - k_start;
const i_size = i_end - i_start;
const j_size = j_end - j_start;
// Load tiles with NEON alignment
for (0..i_size) |i| {
const src_idx = (i_start + i) * ctx.K + k_start;
const local_idx = i * ctx.aligned_T;
var k: usize = 0;
// Vector load where possible
while (k + 4 <= k_size) : (k += 4) {
const vec = Vec4f{
ctx.A[src_idx + k],
ctx.A[src_idx + k + 1],
ctx.A[src_idx + k + 2],
ctx.A[src_idx + k + 3],
};
for (0..4) |offset| {
A_local[local_idx + k + offset] = vec[offset];
}
}
// Handle remaining elements
while (k < k_size) : (k += 1) {
A_local[local_idx + k] = ctx.A[src_idx + k];
}
}
// Load B tile with NEON alignment
for (0..k_size) |k| {
const src_idx = (k_start + k) * ctx.N + j_start;
const local_idx = k * ctx.aligned_T;
var j: usize = 0;
// Vector load where possible
while (j + 4 <= j_size) : (j += 4) {
const vec = Vec4f{
ctx.B[src_idx + j],
ctx.B[src_idx + j + 1],
ctx.B[src_idx + j + 2],
ctx.B[src_idx + j + 3],
};
for (0..4) |offset| {
B_local[local_idx + j + offset] = vec[offset];
}
}
// Handle remaining elements
while (j < j_size) : (j += 1) {
B_local[local_idx + j] = ctx.B[src_idx + j];
}
}
// Process 4x4 blocks using NEON
var i: usize = 0;
while (i + 4 <= i_size) : (i += 4) {
var j: usize = 0;
while (j + 4 <= j_size) : (j += 4) {
var acc: [4][4]f32 align(16) = [_][4]f32{[_]f32{0} ** 4} ** 4;
for (0..k_size) |k| {
const a_vec = Vec4f{
A_local[(i + 0) * ctx.aligned_T + k],
A_local[(i + 1) * ctx.aligned_T + k],
A_local[(i + 2) * ctx.aligned_T + k],
A_local[(i + 3) * ctx.aligned_T + k],
};
const b_vec = Vec4f{
B_local[k * ctx.aligned_T + j],
B_local[k * ctx.aligned_T + j + 1],
B_local[k * ctx.aligned_T + j + 2],
B_local[k * ctx.aligned_T + j + 3],
};
// NEON FMA operations
inline for (0..4) |bi| {
const a_broadcast: Vec4f = @splat(a_vec[bi]);
const c_vec = Vec4f{
acc[bi][0], acc[bi][1], acc[bi][2], acc[bi][3],
};
const prod = @mulAdd(Vec4f, a_broadcast, b_vec, c_vec);
inline for (0..4) |bj| {
acc[bi][bj] = prod[bj];
}
}
}
// Store accumulated results
for (0..4) |bi| {
for (0..4) |bj| {
ctx.local_C[(i + bi) * ctx.aligned_T + (j + bj)] += acc[bi][bj];
}
}
}
// Handle remaining columns
while (j < j_size) : (j += 1) {
for (0..4) |bi| {
var sum: f32 = 0;
for (0..k_size) |k| {
sum = @mulAdd(f32, A_local[(i + bi) * ctx.aligned_T + k], B_local[k * ctx.aligned_T + j], sum);
}
ctx.local_C[(i + bi) * ctx.aligned_T + j] += sum;
}
}
}
// Handle remaining rows
while (i < i_size) : (i += 1) {
for (0..j_size) |j| {
var sum: f32 = 0;
for (0..k_size) |k| {
sum = @mulAdd(f32, A_local[i * ctx.aligned_T + k], B_local[k * ctx.aligned_T + j], sum);
}
ctx.local_C[i * ctx.aligned_T + j] += sum;
}
}
}
const CpuInfo = struct {
total_cores: u32,
simd_width: u32,
supports_peak_calc: bool = false,
max_freq: ?u64 = null,
is_arm: bool,
};
fn getSIMDWidth() u32 {
if (builtin.cpu.arch == .aarch64) {
// ARM NEON typically works with 128-bit vectors (4 x float32)
return 4;
} else if (std.Target.x86.featureSetHas(builtin.cpu.features, .avx512f)) {
return 16;
} else if (std.Target.x86.featureSetHas(builtin.cpu.features, .avx2)) {
return 8;
} else if (std.Target.x86.featureSetHas(builtin.cpu.features, .sse2)) {
return 4;
} else {
return 1;
}
}
const SearchRange = struct {
t_min: usize,
t_max: usize,
t_step: usize,
v_sizes: []const usize,
};
pub const SearchResult = struct {
tile_size: usize,
vector_size: usize,
gflops: f64,
};
pub fn adaptiveGridSearch(
allocator: std.mem.Allocator,
M: usize,
N: usize,
K: usize,
initial_range: SearchRange,
refinement_levels: usize,
) !SearchResult {
var best_result = SearchResult{
.tile_size = 0,
.vector_size = 0,
.gflops = 0,
};
var current_range = initial_range;
for (0..refinement_levels) |level| {
std.debug.print("\nLevel {d} Search (T: {d}-{d}, step: {d})\n", .{ level, current_range.t_min, current_range.t_max, current_range.t_step });
std.debug.print("----------------------------------------\n", .{});
// Search current range
var results = std.ArrayList(SearchResult).init(allocator);
defer results.deinit();
var t = current_range.t_min;
while (t <= current_range.t_max) : (t += current_range.t_step) {
// Skip invalid tile sizes
if (t > M or t > N or t > K) continue;
for (current_range.v_sizes) |v| {
if (t % v != 0) continue; // Skip if tile size not divisible by vector size
// Run multiple trials and take best
var best_trial = try benchmarkMatMul(allocator, t, v, M, N, K);
for (0..2) |_| {
const trial = try benchmarkMatMul(allocator, t, v, M, N, K);
if (trial.gflops > best_trial.gflops) {
best_trial.gflops = trial.gflops;
}
}
try results.append(.{
.tile_size = t,
.vector_size = v,
.gflops = best_trial.gflops,
});
std.debug.print("T={d:<4} V={d:<3}: {d:.2} GFLOPS\n", .{ t, v, best_trial.gflops });
}
}
// Skip this level if no valid (T, V) combination was benchmarked
if (results.items.len == 0) continue;
// Find top performers
const top_k = 3; // Number of top results to consider
const top_results = try allocator.alloc(SearchResult, @min(top_k, results.items.len));
defer allocator.free(top_results);
// Sort results by GFLOPS (descending)
std.sort.pdq(SearchResult, results.items, {}, struct {
fn lessThan(_: void, a: SearchResult, b: SearchResult) bool {
return a.gflops > b.gflops;
}
}.lessThan);
// Copy top results
@memcpy(top_results, results.items[0..top_results.len]);
// Update best overall result
if (results.items[0].gflops > best_result.gflops) {
best_result = results.items[0];
}
// Print top performers for this level
std.debug.print("\nTop performers at level {d}:\n", .{level});
for (top_results, 0..) |result, i| {
std.debug.print("{d}. T={d:<4} V={d:<3}: {d:.2} GFLOPS\n", .{ i + 1, result.tile_size, result.vector_size, result.gflops });
}
// Refine search range around best result
if (level + 1 < refinement_levels) {
const best_T = top_results[0].tile_size;
const step = @max(current_range.t_step / 2, 1); // Never let the step collapse to zero
current_range = SearchRange{
.t_min = if (best_T > step * 2) best_T - step * 2 else current_range.t_min,
.t_max = @min(best_T + step * 2, current_range.t_max),
.t_step = step,
.v_sizes = current_range.v_sizes,
};
}
}
return best_result;
}
pub fn benchmarkMatMul(
allocator: std.mem.Allocator,
tile_size: usize,
vector_size: usize,
M: usize,
N: usize,
K: usize,
) !SearchResult {
const A = try allocator.alloc(f32, M * K);
defer allocator.free(A);
const B = try allocator.alloc(f32, K * N);
defer allocator.free(B);
const C = try allocator.alloc(f32, M * N);
defer allocator.free(C);
// Initialize matrices with random data
var rng = std.rand.DefaultPrng.init(0);
const random = rng.random();
for (A) |*val| val.* = random.float(f32);
for (B) |*val| val.* = random.float(f32);
@memset(C, 0);
// Warm up run
try tiledMatMul(allocator, A, B, C, M, N, K, tile_size, vector_size);
// Actual benchmark
const start_time = time.nanoTimestamp();
try tiledMatMul(allocator, A, B, C, M, N, K, tile_size, vector_size);
const end_time = time.nanoTimestamp();
const elapsed_ns: u64 = @intCast(end_time - start_time);
// Calculate GFLOPS
const operations = @as(f64, @floatFromInt(2 * M * N * K)); // 2 operations per multiply-add
const seconds = @as(f64, @floatFromInt(elapsed_ns)) / 1e9;
const gflops = operations / (seconds * 1e9);
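// Worked example: for M = N = K = 1024 the operation count is 2 * 1024^3 ≈ 2.15e9
// FLOPs, so a 25 ms run corresponds to roughly 2.15e9 / 0.025 / 1e9 ≈ 86 GFLOPS.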
return SearchResult{
.tile_size = tile_size,
.vector_size = vector_size,
.gflops = gflops,
};
}
fn getCpuFrequencies() !CpuInfo {
var info = CpuInfo{
.simd_width = getSIMDWidth(),
.total_cores = @intCast(try std.Thread.getCpuCount()),
.is_arm = (builtin.cpu.arch == .aarch64),
};
if (builtin.os.tag == .macos) {
// On macOS we can't easily query the CPU frequency, so use conservative
// per-architecture defaults.
if (info.is_arm) {
info.max_freq = 3_200_000_000; // 3.2 GHz default for M1/M2
} else {
info.max_freq = 3_000_000_000; // 3.0 GHz default for Intel
}
info.supports_peak_calc = true;
}
return info;
}
pub fn main() !void {
var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
defer arena.deinit();
const allocator = arena.allocator();
const sizes = [_][3]usize{
.{ 1024, 1024, 1024 },
// .{ 2048, 2048, 2048 },
// .{ 4096, 4096, 4096 },
};
const iterations = 5;
// Print system configuration
const cpu_info = try getCpuFrequencies();
std.debug.print("\nSystem Configuration:\n", .{});
std.debug.print("----------------------------------------\n", .{});
std.debug.print("Architecture: {s}\n", .{if (cpu_info.is_arm) "ARM" else "x86"});
std.debug.print("CPU Cores: {}\n", .{cpu_info.total_cores});
if (cpu_info.supports_peak_calc and cpu_info.max_freq != null) {
std.debug.print("CPU Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.max_freq.?)) / 1e9});
}
std.debug.print("SIMD Width: {}\n", .{cpu_info.simd_width});
std.debug.print("----------------------------------------\n\n", .{});
// Modified search range for ARM
const initial_range = SearchRange{
.t_min = 16, // Smaller minimum tile size for ARM
.t_max = 256, // Smaller maximum tile size due to different cache hierarchy
.t_step = 32,
.v_sizes = &[_]usize{4}, // Only use vector size 4 for NEON
};
for (sizes) |size| {
const M = size[0];
const N = size[1];
const K = size[2];
std.debug.print("\n========================================\n", .{});
std.debug.print("Matrix Size: {d}x{d}x{d}\n", .{ M, N, K });
std.debug.print("========================================\n", .{});
const best_config = try adaptiveGridSearch(allocator, M, N, K, initial_range, 4);
std.debug.print("\nBest configuration found:\n", .{});
std.debug.print("Tile Size (T): {d}\n", .{best_config.tile_size});
std.debug.print("Vector Size (V): {d}\n", .{best_config.vector_size});
std.debug.print("Performance: {d:.2} GFLOPS\n\n", .{best_config.gflops});
}
// Print summary
std.debug.print("\n========================================\n", .{});
std.debug.print("Benchmark Summary:\n", .{});
std.debug.print("----------------------------------------\n", .{});
std.debug.print("Total matrix sizes tested: {d}\n", .{sizes.len});
std.debug.print("Architecture: {s}\n", .{if (cpu_info.is_arm) "ARM (NEON)" else "x86"});
std.debug.print("Tree search levels: 4\n", .{});
std.debug.print("Iterations: {d}\n", .{iterations});
std.debug.print("========================================\n\n", .{});
}
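// ---------------------------------------------------------------------------
// Second file of the gist (apparently a standalone, fixed-parameter variant of
// the NEON matmul with its own benchmark harness and tests). The duplicated
// top-level declarations below are expected: this code is not meant to be
// compiled in the same file as the tuner above.
// ---------------------------------------------------------------------------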
const std = @import("std");
const builtin = @import("builtin");
const atomic = std.atomic;
///////////////////////////////////////////////////////////////////////////////
// Configuration Constants
///////////////////////////////////////////////////////////////////////////////
/// Tile size optimized for Apple Silicon cache hierarchy:
/// - L1D cache: 128KB (M1/M2/M3)
/// - L2 cache: 4MB-12MB (varies by model)
/// Larger tile size than Intel due to bigger L1 cache
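/// Note: a T x T f32 tile is T*T*4 bytes (256 KB at T = 256), so the three
/// per-thread scratch buffers (A_local, B_local, local_C) together occupy
/// roughly 768 KB and effectively target L2 rather than L1.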
const T: usize = 256;
/// Vector width for NEON SIMD (128 bits = 4 floats)
const V: usize = 4;
/// Size of CPU cache line in bytes
const CACHE_LINE_SIZE: usize = atomic.cache_line;
/// Number of tiles per atomic operation
const CHUNK_SIZE: usize = 1;
/// Alignment for NEON operations (128-bit = 16 bytes)
const NEON_ALIGNMENT = 16;
/// Vector type for 4 32-bit floats (NEON register size)
const Vec4f = @Vector(4, f32);
///////////////////////////////////////////////////////////////////////////////
// Thread Management Structures
///////////////////////////////////////////////////////////////////////////////
const ThreadLocalData = struct {
current_index: atomic.Value(usize) align(CACHE_LINE_SIZE),
_padding: [CACHE_LINE_SIZE - @sizeOf(atomic.Value(usize))]u8 = undefined,
};
const ThreadContext = struct {
A: []const f32,
B: []const f32,
C: []f32,
M: usize,
N: usize,
K: usize,
tiles_M: usize,
tiles_N: usize,
total_tiles: usize,
shared_counter: *ThreadLocalData,
};
///////////////////////////////////////////////////////////////////////////////
// Main Matrix Multiplication Function
///////////////////////////////////////////////////////////////////////////////
pub fn tiledMatMul(allocator: std.mem.Allocator, A: []const f32, B: []const f32, C: []f32, M: usize, N: usize, K: usize) !void {
const tiles_M = (M + T - 1) / T;
const tiles_N = (N + T - 1) / T;
const total_tiles = tiles_M * tiles_N;
var shared_data = ThreadLocalData{ .current_index = atomic.Value(usize).init(0) };
// Apple Silicon chips have high-performance and efficiency cores
// Use performance cores count for better results
const num_threads = @max(try std.Thread.getCpuCount() / 2, 1);
var thread_pool = try std.ArrayList(std.Thread).initCapacity(allocator, num_threads);
defer thread_pool.deinit();
for (0..num_threads) |_| {
const context = ThreadContext{
.A = A,
.B = B,
.C = C,
.M = M,
.N = N,
.K = K,
.tiles_M = tiles_M,
.tiles_N = tiles_N,
.total_tiles = total_tiles,
.shared_counter = &shared_data,
};
try thread_pool.append(try std.Thread.spawn(.{}, workerThread, .{context}));
}
for (thread_pool.items) |thread| thread.join();
}
///////////////////////////////////////////////////////////////////////////////
// Worker Thread Implementation
///////////////////////////////////////////////////////////////////////////////
fn workerThread(ctx: ThreadContext) void {
var local_C: [T][T]f32 align(NEON_ALIGNMENT) = undefined;
while (true) {
const start_idx = ctx.shared_counter.current_index.fetchAdd(CHUNK_SIZE, .seq_cst);
if (start_idx >= ctx.total_tiles) break;
const end_idx = @min(start_idx + CHUNK_SIZE, ctx.total_tiles);
var idx = start_idx;
while (idx < end_idx) : (idx += 1) {
const i = idx / ctx.tiles_N;
const j = idx % ctx.tiles_N;
const i_start = i * T;
const j_start = j * T;
const i_end = @min(i_start + T, ctx.M);
const j_end = @min(j_start + T, ctx.N);
for (0..T) |x| {
@memset(&local_C[x], 0);
}
var k: usize = 0;
while (k < ctx.K) : (k += T) {
const k_end = @min(k + T, ctx.K);
microKernelNEON(ctx, &local_C, i_start, j_start, k, i_end, j_end, k_end);
}
// Write results back using NEON vectorized stores
for (i_start..i_end) |ii| {
const row_offset = ii * ctx.N;
const local_row = ii - i_start;
var j_idx = j_start;
// Process 4 elements at a time using NEON
while (j_idx + 4 <= j_end) : (j_idx += 4) {
const vec_idx = j_idx - j_start;
// Load 4 elements into NEON vector
const c_vec = Vec4f{
local_C[local_row][vec_idx],
local_C[local_row][vec_idx + 1],
local_C[local_row][vec_idx + 2],
local_C[local_row][vec_idx + 3],
};
// Load destination elements
const dest_vec = Vec4f{
ctx.C[row_offset + j_idx],
ctx.C[row_offset + j_idx + 1],
ctx.C[row_offset + j_idx + 2],
ctx.C[row_offset + j_idx + 3],
};
// Add vectors and store results
const result = dest_vec + c_vec;
for (0..4) |offset| {
ctx.C[row_offset + j_idx + offset] = result[offset];
}
}
// Handle remaining elements
while (j_idx < j_end) : (j_idx += 1) {
ctx.C[row_offset + j_idx] += local_C[local_row][j_idx - j_start];
}
}
}
}
}
///////////////////////////////////////////////////////////////////////////////
// Micro-Kernel Implementation
///////////////////////////////////////////////////////////////////////////////
fn microKernelNEON(
ctx: ThreadContext,
local_C: *[T][T]f32,
i_start: usize,
j_start: usize,
k_start: usize,
i_end: usize,
j_end: usize,
k_end: usize,
) void {
var A_local: [T][T]f32 align(NEON_ALIGNMENT) = undefined;
var B_local: [T][T]f32 align(NEON_ALIGNMENT) = undefined;
const k_size = k_end - k_start;
const i_size = i_end - i_start;
const j_size = j_end - j_start;
// Load tiles with prefetching
for (0..i_size) |i| {
const src_idx = (i_start + i) * ctx.K + k_start;
for (0..k_size) |k| {
A_local[i][k] = ctx.A[src_idx + k];
}
}
for (0..k_size) |k| {
const src_idx = (k_start + k) * ctx.N + j_start;
for (0..j_size) |j| {
B_local[k][j] = ctx.B[src_idx + j];
}
}
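// Register-blocking scheme: for each k, the 4x1 column a_vec and the 1x4 row
// b_vec form a rank-1 (outer-product) update, acc[bi][bj] += a_vec[bi] * b_vec[bj],
// accumulated with fused multiply-adds over the whole k range before the 4x4
// block is flushed to local_C.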
// Process 4x4 blocks using NEON SIMD
var i: usize = 0;
while (i + 4 <= i_size) : (i += 4) {
var j: usize = 0;
while (j + 4 <= j_size) : (j += 4) {
var acc: [4][4]f32 align(NEON_ALIGNMENT) = [_][4]f32{[_]f32{0} ** 4} ** 4;
// Inner loop with NEON operations
var k: usize = 0;
while (k < k_size) : (k += 1) {
// Load 4 elements from A matrix
const a_vec = Vec4f{
A_local[i][k],
A_local[i + 1][k],
A_local[i + 2][k],
A_local[i + 3][k],
};
// Load 4 elements from B matrix
const b_vec = Vec4f{
B_local[k][j],
B_local[k][j + 1],
B_local[k][j + 2],
B_local[k][j + 3],
};
// Vectorized outer product
inline for (0..4) |bi| {
const a_broadcast: Vec4f = @splat(a_vec[bi]);
const c_vec = Vec4f{
acc[bi][0],
acc[bi][1],
acc[bi][2],
acc[bi][3],
};
const prod = @mulAdd(Vec4f, a_broadcast, b_vec, c_vec);
inline for (0..4) |bj| {
acc[bi][bj] = prod[bj];
}
}
}
// Store results
for (0..4) |bi| {
for (0..4) |bj| {
local_C[i + bi][j + bj] += acc[bi][bj];
}
}
}
// Handle edge case columns
while (j < j_size) : (j += 1) {
for (0..4) |bi| {
var sum: f32 = 0;
for (0..k_size) |k| {
sum = @mulAdd(f32, A_local[i + bi][k], B_local[k][j], sum);
}
local_C[i + bi][j] += sum;
}
}
}
// Handle edge case rows
while (i < i_size) : (i += 1) {
for (0..j_size) |j| {
var sum: f32 = 0;
for (0..k_size) |k| {
sum = @mulAdd(f32, A_local[i][k], B_local[k][j], sum);
}
local_C[i][j] += sum;
}
}
}
/// ------ BENCHMARKING AND PERFORMANCE ANALYSIS ------ ///
const CpuInfo = struct {
p_freq: ?u64 = null,
e_freq: ?u64 = null,
p_cores: ?u32 = null,
e_cores: ?u32 = null,
simd_width: u32,
total_cores: u32,
supports_peak_calc: bool,
};
fn getSIMDWidth() u32 {
if (builtin.cpu.arch == .aarch64) {
// ARM NEON works on 128-bit vectors (4 x f32); x86 feature queries below
// are meaningless on aarch64
return 4;
} else if (std.Target.x86.featureSetHas(builtin.cpu.features, .avx512f)) {
return 16;
} else if (std.Target.x86.featureSetHas(builtin.cpu.features, .avx2)) {
return 8;
} else if (std.Target.x86.featureSetHas(builtin.cpu.features, .sse2)) {
return 4;
} else {
return 1;
}
}
fn getCpuFrequencies() !CpuInfo {
var info = CpuInfo{
.simd_width = getSIMDWidth(),
.total_cores = @intCast(try std.Thread.getCpuCount()),
.supports_peak_calc = false,
};
switch (builtin.os.tag) {
.linux => {
// Try reading hybrid architecture core mapping
if (std.fs.openFileAbsolute("/sys/devices/cpu_core/cpus", .{})) |p_core_file| {
defer p_core_file.close();
var buf: [32]u8 = undefined;
if (p_core_file.reader().readAll(&buf)) |bytes_read| {
const p_cores_str = buf[0..bytes_read];
// Count cores from range (e.g. "0-7" means 8 threads)
if (std.mem.indexOf(u8, p_cores_str, "-")) |dash_idx| {
const end_thread = try std.fmt.parseInt(u32, std.mem.trim(u8, p_cores_str[dash_idx + 1 ..], " \n\r"), 10);
const start_thread = try std.fmt.parseInt(u32, p_cores_str[0..dash_idx], 10);
info.p_cores = @divFloor(end_thread - start_thread + 1, 2); // Divide by 2 for hyperthreading
}
} else |_| {}
} else |_| {}
if (std.fs.openFileAbsolute("/sys/devices/cpu_atom/cpus", .{})) |e_core_file| {
defer e_core_file.close();
var buf: [32]u8 = undefined;
if (e_core_file.reader().readAll(&buf)) |bytes_read| {
const e_cores_str = buf[0..bytes_read];
// Count cores from range (e.g. "8-15" means 8 threads)
if (std.mem.indexOf(u8, e_cores_str, "-")) |dash_idx| {
const end_thread = try std.fmt.parseInt(u32, std.mem.trim(u8, e_cores_str[dash_idx + 1 ..], " \n\r"), 10);
const start_thread = try std.fmt.parseInt(u32, e_cores_str[0..dash_idx], 10);
info.e_cores = end_thread - start_thread + 1; // E-cores don't have hyperthreading
}
} else |_| {}
} else |_| {}
// Try reading frequencies for P-cores
if (info.p_cores != null) {
if (std.fs.openFileAbsolute("/sys/devices/cpu_core/frequency", .{})) |freq_file| {
defer freq_file.close();
var buf: [32]u8 = undefined;
if (freq_file.reader().readAll(&buf)) |bytes_read| {
if (std.fmt.parseInt(u64, std.mem.trim(u8, buf[0..bytes_read], " \n\r"), 10)) |freq| {
info.p_freq = freq * 1000; // Convert kHz to Hz
} else |_| {
info.p_freq = 4_500_000_000; // Default 4.5 GHz for P-cores
}
} else |_| {}
} else |_| {
info.p_freq = 4_500_000_000; // Default 4.5 GHz for P-cores
}
}
// Try reading frequencies for E-cores
if (info.e_cores != null) {
if (std.fs.openFileAbsolute("/sys/devices/cpu_atom/frequency", .{})) |freq_file| {
defer freq_file.close();
var buf: [32]u8 = undefined;
if (freq_file.reader().readAll(&buf)) |bytes_read| {
if (std.fmt.parseInt(u64, std.mem.trim(u8, buf[0..bytes_read], " \n\r"), 10)) |freq| {
info.e_freq = freq * 1000; // Convert kHz to Hz
} else |_| {
info.e_freq = 3_500_000_000; // Default 3.5 GHz for E-cores
}
} else |_| {}
} else |_| {
info.e_freq = 3_500_000_000; // Default 3.5 GHz for E-cores
}
}
// If we detected any cores, we can do peak calculation
info.supports_peak_calc = (info.p_cores != null and info.p_freq != null) or
(info.e_cores != null and info.e_freq != null);
// If we couldn't detect anything, fall back to treating all cores as P-cores
if (!info.supports_peak_calc) {
info.p_cores = info.total_cores;
info.e_cores = 0;
info.p_freq = 3_500_000_000; // Conservative 3.5 GHz default
info.supports_peak_calc = true;
std.debug.print("Warning: Could not detect hybrid architecture. Assuming all P-cores at 3.5 GHz\n", .{});
}
},
.windows, .macos => {
info.p_cores = info.total_cores;
info.e_cores = 0;
info.supports_peak_calc = false;
},
else => {
@compileError("Unsupported operating system");
},
}
return info;
}
fn calculateTheoreticalPeak(info: CpuInfo) ?f64 {
if (!info.supports_peak_calc) {
return null;
}
const p_core_gflops = if (info.p_freq != null and info.p_cores != null)
(@as(f64, @floatFromInt(info.p_freq.?)) / 1e9) *
@as(f64, @floatFromInt(info.p_cores.? * info.simd_width * 2))
else
0;
const e_core_gflops = if (info.e_freq != null and info.e_cores != null)
(@as(f64, @floatFromInt(info.e_freq.?)) / 1e9) *
@as(f64, @floatFromInt(info.e_cores.? * info.simd_width * 2))
else
0;
return p_core_gflops + e_core_gflops;
}
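// Worked example with illustrative numbers: 8 P-cores at 4.5 GHz with an 8-wide
// FMA unit give 4.5 * 8 * 8 * 2 = 576 GFLOPS; detected E-cores contribute an
// additional e_freq(GHz) * e_cores * simd_width * 2 on top of that.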
pub fn calculatePreciseGflops(allocator: std.mem.Allocator, M: usize, N: usize, K: usize, iterations: usize) !void {
const A = try allocator.alloc(f32, M * K);
defer allocator.free(A);
const B = try allocator.alloc(f32, K * N);
defer allocator.free(B);
const C = try allocator.alloc(f32, M * N);
defer allocator.free(C);
// Initialize matrices
var rng = std.rand.DefaultPrng.init(@intCast(std.time.timestamp()));
const random = rng.random();
for (A) |*val| {
val.* = random.float(f32) * 2.0 - 1.0;
}
for (B) |*val| {
val.* = random.float(f32) * 2.0 - 1.0;
}
@memset(C, 0);
// Warmup run
try tiledMatMul(allocator, A, B, C, M, N, K);
// Track performance metrics
const ops_per_element = 2 * K - 1;
const total_elements = M * N;
const flops_per_iteration = total_elements * ops_per_element;
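// e.g. for K = 1024 each output element costs 1024 multiplies and 1023 adds,
// i.e. 2K - 1 = 2047 FLOPs per element (slightly below the common 2*K estimate).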
var metrics = try allocator.alloc(struct {
gflops: f64,
time_ms: f64,
}, iterations);
defer allocator.free(metrics);
// Main measurement loop
var timer = try std.time.Timer.start();
const total_start = timer.read();
for (0..iterations) |i| {
@memset(C, 0);
const iter_start = timer.read();
try tiledMatMul(allocator, A, B, C, M, N, K);
const iter_end = timer.read();
const iter_time_ns = iter_end - iter_start;
const iter_time_ms = @as(f64, @floatFromInt(iter_time_ns)) / 1e6;
const iter_gflops = @as(f64, @floatFromInt(flops_per_iteration)) / (@as(f64, @floatFromInt(iter_time_ns)) / 1e9) / 1e9;
metrics[i] = .{
.gflops = iter_gflops,
.time_ms = iter_time_ms,
};
}
// Calculate statistics
const total_time_ns = timer.read() - total_start;
const total_time_sec = @as(f64, @floatFromInt(total_time_ns)) / 1e9;
var min_gflops: f64 = metrics[0].gflops;
var max_gflops: f64 = metrics[0].gflops;
var total_gflops: f64 = 0;
var first_quarter_avg: f64 = 0;
var last_quarter_avg: f64 = 0;
const quarter_size = iterations / 4;
for (metrics, 0..) |m, i| {
min_gflops = @min(min_gflops, m.gflops);
max_gflops = @max(max_gflops, m.gflops);
total_gflops += m.gflops;
if (i < quarter_size) {
first_quarter_avg += m.gflops;
} else if (i >= iterations - quarter_size) {
last_quarter_avg += m.gflops;
}
}
first_quarter_avg /= @as(f64, @floatFromInt(quarter_size));
last_quarter_avg /= @as(f64, @floatFromInt(quarter_size));
const avg_gflops = total_gflops / @as(f64, @floatFromInt(iterations));
const perf_change_pct = ((last_quarter_avg - first_quarter_avg) / first_quarter_avg) * 100.0;
// Get CPU info and theoretical peak
const cpu_info = try getCpuFrequencies();
// Print detailed performance report
std.debug.print("\nPerformance Analysis for {d}x{d}x{d} Matrix:\n", .{ M, N, K });
std.debug.print("----------------------------------------\n", .{});
std.debug.print("Peak Performance: {d:.2} GFLOPS\n", .{max_gflops});
std.debug.print("Lowest Performance: {d:.2} GFLOPS\n", .{min_gflops});
std.debug.print("Average Performance: {d:.2} GFLOPS\n", .{avg_gflops});
std.debug.print("First Quarter Avg: {d:.2} GFLOPS\n", .{first_quarter_avg});
std.debug.print("Last Quarter Avg: {d:.2} GFLOPS\n", .{last_quarter_avg});
if (perf_change_pct >= 0) {
std.debug.print("Performance Change: {d:.1}% (improvement)\n", .{perf_change_pct});
} else {
std.debug.print("Performance Change: {d:.1}% (degradation)\n", .{perf_change_pct});
}
std.debug.print("Total Runtime: {d:.3} seconds\n", .{total_time_sec});
// CPU information and theoretical peak
std.debug.print("\nCPU Configuration:\n", .{});
std.debug.print("Total Cores: {d}\n", .{cpu_info.total_cores});
if (cpu_info.p_cores != null) {
std.debug.print("P-Cores: {d}\n", .{cpu_info.p_cores.?});
}
if (cpu_info.e_cores != null) {
std.debug.print("E-Cores: {d}\n", .{cpu_info.e_cores.?});
}
std.debug.print("SIMD Width: {d} floats\n", .{cpu_info.simd_width});
if (cpu_info.supports_peak_calc) {
if (cpu_info.p_freq != null) {
std.debug.print("P-Core Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.p_freq.?)) / 1e9});
}
if (cpu_info.e_freq != null) {
std.debug.print("E-Core Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.e_freq.?)) / 1e9});
}
if (calculateTheoreticalPeak(cpu_info)) |peak| {
std.debug.print("\nTheoretical Peak: {d:.2} GFLOPS\n", .{peak});
if (peak > 0) {
std.debug.print("Peak Efficiency: {d:.1}% of theoretical\n", .{(max_gflops / peak) * 100.0});
}
}
} else {
std.debug.print("\nNote: Theoretical peak calculation not supported on this OS\n", .{});
}
// Print performance distribution
std.debug.print("\nPerformance Distribution:\n", .{});
const distribution_width = 60;
for (metrics) |m| {
const normalized = (m.gflops - min_gflops) / (max_gflops - min_gflops);
const stars = @as(usize, @intFromFloat(normalized * @as(f64, @floatFromInt(distribution_width))));
std.debug.print("{d:.2} GFLOPS |", .{m.gflops});
for (0..stars) |_| {
std.debug.print("*", .{});
}
std.debug.print("\n", .{});
}
}
pub fn main() !void {
var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
defer arena.deinit();
const allocator = arena.allocator();
const sizes = [_][3]usize{
// Comment out sizes to shorten the run
.{ 256, 256, 256 },
.{ 512, 512, 512 },
.{ 1024, 1024, 1024 },
.{ 1024, 2048, 1024 },
.{ 1500, 1500, 1500 },
.{ 2048, 2048, 2048 },
};
const iterations = 10;
// Print system configuration
const cpu_info = try getCpuFrequencies();
std.debug.print("\nSystem Configuration:\n", .{});
std.debug.print("----------------------------------------\n", .{});
std.debug.print("Tile size (T): {}\n", .{T});
std.debug.print("Vector size (V): {}\n", .{V});
std.debug.print("CPU Cores: {}\n", .{cpu_info.total_cores});
if (cpu_info.supports_peak_calc) {
if (cpu_info.p_freq != null) {
std.debug.print("P-Core Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.p_freq.?)) / 1e9});
}
if (cpu_info.e_freq != null) {
std.debug.print("E-Core Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.e_freq.?)) / 1e9});
}
}
std.debug.print("AVX2 Vector Width: {}\n", .{@sizeOf(Vec4f) / @sizeOf(f32)});
std.debug.print("Iterations per test: {}\n", .{iterations});
std.debug.print("----------------------------------------\n\n", .{});
// Run benchmarks with increasing matrix sizes
for (sizes) |size| {
const M = size[0];
const N = size[1];
const K = size[2];
std.debug.print("\n========================================\n", .{});
std.debug.print("Running benchmark for {d}x{d}x{d} matrices\n", .{ M, N, K });
std.debug.print("========================================\n", .{});
try calculatePreciseGflops(allocator, M, N, K, iterations);
// Add a small delay between tests to let CPU cool down
std.time.sleep(1 * std.time.ns_per_s);
}
// Print summary
std.debug.print("\n========================================\n", .{});
std.debug.print("Benchmark Summary:\n", .{});
std.debug.print("----------------------------------------\n", .{});
std.debug.print("Total matrix sizes tested: {d}\n", .{sizes.len});
std.debug.print("Total iterations per size: {d}\n", .{iterations});
std.debug.print("AVX2 SIMD utilized: Yes\n", .{});
std.debug.print("Tiled algorithm: Yes ({}x{})\n", .{ T, T });
std.debug.print("========================================\n\n", .{});
}
// Additional test for edge cases
test "tiledMatMul edge cases" {
const allocator = std.testing.allocator;
// Test with sizes that are not multiples of tile size
const edge_sizes = [_][3]usize{
.{ T + 1, T - 1, T + 2 }, // Around tile size
.{ T * 2 - 1, T * 2 + 1, T * 2 }, // Around double tile size
.{ 1, T, T }, // Single row
.{ T, 1, T }, // Single column
.{ T, T, 1 }, // Single depth
};
for (edge_sizes) |size| {
const M = size[0];
const N = size[1];
const K = size[2];
const A = try allocator.alloc(f32, M * K);
defer allocator.free(A);
const B = try allocator.alloc(f32, K * N);
defer allocator.free(B);
const C = try allocator.alloc(f32, M * N);
defer allocator.free(C);
const C_ref = try allocator.alloc(f32, M * N);
defer allocator.free(C_ref);
// Initialize matrices
for (A, 0..) |*val, i| {
val.* = @floatFromInt(i % 7);
}
for (B, 0..) |*val, i| {
val.* = @floatFromInt((i + 1) % 5);
}
@memset(C, 0);
@memset(C_ref, 0);
// Perform multiplications
try tiledMatMul(allocator, A, B, C, M, N, K);
// Reference calculation
for (0..M) |i| {
for (0..N) |j| {
var sum: f32 = 0;
for (0..K) |k| {
sum += A[i * K + k] * B[k * N + j];
}
C_ref[i * N + j] = sum;
}
}
// Compare results with higher tolerance for edge cases
for (C, C_ref) |c, c_ref| {
try std.testing.expectApproxEqAbs(c_ref, c, 1e-4); // (expected, actual, tolerance)
}
std.debug.print("Edge case test passed for size: M={}, N={}, K={}\n", .{ M, N, K });
}
}
// Example usage
test "matrix multiplication" {
const allocator = std.testing.allocator;
const M: usize = 1024;
const N: usize = 1024;
const K: usize = 1024;
const A = try allocator.alloc(f32, M * K);
defer allocator.free(A);
const B = try allocator.alloc(f32, K * N);
defer allocator.free(B);
const C = try allocator.alloc(f32, M * N);
defer allocator.free(C);
// Initialize matrices
for (0..M * K) |i| {
A[i] = @floatFromInt(i % 7);
}
for (0..K * N) |i| {
B[i] = @floatFromInt(i % 5);
}
@memset(C, 0);
try tiledMatMul(allocator, A, B, C, M, N, K);
}
// Test function
test "tiledMatMul correctness" {
const allocator = std.testing.allocator;
const test_sizes = [_][3]usize{
.{ 128, 128, 128 },
.{ 100, 100, 100 },
.{ 200, 150, 175 },
.{ 32, 64, 48 },
.{ 47, 34, 45 },
};
for (test_sizes) |size| {
const M = size[0];
const N = size[1];
const K = size[2];
const A = try allocator.alloc(f32, M * K);
defer allocator.free(A);
const B = try allocator.alloc(f32, K * N);
defer allocator.free(B);
const C = try allocator.alloc(f32, M * N);
defer allocator.free(C);
const C_ref = try allocator.alloc(f32, M * N);
defer allocator.free(C_ref);
// Initialize matrices
for (A, 0..) |*val, i| {
val.* = @floatFromInt(i % 10);
}
for (B, 0..) |*val, i| {
val.* = @floatFromInt((i + 1) % 10);
}
@memset(C, 0);
@memset(C_ref, 0);
// Perform tiled matrix multiplication
try tiledMatMul(allocator, A, B, C, M, N, K);
// Perform reference matrix multiplication
for (0..M) |i| {
for (0..N) |j| {
var sum: f32 = 0;
for (0..K) |k| {
sum += A[i * K + k] * B[k * N + j];
}
C_ref[i * N + j] = sum;
}
}
// Compare results
for (C, C_ref) |c, c_ref| {
try std.testing.expectApproxEqAbs(c_ref, c, 1e-5); // (expected, actual, tolerance)
}
std.debug.print("Test passed for size: M={}, N={}, K={}\n", .{ M, N, K });
}
}