corporatepiyush · October 16, 2025 08:11
diff --git a/blas.dart b/blas.dart
 // blas.dart
 import 'dart:ffi' as ffi;
 import 'dart:typed_data';
 import 'package:ffi/ffi.dart'; // Need this package!

 // Define the cblas_sgemm signature
 typedef CblasSgemmNative =
    ffi.Void Function(
      ffi.Int32 Order,
      ffi.Int32 TransA,
      ffi.Int32 TransB,
      ffi.Int32 M,
      ffi.Int32 N,
      ffi.Int32 K,
      ffi.Float alpha,
      ffi.Pointer<ffi.Float> A,
      ffi.Int32 lda,
      ffi.Pointer<ffi.Float> B,
      ffi.Int32 ldb,
      ffi.Float beta,
      ffi.Pointer<ffi.Float> C,
      ffi.Int32 ldc,
    );

 typedef CblasSgemmDart =
    void Function(
      int Order,
      int TransA,
      int TransB,
      int M,
      int N,
      int K,
      double alpha,
      ffi.Pointer<ffi.Float> A,
      int lda,
      ffi.Pointer<ffi.Float> B,
      int ldb,
      double beta,
      ffi.Pointer<ffi.Float> C,
      int ldc,
    );

 void main() {
  // Load Accelerate framework
  final accelerate = ffi.DynamicLibrary.open(
    '/System/Library/Frameworks/Accelerate.framework/Accelerate',
  );

  // Lookup cblas_sgemm
  final cblas_sgemm = accelerate
      .lookup<ffi.NativeFunction<CblasSgemmNative>>('cblas_sgemm')
      .asFunction<CblasSgemmDart>();

  const n = 1024;
  const size = n * n;

  // Allocate memory using calloc (from package:ffi)
  final A = calloc<ffi.Float>(size);
  final B = calloc<ffi.Float>(size);
  final C = calloc<ffi.Float>(size);

  // Fill with random data
  for (int i = 0; i < size; i++) {
    A[i] = 0.5;
    B[i] = 0.5;
  }

  final stopwatch = Stopwatch()..start();

  cblas_sgemm(
    101, // CblasRowMajor
    111, // CblasNoTrans
    111, // CblasNoTrans
    n,
    n,
    n,
    1.0,
    A,
    n,
    B,
    n,
    0.0,
    C,
    n,
  );

  stopwatch.stop();
  print(
    'Dart + Accelerate: ${(stopwatch.elapsedMicroseconds / 1000000).toStringAsFixed(3)}s',
  );

  // Cleanup
  calloc.free(A);
  calloc.free(B);
  calloc.free(C);
 }
diff --git a/blas.go b/blas.go
 package main

 /*
 #cgo LDFLAGS: -framework Accelerate
 #include <Accelerate/Accelerate.h>
 */
 import "C"
 import (
 	"fmt"
 	"math/rand"
 	"time"
 	"unsafe"
 )

 func multiplyBLAS(A, B, C []float32, n int) {
 	// Convert Go slices to C pointers
 	pA := (*C.float)(unsafe.Pointer(&A[0]))
 	pB := (*C.float)(unsafe.Pointer(&B[0]))
 	pC := (*C.float)(unsafe.Pointer(&C[0]))

 	// Call cblas_sgemm from Accelerate
 	C.cblas_sgemm(
 		C.CblasRowMajor,
 		C.CblasNoTrans,
 		C.CblasNoTrans,
 		C.int(n),
 		C.int(n),
 		C.int(n),
 		1.0,
 		pA,
 		C.int(n),
 		pB,
 		C.int(n),
 		0.0,
 		pC,
 		C.int(n),
 	)
 }

 func main() {
 	n := 1024

 	A := make([]float32, n*n)
 	B := make([]float32, n*n)
 	C := make([]float32, n*n)

 	// Fill with random data
 	for i := range A {
 		A[i] = rand.Float32()
 		B[i] = rand.Float32()
 	}

 	start := time.Now()
 	multiplyBLAS(A, B, C, n)
 	elapsed := time.Since(start)

 	fmt.Printf("Go + Accelerate: %.3fs\n", elapsed.Seconds())
 }
diff --git a/blas.js b/blas.js
 import { dlopen, FFIType, suffix } from 'bun:ffi';

 // Load Accelerate framework
 const lib = dlopen('/System/Library/Frameworks/Accelerate.framework/Accelerate', {
  cblas_sgemm: {
    args: [
      FFIType.i32, // Order
      FFIType.i32, // TransA
      FFIType.i32, // TransB
      FFIType.i32, // M
      FFIType.i32, // N
      FFIType.i32, // K
      FFIType.f32, // alpha
      FFIType.ptr, // A
      FFIType.i32, // lda
      FFIType.ptr, // B
      FFIType.i32, // ldb
      FFIType.f32, // beta
      FFIType.ptr, // C
      FFIType.i32, // ldc
    ],
    returns: FFIType.void,
  },
 });

 function multiplyBLAS(n) {
  const size = n * n;

  // Allocate aligned memory
  const A = new Float32Array(size);
  const B = new Float32Array(size);
  const C = new Float32Array(size);

  // Fill with random data
  for (let i = 0; i < size; i++) {
    A[i] = Math.random();
    B[i] = Math.random();
  }

  const start = performance.now();

  lib.symbols.cblas_sgemm(
    101, // CblasRowMajor
    111, // CblasNoTrans
    111, // CblasNoTrans
    n,
    n,
    n,
    1.0,
    A,
    n,
    B,
    n,
    0.0,
    C,
    n,
  );

  const end = performance.now();
  console.log(`Bun + Accelerate: ${((end - start) / 1000).toFixed(3)}s`);
 }

 multiplyBLAS(1024);
diff --git a/Matmul.c b/Matmul.c
 #include <stdio.h>
 #include <stdlib.h>
 #include <time.h>
 #include <math.h> 
 // REQUIRED for the M1/Apple Silicon native BLAS implementation (Accelerate Framework)
 // gcc -O3 -Wall -Wextra Matmul.c -o Matmul -lm -framework Accelerate

 #include "cblas.h" 

 // --- Utility Functions ---

 /**
 * @brief Finds the minimum of two integers.
 */
 int min(int a, int b) {
    return (a < b) ? a : b;
 }

 /**
 * @brief Generates a flat, row-major N x M matrix with random floats (0.0 to 1.0).
 * @param rows The number of rows (N).
 * @param cols The number of columns (M).
 * @return A pointer to the allocated float array. Must be freed by the caller.
 */
 float *generate_random_matrix(int rows, int cols) {
    float *matrix = (float *)malloc(rows * cols * sizeof(float));
    if (matrix == NULL) {
        perror("Memory allocation failed for matrix");
        exit(EXIT_FAILURE);
    }
    
    for (int i = 0; i < rows * cols; i++) {
        // Generate random float between 0.0 and 1.0
        matrix[i] = (float)rand() / (float)RAND_MAX;
    }
    return matrix;
 }

 /**
 * @brief Creates the transpose of a matrix (B_T from B).
 * @param B The original K x M matrix.
 * @param K, M The dimensions of B.
 * @return A pointer to the allocated M x K transposed float array (B_T).
 */
 float *transpose_matrix(const float *B, int K, int M) {
    float *B_T = (float *)malloc(M * K * sizeof(float));
    if (B_T == NULL) {
        perror("Memory allocation failed for transposed matrix");
        exit(EXIT_FAILURE);
    }

    for (int k = 0; k < K; k++) {
        for (int m = 0; m < M; m++) {
            // B[k][m] goes to B_T[m][k]
            B_T[m * K + k] = B[k * M + m];
        }
    }
    return B_T;
 }


 // --- Matrix Multiplication Versions ---

 // Define the function pointer type for matrix multiplication.
 // Use '__restrict' keyword for all pointers to assist the M1's compiler (clang).
 typedef void (*mm_func)(const float *__restrict, const float *__restrict, float *__restrict, int, int, int, int);

 /**
 * @brief Version 1: Naive (ijk order) matrix multiplication.
 */
 void multiply_naive_ijk(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int tile_size) {
    (void)tile_size; // Suppress unused parameter warning
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < M; j++) {
            float sum = 0.0f;
            for (int k = 0; k < K; k++) {
                // A[i][k] * B[k][j]. B access is column-major (cache-unfriendly)
                sum += A[i * K + k] * B[k * M + j];
            }
            C[i * M + j] = sum;
        }
    }
 }

 /**
 * @brief Version 2: Reordered (ikj order - cache friendly for A and C).
 */
 void multiply_reordered_ikj(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int tile_size) {
    (void)tile_size; // Suppress unused parameter warning
    for (int i = 0; i < N; i++) {
        int c_row_offset = i * M;
        for (int k = 0; k < K; k++) {
            // Load A[i][k] once per inner loop
            float a_val = A[i * K + k];
            int b_row_offset = k * M;
            
            // Inner loop iterates over B row and C row (cache-friendly strides)
            for (int j = 0; j < M; j++) {
                C[c_row_offset + j] += a_val * B[b_row_offset + j];
            }
        }
    }
 }

 /**
 * @brief Version 3: Tiled/Blocked matrix multiplication (ikj order inside blocks).
 * Exploits L1/L2 cache locality with block size.
 * @param TILE_SIZE The size of the square tile to use.
 */
 void multiply_tiled(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int TILE_SIZE) {
    for (int i_tile = 0; i_tile < N; i_tile += TILE_SIZE) {
        for (int j_tile = 0; j_tile < M; j_tile += TILE_SIZE) {
            for (int k_tile = 0; k_tile < K; k_tile += TILE_SIZE) {
                
                int i_end = min(i_tile + TILE_SIZE, N);
                int j_end = min(j_tile + TILE_SIZE, M);
                int k_end = min(k_tile + TILE_SIZE, K);

                for (int i = i_tile; i < i_end; i++) {
                    int c_row_offset = i * M;
                    for (int k = k_tile; k < k_end; k++) {
                        float a_val = A[i * K + k];
                        int b_row_offset = k * M;
                        for (int j = j_tile; j < j_end; j++) {
                            C[c_row_offset + j] += a_val * B[b_row_offset + j];
                        }
                    }
                }
            }
        }
    }
 }

 /**
 * @brief Version 4: Transposed Dot Product (A * B_T). M1 Optimized.
 * This function expects B to be pre-transposed and K/M to be swapped for B_T.
 * @param A Matrix N x K.
 * @param B_T Matrix M x K (transposed B).
 * @param C Matrix N x M.
 * @param N, K, M Dimensions.
 */
 void multiply_transposed_dot(const float *__restrict A, const float *__restrict B_T, float *__restrict C, int N, int K, int M, int tile_size) {
    (void)tile_size; // Suppress unused parameter warning
    // This turns the inner loop into a highly cache-friendly dot product of:
    // Row i of A (length K) and Row j of B_T (which is Column j of B, length K)
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < M; j++) {
            float sum = 0.0f;
            int a_offset = i * K;
            int b_t_offset = j * K;
            
            // Inner loop iterates over K (Dot Product) - all accesses are sequential
            for (int k = 0; k < K; k++) {
                // A[i][k] * B_T[j][k]
                sum += A[a_offset + k] * B_T[b_t_offset + k];
            }
            C[i * M + j] = sum;
        }
    }
 }

 /**
 * @brief Version 5: Transposed Dot Product with Loop Unrolling (Factor 4).
 * Unrolls the innermost dot product loop (k-loop) by a factor of 4 to reduce
 * loop overhead and potentially expose more ILP (Instruction-Level Parallelism).
 * This is the version you requested to include!
 */
 void multiply_transposed_unrolled(const float *__restrict A, const float *__restrict B_T, float *__restrict C, int N, int K, int M, int tile_size) {
    (void)tile_size; // Suppress unused parameter warning
    
    // K_mod is the number of iterations that can be unrolled (must be a multiple of 4)
    int K_mod = K - (K % 4); 

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < M; j++) {
            float sum = 0.0f;
            int a_offset = i * K;
            int b_t_offset = j * K;
            
            int k = 0;
            
            // 1. Unrolled loop (k = 0 to K_mod - 1)
            for (; k < K_mod; k += 4) {
                // Calculate four multiplications and accumulations in one loop iteration
                sum += A[a_offset + k]     * B_T[b_t_offset + k];
                sum += A[a_offset + k + 1] * B_T[b_t_offset + k + 1];
                sum += A[a_offset + k + 2] * B_T[b_t_offset + k + 2];
                sum += A[a_offset + k + 3] * B_T[b_t_offset + k + 3];
            }
            
            // 2. Cleanup loop (k = K_mod to K - 1) for remaining elements
            for (; k < K; k++) {
                sum += A[a_offset + k] * B_T[b_t_offset + k];
            }
            
            C[i * M + j] = sum;
        }
    }
 }

 /**
 * @brief Version 6: Apple Accelerate/BLAS Optimized Matmul (cblas_sgemm).
 * This is the vendor-optimized method for peak performance on M1/M2/M3 chips.
 * This function performs C = alpha * A * B + beta * C.
 */
 void multiply_cblas_sgemm(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int tile_size) {
    (void)tile_size; // Suppress unused parameter warning
    
    const float alpha = 1.0f; // Scale factor for A * B
    const float beta = 0.0f;  // Scale factor for C (0.0 since C is calloc'd)

    // CblasRowMajor: Matrices A, B, and C are stored in row-major order (C-style)
    // CblasNoTrans: Neither A nor B is transposed
    cblas_sgemm(CblasRowMajor, 
                CblasNoTrans, 
                CblasNoTrans,
                // M, N, K refer to the dimensions in the CblasRowMajor context:
                N,  // M: rows of A and C
                M,  // N: columns of B and C
                K,  // K: inner dimension (columns of A / rows of B)
                alpha, 
                A, 
                K,  // lda (leading dimension of A): K columns
                B, 
                M,  // ldb (leading dimension of B): M columns
                beta, 
                C, 
                M); // ldc (leading dimension of C): M columns
 }


 // --- Benchmark Runner ---

 /**
 * @brief Benchmarks a matrix multiplication function.
 * Runs 3 warmup iterations and 5 measured iterations, returning the best time.
 * @param name The name of the test.
 * @param func The matrix multiplication function to test.
 * @param A, B The input matrices.
 * @param N, K, M The dimensions.
 * @param tile_size The tile size to pass (used only by tiled version).
 * @return The best time measured in seconds.
 */
 double benchmark_runner(const char *name, mm_func func, const float *A, const float *B, int N, int K, int M, int tile_size) {
    
    // Silence 'unused parameter' warning, as printing is handled in main()
    (void)name; 
    
    // Warmup (3 times)
    for (int i = 0; i < 3; i++) {
        // calloc zeroes memory which is necessary for C += ... operations
        float *C_dummy = (float *)calloc(N * M, sizeof(float)); 
        if (C_dummy == NULL) {
             perror("Warmup memory allocation failed");
             exit(EXIT_FAILURE);
        }

        func(A, B, C_dummy, N, K, M, tile_size);
        
        free(C_dummy);
    }
    
    // Measure (take best of 5)
    double best_time = 0.0;
    
    for (int i = 0; i < 5; i++) {
        // Allocate and zero the result matrix C for each run
        float *C = (float *)calloc(N * M, sizeof(float));
         if (C == NULL) {
             perror("Measurement memory allocation failed");
             exit(EXIT_FAILURE);
        }
        
        // Use clock() for CPU time measurement
        clock_t start = clock();
        
        func(A, B, C, N, K, M, tile_size);
        
        clock_t end = clock();
        
        free(C);
        
        double elapsed_sec = (double)(end - start) / CLOCKS_PER_SEC;
        
        if (i == 0 || elapsed_sec < best_time) {
            best_time = elapsed_sec;
        }
    }
    
    return best_time;
 }

 // --- Main Program ---

 int main() {
    // Seed the random number generator once
    srand((unsigned int)time(NULL)); 

    printf("C17 Matrix Multiplication Benchmark (M1 Pro Optimized)\n");
    printf("====================================================\n\n");

    int sizes[] = {256, 512, 1024, 2048}; // Added 2048 to push the M1's cache limits
    int num_sizes = sizeof(sizes) / sizeof(sizes[0]);
    int TILE_SIZE = 48; // Adjusted tile size to be slightly more aggressive for L1/L2 cache

    for (int s = 0; s < num_sizes; s++) {
        int size = sizes[s];
        
        // For square matrices, N=K=M=size
        int N = size;
        int K = size;
        int M = size;

        printf("======================================================================\n");
        printf("Matrix size: %d x %d (Total Data: %.1f MB)\n", size, size, (double)(3.0 * size * size * sizeof(float)) / (1024.0 * 1024.0));
        printf("======================================================================\n");

        // Generate matrices A and B
        float *A = generate_random_matrix(N, K);
        float *B = generate_random_matrix(K, M);
        
        // Allocate B_T once for transposed benchmarks
        printf("\nPreprocessing (Transpose B)... ");
        float *B_T = transpose_matrix(B, K, M); 
        printf("Done.\n");

        printf("\n🔥 Warmup + Benchmark (3 warmup, 5 measured, best time):\n\n");

        double reordered_time = 0.0;
        double current_time = 0.0;
        double transposed_time = 0.0;
        double unrolled_time = 0.0;
        double cblas_time = 0.0;


        // 1. Naive (ijk) - Skip for large matrices
        if (size <= 512) {
             current_time = benchmark_runner("1. Naive (ijk)", multiply_naive_ijk, A, B, N, K, M, 0);
             printf("1. Naive (ijk):                   %.3fs\n", current_time);
        } else {
             printf("1. Naive (ijk):                   [skipped - too slow]\n");
        }


        // 2. Reordered (ikj) - BASELINE
        reordered_time = benchmark_runner("2. Reordered (ikj)", multiply_reordered_ikj, A, B, N, K, M, 0);
        printf("2. Reordered (ikj):               %.3fs\n", reordered_time);

        // 3. Tiled/Blocked
        current_time = benchmark_runner("3. Tiled", multiply_tiled, A, B, N, K, M, TILE_SIZE);
        printf("3. Tiled (%d):                    %.3fs (%.2fx)\n",
            TILE_SIZE, current_time, reordered_time / current_time);
        
        // 4. Transposed Dot Product (using B_T)
        transposed_time = benchmark_runner("4. Transposed Dot Product", multiply_transposed_dot, A, B_T, N, K, M, 0);
        printf("4. Transposed Dot Product:        %.3fs (%.2fx)\n",
            transposed_time, reordered_time / transposed_time);
            
        // 5. Transposed Dot Product with Unrolling (using B_T)
        unrolled_time = benchmark_runner("5. Transposed Unrolled Dot", multiply_transposed_unrolled, A, B_T, N, K, M, 0);
        printf("5. Transposed Unrolled Dot (x4):  %.3fs (%.2fx)\n", 
            unrolled_time, reordered_time / unrolled_time);
            
        // 6. M1 Native BLAS (cblas_sgemm)
        cblas_time = benchmark_runner("6. M1 Native BLAS (cblas_sgemm)", multiply_cblas_sgemm, A, B, N, K, M, 0);
        printf("6. M1 Native BLAS (cblas_sgemm):  %.3fs (%.2fx) <--- NATIVE SPEED\n", 
            cblas_time, reordered_time / cblas_time);


        // Clean up memory for the current size
        free(A);
        free(B);
        free(B_T); // Free the transposed matrix
        
        printf("\n");
    }
    
    printf("====================================================\n");
    printf("Benchmark Complete.\n");

    return 0;
 }
diff --git a/Matmul.dart b/Matmul.dart
 import 'dart:math';
 import 'dart:typed_data';
 import 'dart:io';

 // A record to hold the name and time of a benchmark result.
 typedef BenchmarkResult = ({String name, double time});

 /// Generates a matrix of a given size with random double values.
 List<List<double>> randomMatrix(int rows, int cols) {
  final random = Random();
  return List.generate(
    rows,
    (_) => List.generate(cols, (_) => random.nextDouble()),
  );
 }

 /// Version 1: Naive (ijk order) matrix multiplication.
 List<List<double>> naiveMultiply(List<List<double>> A, List<List<double>> B) {
  final n = A.length;
  final k = A[0].length;
  final m = B[0].length;
  final C = List.generate(n, (_) => List.filled(m, 0.0));

  for (int i = 0; i < n; i++) {
    for (int j = 0; j < m; j++) {
      double sum = 0.0;
      for (int kIdx = 0; kIdx < k; kIdx++) {
        sum += A[i][kIdx] * B[kIdx][j];
      }
      C[i][j] = sum;
    }
  }
  return C;
 }

 /// Version 2: Reordered (ikj order) for better cache performance.
 List<List<double>> reorderedMultiply(
  List<List<double>> A,
  List<List<double>> B,
 ) {
  final n = A.length;
  final k = A[0].length;
  final m = B[0].length;
  final C = List.generate(n, (_) => List.filled(m, 0.0));

  for (int i = 0; i < n; i++) {
    for (int kIdx = 0; kIdx < k; kIdx++) {
      final aVal = A[i][kIdx];
      for (int j = 0; j < m; j++) {
        C[i][j] += aVal * B[kIdx][j];
      }
    }
  }
  return C;
 }

 /// Version 3: Tiled/Blocked matrix multiplication.
 List<List<double>> tiledMultiply(
  List<List<double>> A,
  List<List<double>> B,
  int tileSize,
 ) {
  final n = A.length;
  final k = A[0].length;
  final m = B[0].length;
  final C = List.generate(n, (_) => List.filled(m, 0.0));

  for (int iTile = 0; iTile < n; iTile += tileSize) {
    for (int jTile = 0; jTile < m; jTile += tileSize) {
      for (int kTile = 0; kTile < k; kTile += tileSize) {
        final iEnd = min(iTile + tileSize, n);
        final jEnd = min(jTile + tileSize, m);
        final kEnd = min(kTile + tileSize, k);

        for (int i = iTile; i < iEnd; i++) {
          for (int kIdx = kTile; kIdx < kEnd; kIdx++) {
            final aVal = A[i][kIdx];
            for (int j = jTile; j < jEnd; j++) {
              C[i][j] += aVal * B[kIdx][j];
            }
          }
        }
      }
    }
  }
  return C;
 }

 /// Version 4: Manual loop unrolling by a factor of 4.
 List<List<double>> unrolled4Multiply(
  List<List<double>> A,
  List<List<double>> B,
 ) {
  final n = A.length;
  final k = A[0].length;
  final m = B[0].length;
  final C = List.generate(n, (_) => List.filled(m, 0.0));

  for (int i = 0; i < n; i++) {
    for (int kIdx = 0; kIdx < k; kIdx++) {
      final aVal = A[i][kIdx];
      int j = 0;

      // Process 4 elements at a time
      for (; j <= m - 4; j += 4) {
        C[i][j] += aVal * B[kIdx][j];
        C[i][j + 1] += aVal * B[kIdx][j + 1];
        C[i][j + 2] += aVal * B[kIdx][j + 2];
        C[i][j + 3] += aVal * B[kIdx][j + 3];
      }

      // Handle the remainder
      for (; j < m; j++) {
        C[i][j] += aVal * B[kIdx][j];
      }
    }
  }
  return C;
 }

 /// Version 5: Uses flat Float32List to avoid 2D list overhead.
 Float32List flatArrayMultiply(
  Float32List A,
  Float32List B,
  int n,
  int k,
  int m,
 ) {
  final C = Float32List(n * m); // Automatically initialized to 0

  for (int i = 0; i < n; i++) {
    for (int kIdx = 0; kIdx < k; kIdx++) {
      final aVal = A[i * k + kIdx];
      final bRowOffset = kIdx * m;
      final cRowOffset = i * m;

      for (int j = 0; j < m; j++) {
        C[cRowOffset + j] += aVal * B[bRowOffset + j];
      }
    }
  }
  return C;
 }

 /// Helper to convert a 2D list to a flat Float32List.
 Float32List toFlatList(List<List<double>> matrix) {
  final rows = matrix.length;
  if (rows == 0) return Float32List(0);
  final cols = matrix[0].length;
  final flat = Float32List(rows * cols);

  for (int i = 0; i < rows; i++) {
    for (int j = 0; j < cols; j++) {
      flat[i * cols + j] = matrix[i][j];
    }
  }
  return flat;
 }

 /// Runs a function multiple times to allow the JIT compiler to warm up.
 void warmupFunction(String name, void Function() fn) {
  print('  Warming up: $name...');
  const iterations = 10;
  for (int i = 0; i < iterations; i++) {
    fn();
  }
 }

 /// Benchmarks a function by running it multiple times and taking the minimum time.
 BenchmarkResult benchmark(String name, Function fn) {
  const iterations = 5;
  final times = <double>[];
  final stopwatch = Stopwatch();

  for (int i = 0; i < iterations; i++) {
    stopwatch.start();
    fn();
    stopwatch.stop();
    // Convert microseconds to seconds
    times.add(stopwatch.elapsedMicroseconds / 1000000.0);
    stopwatch.reset();
  }

  // Take the minimum time (best run = least interference)
  final time = times.reduce(min);

  return (name: name, time: time);
 }

 /// Main test runner with proper warmup.
 void runTests() {
  print('Dart Matrix Multiplication - PROPER WARMUP VERSION\n');
  print('Runtime: Dart ${Platform.version.split(' ').first}');
  print('Platform: ${Platform.operatingSystem}\n');

  final sizes = [256, 512, 1024];

  for (final size in sizes) {
    print('='.padRight(70, '='));
    final mem = (3 * size * size * 4) / (1024 * 1024);
    print('Matrix size: ${size} x ${size} (${mem.toStringAsFixed(1)} MB)');
    print('='.padRight(70, '='));

    final A = randomMatrix(size, size);
    final B = randomMatrix(size, size);
    final A_flat = toFlatList(A);
    final B_flat = toFlatList(B);

    print('\n🔥 WARMUP PHASE (10 iterations each):');

    // Warmup all functions
    if (size <= 512) {
      warmupFunction('naiveMultiply', () => naiveMultiply(A, B));
    }
    warmupFunction('reorderedMultiply', () => reorderedMultiply(A, B));
    warmupFunction('tiledMultiply', () => tiledMultiply(A, B, 64));
    warmupFunction('unrolled4Multiply', () => unrolled4Multiply(A, B));
    warmupFunction(
      'flatArrayMultiply',
      () => flatArrayMultiply(A_flat, B_flat, size, size, size),
    );

    print('\n⚡ BENCHMARKING (5 runs, taking best):');

    final results = <BenchmarkResult>[];

    // Test 1: Naive
    if (size <= 512) {
      final r1 = benchmark('Naive (ijk)', () => naiveMultiply(A, B));
      results.add(r1);
      print('1. Naive (ijk):         ${r1.time.toStringAsFixed(3)}s');
    } else {
      print('1. Naive (ijk):         [skipped - too slow]');
    }

    // Test 2: Reordered (Baseline)
    final r2 = benchmark('Reordered (ikj)', () => reorderedMultiply(A, B));
    results.add(r2);
    print('2. Reordered (ikj):     ${r2.time.toStringAsFixed(3)}s');
    final baseTime = r2.time;

    // Test 3: Tiled
    const tileSize = 64;
    final r3 = benchmark(
      'Tiled (${tileSize}x$tileSize)',
      () => tiledMultiply(A, B, tileSize),
    );
    results.add(r3);
    print(
      '3. Tiled ($tileSize):         ${r3.time.toStringAsFixed(3)}s (${(baseTime / r3.time).toStringAsFixed(2)}x)',
    );

    // Test 4: Unrolled
    final r4 = benchmark('Unrolled 4x', () => unrolled4Multiply(A, B));
    results.add(r4);
    print(
      '4. Unrolled 4x:         ${r4.time.toStringAsFixed(3)}s (${(baseTime / r4.time).toStringAsFixed(2)}x)',
    );

    // Test 5: Flat arrays (TypedData)
    final r5 = benchmark(
      'Flat Float32List',
      () => flatArrayMultiply(A_flat, B_flat, size, size, size),
    );
    results.add(r5);
    print(
      '5. Flat Float32List:    ${r5.time.toStringAsFixed(3)}s (${(baseTime / r5.time).toStringAsFixed(2)}x)',
    );

    // Find the best performing version
    final best = results.reduce((a, b) => a.time < b.time ? a : b);
    final baseline = results.firstWhere((r) => r.name == 'Reordered (ikj)');

    print('\n✨ Winner: ${best.name} (${best.time.toStringAsFixed(3)}s)');
    print(
      '📊 Speedup vs Reordered: ${(baseline.time / best.time).toStringAsFixed(2)}x\n',
    );
  }
 }

 void main() {
  runTests();
 }
diff --git a/Matmul.go b/Matmul.go
 package main

 import (
 	"fmt"
 	"math/rand"
 	"strings"
 	"time"
 )

 // Generate random matrix
 func randomMatrix(rows, cols int) [][]float32 {
 	matrix := make([][]float32, rows)
 	for i := range matrix {
 		matrix[i] = make([]float32, cols)
 		for j := range matrix[i] {
 			matrix[i][j] = rand.Float32()
 		}
 	}
 	return matrix
 }

 // Version 1: Naive (ijk order)
 func naiveMultiply(A, B [][]float32) [][]float32 {
 	n := len(A)
 	k := len(A[0])
 	m := len(B[0])
 	C := make([][]float32, n)
 	for i := range C {
 		C[i] = make([]float32, m)
 	}

 	for i := 0; i < n; i++ {
 		for j := 0; j < m; j++ {
 			var sum float32
 			for kIdx := 0; kIdx < k; kIdx++ {
 				sum += A[i][kIdx] * B[kIdx][j]
 			}
 			C[i][j] = sum
 		}
 	}
 	return C
 }

 // Version 2: Reordered (ikj order - cache friendly)
 func reorderedMultiply(A, B [][]float32) [][]float32 {
 	n := len(A)
 	k := len(A[0])
 	m := len(B[0])
 	C := make([][]float32, n)
 	for i := range C {
 		C[i] = make([]float32, m)
 	}

 	for i := 0; i < n; i++ {
 		for kIdx := 0; kIdx < k; kIdx++ {
 			aVal := A[i][kIdx]
 			for j := 0; j < m; j++ {
 				C[i][j] += aVal * B[kIdx][j]
 			}
 		}
 	}
 	return C
 }

 // Version 3: Tiled/Blocked
 func tiledMultiply(A, B [][]float32, tileSize int) [][]float32 {
 	n := len(A)
 	k := len(A[0])
 	m := len(B[0])
 	C := make([][]float32, n)
 	for i := range C {
 		C[i] = make([]float32, m)
 	}

 	for iTile := 0; iTile < n; iTile += tileSize {
 		for jTile := 0; jTile < m; jTile += tileSize {
 			for kTile := 0; kTile < k; kTile += tileSize {

 				iEnd := min(iTile+tileSize, n)
 				jEnd := min(jTile+tileSize, m)
 				kEnd := min(kTile+tileSize, k)

 				for i := iTile; i < iEnd; i++ {
 					for kIdx := kTile; kIdx < kEnd; kIdx++ {
 						aVal := A[i][kIdx]
 						for j := jTile; j < jEnd; j++ {
 							C[i][j] += aVal * B[kIdx][j]
 						}
 					}
 				}
 			}
 		}
 	}
 	return C
 }

 // Version 4: Manual unroll 4x
 func unrolled4Multiply(A, B [][]float32) [][]float32 {
 	n := len(A)
 	k := len(A[0])
 	m := len(B[0])
 	C := make([][]float32, n)
 	for i := range C {
 		C[i] = make([]float32, m)
 	}

 	for i := 0; i < n; i++ {
 		for kIdx := 0; kIdx < k; kIdx++ {
 			aVal := A[i][kIdx]
 			j := 0

 			// Process 4 at a time
 			for ; j <= m-4; j += 4 {
 				C[i][j] += aVal * B[kIdx][j]
 				C[i][j+1] += aVal * B[kIdx][j+1]
 				C[i][j+2] += aVal * B[kIdx][j+2]
 				C[i][j+3] += aVal * B[kIdx][j+3]
 			}

 			// Handle remainder
 			for ; j < m; j++ {
 				C[i][j] += aVal * B[kIdx][j]
 			}
 		}
 	}
 	return C
 }

 // Version 5: Flat slice (single allocation)
 func flatSliceMultiply(A, B []float32, n, k, m int) []float32 {
 	C := make([]float32, n*m)

 	for i := 0; i < n; i++ {
 		for kIdx := 0; kIdx < k; kIdx++ {
 			aVal := A[i*k+kIdx]
 			bRowOffset := kIdx * m
 			cRowOffset := i * m

 			for j := 0; j < m; j++ {
 				C[cRowOffset+j] += aVal * B[bRowOffset+j]
 			}
 		}
 	}
 	return C
 }

 // Version 6: Flat slice with unrolling
 func flatSliceUnrolled(A, B []float32, n, k, m int) []float32 {
 	C := make([]float32, n*m)

 	for i := 0; i < n; i++ {
 		for kIdx := 0; kIdx < k; kIdx++ {
 			aVal := A[i*k+kIdx]
 			bRowOffset := kIdx * m
 			cRowOffset := i * m

 			j := 0
 			// Unroll by 8
 			for ; j <= m-8; j += 8 {
 				C[cRowOffset+j] += aVal * B[bRowOffset+j]
 				C[cRowOffset+j+1] += aVal * B[bRowOffset+j+1]
 				C[cRowOffset+j+2] += aVal * B[bRowOffset+j+2]
 				C[cRowOffset+j+3] += aVal * B[bRowOffset+j+3]
 				C[cRowOffset+j+4] += aVal * B[bRowOffset+j+4]
 				C[cRowOffset+j+5] += aVal * B[bRowOffset+j+5]
 				C[cRowOffset+j+6] += aVal * B[bRowOffset+j+6]
 				C[cRowOffset+j+7] += aVal * B[bRowOffset+j+7]
 			}
 			// Handle remainder
 			for ; j < m; j++ {
 				C[cRowOffset+j] += aVal * B[bRowOffset+j]
 			}
 		}
 	}
 	return C
 }

 // Helper to convert 2D to flat
 func toFlat(matrix [][]float32) []float32 {
 	rows := len(matrix)
 	cols := len(matrix[0])
 	flat := make([]float32, rows*cols)

 	for i := 0; i < rows; i++ {
 		for j := 0; j < cols; j++ {
 			flat[i*cols+j] = matrix[i][j]
 		}
 	}
 	return flat
 }

 // Benchmark helper that returns time
 func benchmark(name string, fn func()) time.Duration {
 	// Warmup
 	for i := 0; i < 3; i++ {
 		fn()
 	}

 	// Measure (take best of 5)
 	var bestTime time.Duration
 	for i := 0; i < 5; i++ {
 		start := time.Now()
 		fn()
 		elapsed := time.Since(start)
 		if bestTime == 0 || elapsed < bestTime {
 			bestTime = elapsed
 		}
 	}

 	return bestTime
 }

 func main() {
 	fmt.Println("Go 1.25 Matrix Multiplication Benchmark")
 	fmt.Println("========================================\n")

 	sizes := []int{256, 512, 1024}

 	for _, size := range sizes {
 		fmt.Println(strings.Repeat("=", 70))
 		fmt.Printf("Matrix size: %d x %d (%.1f MB)\n", size, size, float64(3*size*size*4)/(1024*1024))
 		fmt.Println(strings.Repeat("=", 70))

 		A := randomMatrix(size, size)
 		B := randomMatrix(size, size)
 		AFlat := toFlat(A)
 		BFlat := toFlat(B)

 		fmt.Println("\n🔥 Warmup + Benchmark (3 warmup, 5 measured, best time):")
 		fmt.Println()

 		// Test naive (skip for large)
 		if size <= 512 {
 			t := benchmark("1. Naive (ijk)", func() {
 				_ = naiveMultiply(A, B)
 			})
 			fmt.Printf("1. Naive (ijk):                %.3fs\n", t.Seconds())
 		} else {
 			fmt.Println("1. Naive (ijk):                [skipped - too slow]")
 		}

 		// Test reordered (baseline)
 		reorderedTime := benchmark("2. Reordered (ikj)", func() {
 			_ = reorderedMultiply(A, B)
 		})
 		fmt.Printf("2. Reordered (ikj):            %.3fs\n", reorderedTime.Seconds())

 		// Test tiled
 		tileSize := 64
 		tiledTime := benchmark(fmt.Sprintf("3. Tiled (%d)", tileSize), func() {
 			_ = tiledMultiply(A, B, tileSize)
 		})
 		fmt.Printf("3. Tiled (%d):                  %.3fs (%.2fx)\n",
 			tileSize, tiledTime.Seconds(), reorderedTime.Seconds()/tiledTime.Seconds())

 		// Test unrolled 4x
 		unrolled4Time := benchmark("4. Unrolled 4x (2D slices)", func() {
 			_ = unrolled4Multiply(A, B)
 		})
 		fmt.Printf("4. Unrolled 4x (2D slices):    %.3fs (%.2fx)\n",
 			unrolled4Time.Seconds(), reorderedTime.Seconds()/unrolled4Time.Seconds())

 		// Test flat slice
 		flatTime := benchmark("5. Flat slice", func() {
 			_ = flatSliceMultiply(AFlat, BFlat, size, size, size)
 		})
 		fmt.Printf("5. Flat slice:                 %.3fs (%.2fx)\n",
 			flatTime.Seconds(), reorderedTime.Seconds()/flatTime.Seconds())

 		// Test flat slice unrolled
 		flatUnrolledTime := benchmark("6. Flat slice + unroll 8x", func() {
 			_ = flatSliceUnrolled(AFlat, BFlat, size, size, size)
 		})
 		fmt.Printf("6. Flat slice + unroll 8x:     %.3fs (%.2fx)\n",
 			flatUnrolledTime.Seconds(), reorderedTime.Seconds()/flatUnrolledTime.Seconds())

 		// Find winner
 		times := map[string]time.Duration{
 			"Reordered":        reorderedTime,
 			"Tiled":            tiledTime,
 			"Unrolled 4x":      unrolled4Time,
 			"Flat slice":       flatTime,
 			"Flat + unroll 8x": flatUnrolledTime,
 		}

 		var bestName string
 		var bestTime time.Duration
 		for name, t := range times {
 			if bestTime == 0 || t < bestTime {
 				bestTime = t
 				bestName = name
 			}
 		}

 		fmt.Printf("\n✨ Winner: %s (%.3fs)\n", bestName, bestTime.Seconds())
 		fmt.Printf("📊 Speedup vs Reordered: %.2fx\n\n", reorderedTime.Seconds()/bestTime.Seconds())
 	}
 }
diff --git a/Matmul.java b/Matmul.java
 public class Matmul {

  public static void main(String[] args) {
    System.out.println("Matrix Multiplication - Optimized for M1 (128-byte cache lines)\n");

    // Cache line holds 128 bytes / 4 bytes per float = 32 floats
    final int FLOATS_PER_CACHE_LINE = 32;

    System.out.println("M1 Architecture:");
    System.out.println("  Cache line size: 128 bytes");
    System.out.println("  Floats per cache line: " + FLOATS_PER_CACHE_LINE);
    System.out.println();

    int[] sizes = { 512, 1024, 2048 };

    for (int size : sizes) {
      System.out.println("=".repeat(70));
      System.out.printf("Matrix size: %d x %d (%.1f MB total)\n", size, size, (3.0 * size * size * 4) / (1024 * 1024));
      System.out.println("=".repeat(70));

      float[][] A = randomMatrix(size, size);
      float[][] B = randomMatrix(size, size);

      // 1. Naive (skip for large matrices)
      if (size <= 512) {
        long start = System.nanoTime();
        float[][] C1 = naiveMultiply(A, B);
        double naiveTime = (System.nanoTime() - start) / 1e9;
        System.out.printf("1. Naive:                  %.3f seconds (baseline)\n", naiveTime);
      } else {
        System.out.println("1. Naive:                  [skipped - too slow!]");
      }

      // 2. Reordered
      long start = System.nanoTime();
      float[][] C2 = reorderedMultiply(A, B);
      double reorderedTime = (System.nanoTime() - start) / 1e9;
      System.out.printf("2. Reordered:              %.3f seconds\n", reorderedTime);

      // 3. Cache-line aligned (process 32 floats at a time)
      start = System.nanoTime();
      float[][] C3 = cacheLineAligned(A, B);
      double alignedTime = (System.nanoTime() - start) / 1e9;
      System.out.printf("3. Cache-line aligned:     %.3f seconds (%.2fx vs reordered)\n", alignedTime, reorderedTime / alignedTime);

      // 4. Try different tile sizes (M1 has huge caches!)
      System.out.println("4. Tiled versions:");
      int[] tileSizes = { 64, 128, 256, 512 };
      double bestTime = Double.MAX_VALUE;
      int bestTile = 0;

      for (int tileSize : tileSizes) {
        start = System.nanoTime();
        float[][] C4 = tiledMultiply(A, B, tileSize);
        double tiledTime = (System.nanoTime() - start) / 1e9;
        System.out.printf("   Tile size %4d:         %.3f seconds (%.2fx vs reordered)\n", tileSize, tiledTime, reorderedTime / tiledTime);

        if (tiledTime < bestTime) {
          bestTime = tiledTime;
          bestTile = tileSize;
        }
      }

      System.out.printf("\n✨ Best configuration: Tile size %d (%.2fx faster than reordered)\n\n", bestTile, reorderedTime / bestTime);
    }
  }

  // NAIVE
  static float[][] naiveMultiply(float[][] A, float[][] B) {
    int n = A.length,
      k = A[0].length,
      m = B[0].length;
    float[][] C = new float[n][m];
    for (int row = 0; row < n; row++) {
      for (int col = 0; col < m; col++) {
        for (int i = 0; i < k; i++) {
          C[row][col] += A[row][i] * B[i][col];
        }
      }
    }
    return C;
  }

  // REORDERED
  static float[][] reorderedMultiply(float[][] A, float[][] B) {
    int n = A.length,
      k = A[0].length,
      m = B[0].length;
    float[][] C = new float[n][m];
    for (int row = 0; row < n; row++) {
      for (int i = 0; i < k; i++) {
        float aValue = A[row][i];
        for (int col = 0; col < m; col++) {
          C[row][col] += aValue * B[i][col];
        }
      }
    }
    return C;
  }

  // CACHE-LINE ALIGNED: Process 32 floats at once (128 bytes)
  static float[][] cacheLineAligned(float[][] A, float[][] B) {
    int n = A.length,
      k = A[0].length,
      m = B[0].length;
    float[][] C = new float[n][m];

    final int BLOCK = 32; // 32 floats = 128 bytes = 1 cache line on M1

    for (int row = 0; row < n; row++) {
      for (int i = 0; i < k; i++) {
        float aValue = A[row][i];

        // Process 32 columns at a time (1 cache line)
        int col = 0;
        for (; col <= m - BLOCK; col += BLOCK) {
          // These 32 accesses hit the same cache line!
          for (int j = 0; j < BLOCK; j++) {
            C[row][col + j] += aValue * B[i][col + j];
          }
        }

        // Handle remaining columns
        for (; col < m; col++) {
          C[row][col] += aValue * B[i][col];
        }
      }
    }
    return C;
  }

  // TILED
  static float[][] tiledMultiply(float[][] A, float[][] B, int tileSize) {
    int n = A.length,
      k = A[0].length,
      m = B[0].length;
    float[][] C = new float[n][m];

    for (int rowTile = 0; rowTile < n; rowTile += tileSize) {
      for (int colTile = 0; colTile < m; colTile += tileSize) {
        for (int innerTile = 0; innerTile < k; innerTile += tileSize) {
          int rowEnd = Math.min(rowTile + tileSize, n);
          int colEnd = Math.min(colTile + tileSize, m);
          int innerEnd = Math.min(innerTile + tileSize, k);

          for (int row = rowTile; row < rowEnd; row++) {
            for (int i = innerTile; i < innerEnd; i++) {
              float aValue = A[row][i];
              for (int col = colTile; col < colEnd; col++) {
                C[row][col] += aValue * B[i][col];
              }
            }
          }
        }
      }
    }
    return C;
  }

  static float[][] randomMatrix(int rows, int cols) {
    float[][] matrix = new float[rows][cols];
    for (int i = 0; i < rows; i++) {
      for (int j = 0; j < cols; j++) {
        matrix[i][j] = (float) Math.random();
      }
    }
    return matrix;
  }
 }
diff --git a/Matmul.js b/Matmul.js
 // Matrix Multiplication with PROPER WARMUP

 function randomMatrix(rows, cols) {
  const matrix = [];
  for (let i = 0; i < rows; i++) {
    matrix[i] = [];
    for (let j = 0; j < cols; j++) {
      matrix[i][j] = Math.random();
    }
  }
  return matrix;
 }

 // Version 1: Naive (ijk order)
 function naiveMultiply(A, B) {
  const n = A.length;
  const k = A[0].length;
  const m = B[0].length;
  const C = Array(n)
    .fill(0)
    .map(() => Array(m).fill(0));

  for (let i = 0; i < n; i++) {
    for (let j = 0; j < m; j++) {
      let sum = 0;
      for (let k_idx = 0; k_idx < k; k_idx++) {
        sum += A[i][k_idx] * B[k_idx][j];
      }
      C[i][j] = sum;
    }
  }
  return C;
 }

 // Version 2: Reordered (ikj order - cache friendly)
 function reorderedMultiply(A, B) {
  const n = A.length;
  const k = A[0].length;
  const m = B[0].length;
  const C = Array(n)
    .fill(0)
    .map(() => Array(m).fill(0));

  for (let i = 0; i < n; i++) {
    for (let k_idx = 0; k_idx < k; k_idx++) {
      const aVal = A[i][k_idx];
      for (let j = 0; j < m; j++) {
        C[i][j] += aVal * B[k_idx][j];
      }
    }
  }
  return C;
 }

 // Version 3: Tiled/Blocked
 function tiledMultiply(A, B, tileSize) {
  const n = A.length;
  const k = A[0].length;
  const m = B[0].length;
  const C = Array(n)
    .fill(0)
    .map(() => Array(m).fill(0));

  for (let iTile = 0; iTile < n; iTile += tileSize) {
    for (let jTile = 0; jTile < m; jTile += tileSize) {
      for (let kTile = 0; kTile < k; kTile += tileSize) {
        const iEnd = Math.min(iTile + tileSize, n);
        const jEnd = Math.min(jTile + tileSize, m);
        const kEnd = Math.min(kTile + tileSize, k);

        for (let i = iTile; i < iEnd; i++) {
          for (let k_idx = kTile; k_idx < kEnd; k_idx++) {
            const aVal = A[i][k_idx];
            for (let j = jTile; j < jEnd; j++) {
              C[i][j] += aVal * B[k_idx][j];
            }
          }
        }
      }
    }
  }
  return C;
 }

 // Version 4: Manual unroll 4x
 function unrolled4Multiply(A, B) {
  const n = A.length;
  const k = A[0].length;
  const m = B[0].length;
  const C = Array(n)
    .fill(0)
    .map(() => Array(m).fill(0));

  for (let i = 0; i < n; i++) {
    for (let k_idx = 0; k_idx < k; k_idx++) {
      const aVal = A[i][k_idx];
      let j = 0;

      // Process 4 at a time
      for (; j <= m - 4; j += 4) {
        C[i][j] += aVal * B[k_idx][j];
        C[i][j + 1] += aVal * B[k_idx][j + 1];
        C[i][j + 2] += aVal * B[k_idx][j + 2];
        C[i][j + 3] += aVal * B[k_idx][j + 3];
      }

      // Handle remainder
      for (; j < m; j++) {
        C[i][j] += aVal * B[k_idx][j];
      }
    }
  }
  return C;
 }

 // Version 5: Flat arrays (avoid 2D array overhead)
 function flatArrayMultiply(A, B, n, k, m) {
  const C = new Float32Array(n * m);

  for (let i = 0; i < n; i++) {
    for (let k_idx = 0; k_idx < k; k_idx++) {
      const aVal = A[i * k + k_idx];
      const bRowOffset = k_idx * m;
      const cRowOffset = i * m;

      for (let j = 0; j < m; j++) {
        C[cRowOffset + j] += aVal * B[bRowOffset + j];
      }
    }
  }
  return C;
 }

 // Helper to convert 2D to flat
 function toFlatArray(matrix) {
  const rows = matrix.length;
  const cols = matrix[0].length;
  const flat = new Float32Array(rows * cols);

  for (let i = 0; i < rows; i++) {
    for (let j = 0; j < cols; j++) {
      flat[i * cols + j] = matrix[i][j];
    }
  }
  return flat;
 }

 // PROPER warmup function
 function warmupFunction(fn, ...args) {
  console.log(`  Warming up: ${fn.name}...`);
  const iterations = 10;
  for (let i = 0; i < iterations; i++) {
    fn(...args);
  }
 }

 // Benchmark function with minimal overhead
 function benchmark(name, fn, ...args) {
  const iterations = 5;
  const times = [];

  for (let i = 0; i < iterations; i++) {
    const start = performance.now();
    const result = fn(...args);
    const end = performance.now();
    times.push((end - start) / 1000);
  }

  // Take the minimum time (best run = least interference)
  const time = Math.min(...times);

  return { name, time };
 }

 // Main test with proper warmup
 function runTests() {
  console.log('JavaScript Matrix Multiplication - PROPER WARMUP VERSION\n');
  console.log(`Runtime: ${typeof process !== 'undefined' ? 'Node.js ' + process.version : 'Browser'}`);
  console.log(`Platform: ${typeof process !== 'undefined' ? process.platform + ' ' + process.arch : 'Browser'}\n`);

  const sizes = [256, 512, 1024];

  for (const size of sizes) {
    console.log('='.repeat(70));
    console.log(`Matrix size: ${size} x ${size} (${((3 * size * size * 4) / (1024 * 1024)).toFixed(1)} MB)`);
    console.log('='.repeat(70));

    const A = randomMatrix(size, size);
    const B = randomMatrix(size, size);
    const A_flat = toFlatArray(A);
    const B_flat = toFlatArray(B);

    console.log('\n🔥 WARMUP PHASE (10 iterations each):');

    // Warmup all functions
    if (size <= 512) {
      warmupFunction(naiveMultiply, A, B);
    }
    warmupFunction(reorderedMultiply, A, B);
    warmupFunction(tiledMultiply, A, B, 64);
    warmupFunction(unrolled4Multiply, A, B);
    warmupFunction(flatArrayMultiply, A_flat, B_flat, size, size, size);

    console.log('\n⚡ BENCHMARKING (5 runs, taking best):');

    const results = [];

    // Test 1: Naive
    if (size <= 512) {
      const r1 = benchmark('Naive (ijk)', naiveMultiply, A, B);
      results.push(r1);
      console.log(`1. Naive (ijk):           ${r1.time.toFixed(3)}s`);
    } else {
      console.log(`1. Naive (ijk):           [skipped - too slow]`);
    }

    // Test 2: Reordered
    const r2 = benchmark('Reordered (ikj)', reorderedMultiply, A, B);
    results.push(r2);
    console.log(`2. Reordered (ikj):       ${r2.time.toFixed(3)}s`);
    const baseTime = r2.time;

    // Test 3: Tiled
    const tileSize = 64;
    const r3 = benchmark(`Tiled (${tileSize}x${tileSize})`, tiledMultiply, A, B, tileSize);
    results.push(r3);
    console.log(`3. Tiled (${tileSize}):            ${r3.time.toFixed(3)}s (${(baseTime / r3.time).toFixed(2)}x)`);

    // Test 4: Unrolled
    const r4 = benchmark('Unrolled 4x', unrolled4Multiply, A, B);
    results.push(r4);
    console.log(`4. Unrolled 4x:           ${r4.time.toFixed(3)}s (${(baseTime / r4.time).toFixed(2)}x)`);

    // Test 5: Flat arrays (TypedArray)
    const r5 = benchmark('Flat Float32Array', flatArrayMultiply, A_flat, B_flat, size, size, size);
    results.push(r5);
    console.log(`5. Flat Float32Array:     ${r5.time.toFixed(3)}s (${(baseTime / r5.time).toFixed(2)}x)`);

    // Find best
    const best = results.reduce((a, b) => (a.time < b.time ? a : b));
    const baseline = results.find(r => r.name === 'Reordered (ikj)');

    console.log(`\n✨ Winner: ${best.name} (${best.time.toFixed(3)}s)`);
    console.log(`📊 Speedup vs Reordered: ${(baseline.time / best.time).toFixed(2)}x\n`);
  }
 }

 // Run the tests
 runTests();
diff --git a/MatmulMultithread.c b/MatmulMultithread.c
 #include <stdio.h>
 #include <stdlib.h>
 #include <time.h>
 #include <math.h>
 #include <pthread.h>
 #include <stdbool.h>
 #include <string.h>
 #include <stdatomic.h> // Minimal synchronization: Atomic counter for job completion
 #include <sched.h> // For sched_yield()
 #include "cblas.h"

 // Check for NEON SIMD
 #ifdef __ARM_NEON
 #include <arm_neon.h>
 #define USE_NEON_SIMD 1
 #else
 #define USE_NEON_SIMD 0
 #endif

 // --- Constants and Global Settings ---
 #define TILE_SIZE 64
 #define STRASSEN_CUTOFF 64
 #define MAX_THREADS 8
 #define VECTOR_LANE_COUNT 4 // Floats per NEON register

 // --- Type Definitions ---
 // Function signature for the execution block of work (used by workers)
 typedef void (*mm_block_func)(const float *A, const float *B, float *C, int N, int K, int M, int row_start, int row_end, int tile_size);
 // Function signature for the top-level benchmark runner (used by main)
 typedef void (*mm_func)(const float *__restrict, const float *__restrict, float *__restrict, int, int, int, int, const float *__restrict);

 /**
 * @brief Arguments passed to each persistent worker thread.
 */
 typedef struct MatmulTaskArgs {
    int thread_id;
    const float *A;
    const float *B;
    float *C;
    int N, K, M;
    int row_start;
    int row_end;
    int tile_size;
    mm_block_func base_func;
 } MatmulTaskArgs;


 // --- Global Thread Pool (Minimal Synchronization using Atomicity) ---

 pthread_t g_workers[MAX_THREADS];
 MatmulTaskArgs g_args[MAX_THREADS];

 // Shared state for all workers to check for a new job or shutdown.
 atomic_bool g_running;          // Set to false for shutdown
 atomic_int g_workers_ready;     // Initial count: workers are ready to enter the job loop
 atomic_int g_workers_completed; // Counts workers finished the current job
 atomic_int g_job_counter;       // THE FIX: Incremented by main thread to signal a NEW job

 int g_num_workers = 0; // Actual number of threads created

 // --- Utility Functions (Declarations) ---
 int min(int a, int b);
 float *generate_random_matrix(int rows, int cols);
 float *transpose_matrix(const float *B, int K, int M);
 double benchmark_runner(const char *name, mm_func mm_function, const float *A, const float *B, const float *B_T, int N, int K, int M, int tile_size);
 void multiply_naive_ijk_block(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int row_start, int row_end, int tile_size);
 void multiply_reordered_ikj_block(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int row_start, int row_end, int tile_size);
 void multiply_tiled_block(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int row_start, int row_end, int TILE_SIZE_ARG);
 void multiply_optimized_ikj_block(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int row_start, int row_end, int tile_size);
 void multiply_simd_transposed_block(const float *__restrict A, const float *__restrict B_T, float *__restrict C, int N, int K, int M, int row_start, int row_end, int tile_size);

 // Sequential wrappers (Declarations)
 void multiply_v2_sequential_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T);
 void multiply_v3_sequential_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T);
 void multiply_v11_sequential_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T);
 void multiply_cblas_sgemm_wrapper(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int tile_size, const float *B_T);
 void multiply_strassen_wrapper(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int tile_size, const float *B_T);


 // --- Thread Pool Worker Function ---

 /**
 * @brief The persistent worker thread loop. Waits for g_job_counter to increment.
 */
 void* thread_pool_worker(void *arg) {
    MatmulTaskArgs *w_args = (MatmulTaskArgs *)arg;
    int local_job_id = 0; // Tracks the last job ID this worker processed

    // Initial signal: Tell the main thread this worker is ready to wait.
    atomic_fetch_add(&g_workers_ready, 1);

    while (atomic_load(&g_running)) {

        // --- 1. WAIT FOR NEW JOB SIGNAL ---
        // Wait until the global job counter is higher than the last job we processed.
        while (local_job_id == atomic_load(&g_job_counter) && atomic_load(&g_running)) {
            sched_yield(); // Yield CPU time
        }

        if (!atomic_load(&g_running)) break;

        // Get the new job ID BEFORE executing the task
        local_job_id = atomic_load(&g_job_counter);

        // --- 2. EXECUTE TASK (Arguments are guaranteed to be updated by main thread) ---
        if (w_args->base_func) {
            w_args->base_func(w_args->A, w_args->B, w_args->C, w_args->N, w_args->K, w_args->M,
                              w_args->row_start, w_args->row_end, w_args->tile_size);
        }

        // --- 3. SIGNAL COMPLETION ---
        atomic_fetch_add(&g_workers_completed, 1);
    }

    return NULL;
 }


 // --- Thread Pool Management Functions ---

 /**
 * @brief Initializes the thread pool: creates and starts all worker threads.
 */
 void initialize_thread_pool(int N) {

    g_num_workers = min(N, MAX_THREADS);

    // Initialize atomic variables
    atomic_store(&g_running, true);
    atomic_init(&g_workers_ready, 0);
    atomic_init(&g_workers_completed, 0);
    atomic_init(&g_job_counter, 0);

    for (int i = 0; i < g_num_workers; i++) {
        g_args[i].thread_id = i;
        g_args[i].base_func = NULL;

        if (pthread_create(&g_workers[i], NULL, thread_pool_worker, &g_args[i]) != 0) {
            perror("Failed to create worker thread");
            g_num_workers = i;
            break;
        }
    }

    // Wait for all workers to reach their initial waiting state
    while (atomic_load(&g_workers_ready) < g_num_workers) {
        sched_yield();
    }
 }

 /**
 * @brief Cleans up the thread pool: signals shutdown and joins all workers.
 */
 void cleanup_thread_pool() {

    // Signal all workers to shut down
    atomic_store(&g_running, false);
    // Increment job counter one last time to wake any sleeping threads so they can read the shutdown signal
    atomic_fetch_add(&g_job_counter, 1);

    for (int i = 0; i < g_num_workers; i++) {
        pthread_join(g_workers[i], NULL);
    }

    g_num_workers = 0;
 }


 /**
 * @brief Dispatches a job to the cached thread pool and waits for completion.
 */
 void dispatch_job(mm_block_func block_func, const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int tile_size) {

    if (g_num_workers == 0) return;

    // --- 1. SETUP JOB & WORK PARTITIONS ---
    int current_row = 0;
    int rows_per_thread = N / g_num_workers;
    int remaining_rows = N % g_num_workers;

    for (int i = 0; i < g_num_workers; i++) {
        MatmulTaskArgs *args = &g_args[i];

        int row_start = current_row;
        int rows_to_process = rows_per_thread + (i < remaining_rows ? 1 : 0);
        int row_end = row_start + rows_to_process;

        // Update the arguments for the running worker threads
        args->A = A;
        args->B = B;
        args->C = C;
        args->N = N;
        args->K = K;
        args->M = M;
        args->row_start = row_start;
        args->row_end = row_end;
        args->tile_size = tile_size;
        args->base_func = block_func;

        current_row = row_end;
    }

    // --- 2. SIGNAL WORK START ---
    // Reset the completion counter BEFORE signaling the workers
    atomic_store(&g_workers_completed, 0);

    // Increment the job counter. This is the atomic signal that wakes the workers.
    atomic_fetch_add(&g_job_counter, 1);

    // --- 3. WAIT FOR COMPLETION ---
    // Wait until the completion counter reaches the total worker count
    while (atomic_load(&g_workers_completed) < g_num_workers) {
        sched_yield();
    }
 }


 // --- Wrapper Functions (Warnings suppressed) ---

 void shared_parallel_wrapper(mm_block_func block_func, const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) {
    (void)B_T;
    dispatch_job(block_func, A, B, C, N, K, M, tile_size);
 }

 void multiply_v1_parallel_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) {
    (void)tile_size;
    shared_parallel_wrapper(multiply_naive_ijk_block, A, B, C, N, K, M, 0, B_T);
 }
 void multiply_v2_parallel_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) {
    (void)tile_size;
    shared_parallel_wrapper(multiply_reordered_ikj_block, A, B, C, N, K, M, 0, B_T);
 }
 void multiply_v3_parallel_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) {
    shared_parallel_wrapper(multiply_tiled_block, A, B, C, N, K, M, tile_size, B_T);
 }
 void multiply_v7_parallel_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) {
    (void)tile_size;
    shared_parallel_wrapper(multiply_optimized_ikj_block, A, B, C, N, K, M, 0, B_T);
 }
 void multiply_v11_parallel_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) {
    (void)B;
    (void)tile_size;
    // Pass B_T as the B argument for the transposed block function
    dispatch_job(multiply_simd_transposed_block, A, B_T, C, N, K, M, 0);
 }

 /**
 * @brief Wrapper for OpenBLAS cblas_sgemm (V9 in the overall set).
 */
 void multiply_cblas_sgemm_wrapper(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int tile_size, const float *B_T) {
    (void)tile_size; (void)B_T;
    // Set to RowMajor order: C(N, M) = A(N, K) * B(K, M)
    // Transpose operations are handled internally by cblas_sgemm,
    // we assume standard row-major storage (CblasRowMajor).
    cblas_sgemm(CblasRowMajor, // Layout
                CblasNoTrans,  // Transpose A? No.
                CblasNoTrans,  // Transpose B? No.
                N,             // Number of rows of A and C (N)
                M,             // Number of columns of B and C (M)
                K,             // Number of columns of A and rows of B (K)
                1.0f,          // Alpha (Scaling factor for A * B)
                A,             // Matrix A
                K,             // Leading dimension of A (K)
                B,             // Matrix B
                M,             // Leading dimension of B (M)
                0.0f,          // Beta (Scaling factor for C)
                C,             // Matrix C
                M);            // Leading dimension of C (M)
 }


 // --- Main Program (Driver) ---

 int main() {
    srand((unsigned int)time(NULL));

    printf("C17 Matrix Multiplication Benchmark (11 Versions)\n");
    printf("====================================================\n");
    printf("   *** FIXED: Lock-Free Race Condition Eliminated ***\n");
    printf("   *** Using an Atomic Job Counter for Reliable Synchronization ***\n");
    printf("====================================================\n");


    int sizes[] = {256, 512, 1024, 2048};
    int num_sizes = sizeof(sizes) / sizeof(sizes[0]);

    for (int s = 0; s < num_sizes; s++) {
        int size = sizes[s];
        int N = size;
        int K = size;
        int M = size;

        initialize_thread_pool(N);

        printf("======================================================================\n");
        printf("Matrix size: %d x %d | Workers Used: %d | TILE_SIZE: %d\n",
               size, size, g_num_workers, TILE_SIZE);
        printf("SIMD Status: %s\n", USE_NEON_SIMD ? "NEON Active (Manual Intrinsic)" : "SIMD Disabled (C fallback)");
        printf("======================================================================\n");

        float *A = generate_random_matrix(N, K);
        float *B = generate_random_matrix(K, M);
        float *B_T = transpose_matrix(B, K, M);

        printf("\n🔥 Benchmark (5 measured, best time):\n\n");

        double sequential_baseline_time = 0.0;

        // 2. Reordered (ikj) - Sequential (V2) - BASELINE
        sequential_baseline_time = benchmark_runner("2. Reordered (ikj) Sequential", multiply_v2_sequential_wrapper, A, B, B_T, N, K, M, 0);
        printf(" 2. Reordered (ikj) Sequential:   %.3fs <--- BASELINE\n", sequential_baseline_time);


        // --- Parallel Implementations ---

        // 7. Reordered (ikj) Parallel
        double v7_time = benchmark_runner("7. Reordered (ikj) Parallel: ", multiply_v2_parallel_wrapper, A, B, B_T, N, K, M, 0);
        printf(" 7. Reordered (ikj) Parallel:     %.3fs (%.2fx)\n", v7_time, sequential_baseline_time / v7_time);

        // 8. Tiled (ijk) Parallel (V3)
        double v8_time = benchmark_runner("8. Tiled (ijk) Parallel: ", multiply_v3_parallel_wrapper, A, B, B_T, N, K, M, TILE_SIZE);
        printf(" 8. Tiled (ijk) Parallel:         %.3fs (%.2fx)\n", v8_time, sequential_baseline_time / v8_time);

        // 9. OpenBLAS cblas_sgemm (Library Standard)
        double v9_time = benchmark_runner("9. OpenBLAS cblas_sgemm: ", multiply_cblas_sgemm_wrapper, A, B, B_T, N, K, M, 0);
        printf(" 9. OpenBLAS cblas_sgemm:         %.3fs (%.2fx) - LIBRARY\n", v9_time, sequential_baseline_time / v9_time);

        // 11. SIMD + Transposed Parallel (V11)
        double v11_time = benchmark_runner("11. SIMD + Transposed Parallel: ", multiply_v11_parallel_wrapper, A, B, B_T, N, K, M, 0);
        printf("11. SIMD + Transposed Parallel:   %.3fs (%.2fx) - NEON\n", v11_time, sequential_baseline_time / v11_time);


        // Clean up memory
        free(A);
        free(B);
        free(B_T);

        cleanup_thread_pool();

        printf("\n");
    }

    printf("====================================================\n");
    printf("Benchmark Complete.\n");

    return 0;
 }

 // --- Implementation details for Block Functions (Warnings suppressed) ---

 void multiply_naive_ijk_block(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int row_start, int row_end, int tile_size) {
    (void)N;
    (void)tile_size;
    for (int i = row_start; i < row_end; i++) {
        for (int j = 0; j < M; j++) {
            float sum = 0.0f;
            for (int k = 0; k < K; k++) { sum += A[i * K + k] * B[k * M + j]; }
            C[i * M + j] = sum;
        }
    }
 }
 void multiply_reordered_ikj_block(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int row_start, int row_end, int tile_size) {
    (void)N;
    (void)tile_size;
    for (int i = row_start; i < row_end; i++) {
        int c_row_offset = i * M;
        for (int k = 0; k < K; k++) {
            float a_val = A[i * K + k];
            int b_row_offset = k * M;
            for (int j = 0; j < M; j++) { C[c_row_offset + j] += a_val * B[b_row_offset + j]; }
        }
    }
 }
 void multiply_tiled_block(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int row_start, int row_end, int TILE_SIZE_ARG) {
    (void)N;
    for (int i_tile = row_start; i_tile < row_end; i_tile += TILE_SIZE_ARG) {
        for (int j_tile = 0; j_tile < M; j_tile += TILE_SIZE_ARG) {
            for (int k_tile = 0; k_tile < K; k_tile += TILE_SIZE_ARG) {
                int i_end = min(i_tile + TILE_SIZE_ARG, row_end);
                int j_end = min(j_tile + TILE_SIZE_ARG, M);
                int k_end = min(k_tile + TILE_SIZE_ARG, K);
                for (int i = i_tile; i < i_end; i++) {
                    int c_row_offset = i * M;
                    for (int k = k_tile; k < k_end; k++) {
                        float a_val = A[i * K + k];
                        int b_row_offset = k * M;
                        for (int j = j_tile; j < j_end; j++) { C[c_row_offset + j] += a_val * B[b_row_offset + j]; }
                    }
                }
            }
        }
    }
 }
 void multiply_optimized_ikj_block(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int row_start, int row_end, int tile_size) {
    (void)N;
    (void)tile_size;
    for (int i = row_start; i < row_end; i++) {
        int c_row_offset = i * M;
        for (int k = 0; k < K; k++) {
            float a_val = A[i * K + k];
            int b_row_offset = k * M;
            int j = 0;
            // Unrolled loop (V7 uses this)
            for (; j + 4 <= M; j += 4) {
                C[c_row_offset + j + 0] += a_val * B[b_row_offset + j + 0];
                C[c_row_offset + j + 1] += a_val * B[b_row_offset + j + 1];
                C[c_row_offset + j + 2] += a_val * B[b_row_offset + j + 2];
                C[c_row_offset + j + 3] += a_val * B[b_row_offset + j + 3];
            }
            for (; j < M; j++) { C[c_row_offset + j] += a_val * B[b_row_offset + j]; }
        }
    }
 }
 void multiply_simd_transposed_block(const float *__restrict A, const float *__restrict B_T, float *__restrict C, int N, int K, int M, int row_start, int row_end, int tile_size) {
    (void)N;
    (void)tile_size;

    for (int i = row_start; i < row_end; i++) {
        const float *A_row = A + i * K;
        float *C_row = C + i * M;

        for (int j = 0; j < M; j++) {
            const float *B_T_row = B_T + j * K;
            float sum = 0.0f;
            int k = 0;

            #if USE_NEON_SIMD
            // NEON Vectorization (4 floats at a time)
            float32x4_t dot_sum_v = vdupq_n_f32(0.0f);

            for (; k + VECTOR_LANE_COUNT <= K; k += VECTOR_LANE_COUNT) {
                // Load 4 floats from A row
                float32x4_t a_v = vld1q_f32(A_row + k);
                // Load 4 floats from B_T row (which is contiguous after transpose)
                float32x4_t b_t_v = vld1q_f32(B_T_row + k);

                // Multiply-accumulate: dot_sum_v = (a_v * b_t_v) + dot_sum_v
                dot_sum_v = vmlaq_f32(dot_sum_v, a_v, b_t_v);
            }

            // Horizontal sum: Sum up the 4 elements in the accumulator vector
            // Use vpadd, vpadd, vgetq_lane, vget_low/high.
            // Simplified for modern compilers/intrinsics:
            float32x2_t sum_pair = vpadd_f32(vget_low_f32(dot_sum_v), vget_high_f32(dot_sum_v));
            sum_pair = vpadd_f32(sum_pair, sum_pair); // Sum the two elements
            sum = vget_lane_f32(sum_pair, 0);

            #endif // USE_NEON_SIMD

            // Scalar Cleanup Loop (Handles remaining elements if K is not a multiple of 4)
            for (; k < K; k++) {
                sum += A_row[k] * B_T_row[k];
            }

            C_row[j] = sum;
        }
    }
 }


 // --- Implementation details for Utility Functions (Sequential wrappers and others) ---
 void multiply_v1_sequential_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) {
    (void)tile_size; (void)B_T; multiply_naive_ijk_block(A, B, C, N, K, M, 0, N, 0);
 }
 void multiply_v2_sequential_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) {
    (void)tile_size; (void)B_T; multiply_reordered_ikj_block(A, B, C, N, K, M, 0, N, 0);
 }
 void multiply_v3_sequential_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) {
    (void)B_T; multiply_tiled_block(A, B, C, N, K, M, 0, N, tile_size);
 }
 void multiply_v7_sequential_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) {
    (void)tile_size; (void)B_T; multiply_optimized_ikj_block(A, B, C, N, K, M, 0, N, 0);
 }
 void multiply_v11_sequential_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) {
    (void)tile_size; (void)B; multiply_simd_transposed_block(A, B_T, C, N, K, M, 0, N, 0);
 }

 void multiply_strassen_wrapper(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int tile_size, const float *B_T) {
    (void)A; (void)B; (void)C; (void)N; (void)K; (void)M; (void)tile_size; (void)B_T;
 }

 int min(int a, int b) { return (a < b) ? a : b; }

 float *generate_random_matrix(int rows, int cols) {
    // Ensuring 16-byte alignment for NEON
    float *matrix = NULL;
    size_t total_bytes = (size_t)rows * cols * sizeof(float);
    // Use posix_memalign for 16-byte alignment, required for efficient SIMD loads
    if (posix_memalign((void**)&matrix, 16, total_bytes) != 0) {
        matrix = (float *)malloc(total_bytes);
        if (matrix != NULL) fprintf(stderr, "Warning: posix_memalign failed, using malloc (alignment may be suboptimal).\n");
    }
    if (matrix == NULL) { perror("Memory allocation failed"); exit(EXIT_FAILURE); }
    for (size_t i = 0; i < (size_t)rows * cols; i++) { matrix[i] = (float)rand() / (float)RAND_MAX; }
    return matrix;
 }

 float *transpose_matrix(const float *B, int K, int M) {
    // Ensuring 16-byte alignment for NEON
    float *B_T = NULL;
    size_t total_bytes = (size_t)M * K * sizeof(float);
    if (posix_memalign((void**)&B_T, 16, total_bytes) != 0) {
        B_T = (float *)malloc(total_bytes);
        if (B_T != NULL) fprintf(stderr, "Warning: posix_memalign failed, using malloc for transpose (alignment may be suboptimal).\n");
    }
    if (B_T == NULL) { perror("Memory allocation failed"); exit(EXIT_FAILURE); }
    for (int i = 0; i < K; i++) {
        for (int j = 0; j < M; j++) { B_T[j * K + i] = B[i * M + j]; }
    }
    return B_T;
 }

 double benchmark_runner(const char *name, mm_func mm_function, const float *A, const float *B, const float *B_T, int N, int K, int M, int tile_size) {
    (void)name;
    double best_time = 0.0;
    float *C = NULL;
    size_t total_bytes = (size_t)N * M * sizeof(float);
    if (posix_memalign((void**)&C, 16, total_bytes) != 0) {
        C = (float *)malloc(total_bytes);
        if (C != NULL) fprintf(stderr, "Warning: posix_memalign failed, using malloc for C (alignment may be suboptimal).\n");
    }
    if (C == NULL) { perror("Measurement memory allocation failed"); exit(EXIT_FAILURE); }

    for (int i = 0; i < 5; i++) {
        // Must clear C matrix before each run
        memset(C, 0, total_bytes);

        clock_t start = clock();
        mm_function(A, B, C, N, K, M, tile_size, B_T);
        clock_t end = clock();

        double elapsed_sec = (double)(end - start) / CLOCKS_PER_SEC;

        if (i == 0 || elapsed_sec < best_time) {
            best_time = elapsed_sec;
        }
    }
    free(C);
    return best_time;
 }
diff --git a/VectorizationTest.java b/VectorizationTest.java
 public class VectorizationTest {

  public static void main(String[] args) {
    System.out.println("Testing Java Auto-Vectorization on M1\n");

    int size = 2048;
    float[][] A = randomMatrix(size, size);
    float[][] B = randomMatrix(size, size);

    // Warmup - let JIT do its magic
    System.out.println("Warming up JIT compiler...");
    for (int i = 0; i < 5; i++) {
      scalarVersion(A, B);
      vectorFriendly(A, B);
    }
    System.out.println("Warmup complete!\n");

    // Test 1: Scalar-friendly (hard to vectorize)
    long start = System.nanoTime();
    float[][] C1 = scalarVersion(A, B);
    double scalarTime = (System.nanoTime() - start) / 1e9;
    System.out.printf("1. Scalar-friendly version:  %.3f seconds\n", scalarTime);

    // Test 2: Vector-friendly (easy to vectorize)
    start = System.nanoTime();
    float[][] C2 = vectorFriendly(A, B);
    double vectorTime = (System.nanoTime() - start) / 1e9;
    System.out.printf("2. Vector-friendly version:  %.3f seconds\n", vectorTime);

    System.out.printf("\nSpeedup: %.2fx\n", scalarTime / vectorTime);

    if (vectorTime < scalarTime * 0.9) {
      System.out.println("✅ Likely vectorized! JIT is working!");
    } else {
      System.out.println("❌ No significant vectorization detected");
    }

    // Test 3: Manual unrolling
    start = System.nanoTime();
    float[][] C3 = manualUnroll4(A, B);
    double unrollTime = (System.nanoTime() - start) / 1e9;
    System.out.printf("\n3. Manual unroll (4x):       %.3f seconds (%.2fx)\n", unrollTime, scalarTime / unrollTime);

    // Test 4: Aggressive unrolling
    start = System.nanoTime();
    float[][] C4 = manualUnroll8(A, B);
    double unroll8Time = (System.nanoTime() - start) / 1e9;
    System.out.printf("4. Manual unroll (8x):       %.3f seconds (%.2fx)\n", unroll8Time, scalarTime / unroll8Time);
  }

  // Version 1: Hard to vectorize (dependencies)
  static float[][] scalarVersion(float[][] A, float[][] B) {
    int n = A.length;
    float[][] C = new float[n][n];

    for (int i = 0; i < n; i++) {
      for (int j = 0; j < n; j++) {
        float sum = 0;
        for (int k = 0; k < n; k++) {
          sum += A[i][k] * B[k][j]; // Reduces to single value
        }
        C[i][j] = sum;
      }
    }
    return C;
  }

  // Version 2: Easy to vectorize (independent operations)
  static float[][] vectorFriendly(float[][] A, float[][] B) {
    int n = A.length;
    float[][] C = new float[n][n];

    for (int i = 0; i < n; i++) {
      for (int k = 0; k < n; k++) {
        float aVal = A[i][k];
        // This inner loop is vectorizable!
        // Each C[i][j] update is independent
        for (int j = 0; j < n; j++) {
          C[i][j] += aVal * B[k][j];
        }
      }
    }
    return C;
  }

  // Version 3: Manual unroll 4x
  static float[][] manualUnroll4(float[][] A, float[][] B) {
    int n = A.length;
    float[][] C = new float[n][n];

    for (int i = 0; i < n; i++) {
      for (int k = 0; k < n; k++) {
        float aVal = A[i][k];
        int j = 0;

        // Process 4 at a time
        for (; j <= n - 4; j += 4) {
          C[i][j] += aVal * B[k][j];
          C[i][j + 1] += aVal * B[k][j + 1];
          C[i][j + 2] += aVal * B[k][j + 2];
          C[i][j + 3] += aVal * B[k][j + 3];
        }

        // Remainder
        for (; j < n; j++) {
          C[i][j] += aVal * B[k][j];
        }
      }
    }
    return C;
  }

  // Version 4: Manual unroll 8x
  static float[][] manualUnroll8(float[][] A, float[][] B) {
    int n = A.length;
    float[][] C = new float[n][n];

    for (int i = 0; i < n; i++) {
      for (int k = 0; k < n; k++) {
        float aVal = A[i][k];
        int j = 0;

        // Process 8 at a time
        for (; j <= n - 8; j += 8) {
          C[i][j] += aVal * B[k][j];
          C[i][j + 1] += aVal * B[k][j + 1];
          C[i][j + 2] += aVal * B[k][j + 2];
          C[i][j + 3] += aVal * B[k][j + 3];
          C[i][j + 4] += aVal * B[k][j + 4];
          C[i][j + 5] += aVal * B[k][j + 5];
          C[i][j + 6] += aVal * B[k][j + 6];
          C[i][j + 7] += aVal * B[k][j + 7];
        }

        // Remainder
        for (; j < n; j++) {
          C[i][j] += aVal * B[k][j];
        }
      }
    }
    return C;
  }

  static float[][] randomMatrix(int rows, int cols) {
    float[][] matrix = new float[rows][cols];
    for (int i = 0; i < rows; i++) {
      for (int j = 0; j < cols; j++) {
        matrix[i][j] = (float) Math.random();
      }
    }
    return matrix;
  }
 }
	// blas.dart
	import 'dart:ffi' as ffi;
	import 'dart:typed_data';
	import 'package:ffi/ffi.dart'; // Need this package!

	// Define the cblas_sgemm signature
	typedef CblasSgemmNative =
	ffi.Void Function(
	ffi.Int32 Order,
	ffi.Int32 TransA,
	ffi.Int32 TransB,
	ffi.Int32 M,
	ffi.Int32 N,
	ffi.Int32 K,
	ffi.Float alpha,
	ffi.Pointer<ffi.Float> A,
	ffi.Int32 lda,
	ffi.Pointer<ffi.Float> B,
	ffi.Int32 ldb,
	ffi.Float beta,
	ffi.Pointer<ffi.Float> C,
	ffi.Int32 ldc,
	);

	typedef CblasSgemmDart =
	void Function(
	int Order,
	int TransA,
	int TransB,
	int M,
	int N,
	int K,
	double alpha,
	ffi.Pointer<ffi.Float> A,
	int lda,
	ffi.Pointer<ffi.Float> B,
	int ldb,
	double beta,
	ffi.Pointer<ffi.Float> C,
	int ldc,
	);

	void main() {
	// Load Accelerate framework
	final accelerate = ffi.DynamicLibrary.open(
	'/System/Library/Frameworks/Accelerate.framework/Accelerate',
	);

	// Lookup cblas_sgemm
	final cblas_sgemm = accelerate
	.lookup<ffi.NativeFunction<CblasSgemmNative>>('cblas_sgemm')
	.asFunction<CblasSgemmDart>();

	const n = 1024;
	const size = n * n;

	// Allocate memory using calloc (from package:ffi)
	final A = calloc<ffi.Float>(size);
	final B = calloc<ffi.Float>(size);
	final C = calloc<ffi.Float>(size);

	// Fill with random data
	for (int i = 0; i < size; i++) {
	A[i] = 0.5;
	B[i] = 0.5;
	}

	final stopwatch = Stopwatch()..start();

	cblas_sgemm(
	101, // CblasRowMajor
	111, // CblasNoTrans
	111, // CblasNoTrans
	n,
	n,
	n,
	1.0,
	A,
	n,
	B,
	n,
	0.0,
	C,
	n,
	);

	stopwatch.stop();
	print(
	'Dart + Accelerate: ${(stopwatch.elapsedMicroseconds / 1000000).toStringAsFixed(3)}s',
	);

	// Cleanup
	calloc.free(A);
	calloc.free(B);
	calloc.free(C);
	}
	package main

	/*
	#cgo LDFLAGS: -framework Accelerate
	#include <Accelerate/Accelerate.h>
	*/
	import "C"
	import (
	"fmt"
	"math/rand"
	"time"
	"unsafe"
	)

	func multiplyBLAS(A, B, C []float32, n int) {
	// Convert Go slices to C pointers
	pA := (*C.float)(unsafe.Pointer(&A[0]))
	pB := (*C.float)(unsafe.Pointer(&B[0]))
	pC := (*C.float)(unsafe.Pointer(&C[0]))

	// Call cblas_sgemm from Accelerate
	C.cblas_sgemm(
	C.CblasRowMajor,
	C.CblasNoTrans,
	C.CblasNoTrans,
	C.int(n),
	C.int(n),
	C.int(n),
	1.0,
	pA,
	C.int(n),
	pB,
	C.int(n),
	0.0,
	pC,
	C.int(n),
	)
	}

	func main() {
	n := 1024

	A := make([]float32, n*n)
	B := make([]float32, n*n)
	C := make([]float32, n*n)

	// Fill with random data
	for i := range A {
	A[i] = rand.Float32()
	B[i] = rand.Float32()
	}

	start := time.Now()
	multiplyBLAS(A, B, C, n)
	elapsed := time.Since(start)

	fmt.Printf("Go + Accelerate: %.3fs\n", elapsed.Seconds())
	}
	import { dlopen, FFIType, suffix } from 'bun:ffi';

	// Load Accelerate framework
	const lib = dlopen('/System/Library/Frameworks/Accelerate.framework/Accelerate', {
	cblas_sgemm: {
	args: [
	FFIType.i32, // Order
	FFIType.i32, // TransA
	FFIType.i32, // TransB
	FFIType.i32, // M
	FFIType.i32, // N
	FFIType.i32, // K
	FFIType.f32, // alpha
	FFIType.ptr, // A
	FFIType.i32, // lda
	FFIType.ptr, // B
	FFIType.i32, // ldb
	FFIType.f32, // beta
	FFIType.ptr, // C
	FFIType.i32, // ldc
	],
	returns: FFIType.void,
	},
	});

	function multiplyBLAS(n) {
	const size = n * n;

	// Allocate aligned memory
	const A = new Float32Array(size);
	const B = new Float32Array(size);
	const C = new Float32Array(size);

	// Fill with random data
	for (let i = 0; i < size; i++) {
	A[i] = Math.random();
	B[i] = Math.random();
	}

	const start = performance.now();

	lib.symbols.cblas_sgemm(
	101, // CblasRowMajor
	111, // CblasNoTrans
	111, // CblasNoTrans
	n,
	n,
	n,
	1.0,
	A,
	n,
	B,
	n,
	0.0,
	C,
	n,
	);

	const end = performance.now();
	console.log(`Bun + Accelerate: ${((end - start) / 1000).toFixed(3)}s`);
	}

	multiplyBLAS(1024);
	#include <stdio.h>
	#include <stdlib.h>
	#include <time.h>
	#include <math.h>
	// REQUIRED for the M1/Apple Silicon native BLAS implementation (Accelerate Framework)
	// gcc -O3 -Wall -Wextra Matmul.c -o Matmul -lm -framework Accelerate

	#include "cblas.h"

	// --- Utility Functions ---

	/**
	* @brief Finds the minimum of two integers.
	*/
	int min(int a, int b) {
	return (a < b) ? a : b;
	}

	/**
	* @brief Generates a flat, row-major N x M matrix with random floats (0.0 to 1.0).
	* @param rows The number of rows (N).
	* @param cols The number of columns (M).
	* @return A pointer to the allocated float array. Must be freed by the caller.
	*/
	float *generate_random_matrix(int rows, int cols) {
	float matrix = (float )malloc(rows * cols * sizeof(float));
	if (matrix == NULL) {
	perror("Memory allocation failed for matrix");
	exit(EXIT_FAILURE);
	}

	for (int i = 0; i < rows * cols; i++) {
	// Generate random float between 0.0 and 1.0
	matrix[i] = (float)rand() / (float)RAND_MAX;
	}
	return matrix;
	}

	/**
	* @brief Creates the transpose of a matrix (B_T from B).
	* @param B The original K x M matrix.
	* @param K, M The dimensions of B.
	* @return A pointer to the allocated M x K transposed float array (B_T).
	*/
	float transpose_matrix(const float B, int K, int M) {
	float B_T = (float )malloc(M * K * sizeof(float));
	if (B_T == NULL) {
	perror("Memory allocation failed for transposed matrix");
	exit(EXIT_FAILURE);
	}

	for (int k = 0; k < K; k++) {
	for (int m = 0; m < M; m++) {
	// B[k][m] goes to B_T[m][k]
	B_T[m * K + k] = B[k * M + m];
	}
	}
	return B_T;
	}


	// --- Matrix Multiplication Versions ---

	// Define the function pointer type for matrix multiplication.
	// Use '__restrict' keyword for all pointers to assist the M1's compiler (clang).
	typedef void (mm_func)(const float __restrict, const float __restrict, float __restrict, int, int, int, int);

	/**
	* @brief Version 1: Naive (ijk order) matrix multiplication.
	*/
	void multiply_naive_ijk(const float __restrict A, const float __restrict B, float *__restrict C, int N, int K, int M, int tile_size) {
	(void)tile_size; // Suppress unused parameter warning
	for (int i = 0; i < N; i++) {
	for (int j = 0; j < M; j++) {
	float sum = 0.0f;
	for (int k = 0; k < K; k++) {
	// A[i][k] * B[k][j]. B access is column-major (cache-unfriendly)
	sum += A[i * K + k] * B[k * M + j];
	}
	C[i * M + j] = sum;
	}
	}
	}

	/**
	* @brief Version 2: Reordered (ikj order - cache friendly for A and C).
	*/
	void multiply_reordered_ikj(const float __restrict A, const float __restrict B, float *__restrict C, int N, int K, int M, int tile_size) {
	(void)tile_size; // Suppress unused parameter warning
	for (int i = 0; i < N; i++) {
	int c_row_offset = i * M;
	for (int k = 0; k < K; k++) {
	// Load A[i][k] once per inner loop
	float a_val = A[i * K + k];
	int b_row_offset = k * M;

	// Inner loop iterates over B row and C row (cache-friendly strides)
	for (int j = 0; j < M; j++) {
	C[c_row_offset + j] += a_val * B[b_row_offset + j];
	}
	}
	}
	}

	/**
	* @brief Version 3: Tiled/Blocked matrix multiplication (ikj order inside blocks).
	* Exploits L1/L2 cache locality with block size.
	* @param TILE_SIZE The size of the square tile to use.
	*/
	void multiply_tiled(const float __restrict A, const float __restrict B, float *__restrict C, int N, int K, int M, int TILE_SIZE) {
	for (int i_tile = 0; i_tile < N; i_tile += TILE_SIZE) {
	for (int j_tile = 0; j_tile < M; j_tile += TILE_SIZE) {
	for (int k_tile = 0; k_tile < K; k_tile += TILE_SIZE) {

	int i_end = min(i_tile + TILE_SIZE, N);
	int j_end = min(j_tile + TILE_SIZE, M);
	int k_end = min(k_tile + TILE_SIZE, K);

	for (int i = i_tile; i < i_end; i++) {
	int c_row_offset = i * M;
	for (int k = k_tile; k < k_end; k++) {
	float a_val = A[i * K + k];
	int b_row_offset = k * M;
	for (int j = j_tile; j < j_end; j++) {
	C[c_row_offset + j] += a_val * B[b_row_offset + j];
	}
	}
	}
	}
	}
	}
	}

	/**
	* @brief Version 4: Transposed Dot Product (A * B_T). M1 Optimized.
	* This function expects B to be pre-transposed and K/M to be swapped for B_T.
	* @param A Matrix N x K.
	* @param B_T Matrix M x K (transposed B).
	* @param C Matrix N x M.
	* @param N, K, M Dimensions.
	*/
	void multiply_transposed_dot(const float __restrict A, const float __restrict B_T, float *__restrict C, int N, int K, int M, int tile_size) {
	(void)tile_size; // Suppress unused parameter warning
	// This turns the inner loop into a highly cache-friendly dot product of:
	// Row i of A (length K) and Row j of B_T (which is Column j of B, length K)
	for (int i = 0; i < N; i++) {
	for (int j = 0; j < M; j++) {
	float sum = 0.0f;
	int a_offset = i * K;
	int b_t_offset = j * K;

	// Inner loop iterates over K (Dot Product) - all accesses are sequential
	for (int k = 0; k < K; k++) {
	// A[i][k] * B_T[j][k]
	sum += A[a_offset + k] * B_T[b_t_offset + k];
	}
	C[i * M + j] = sum;
	}
	}
	}

	/**
	* @brief Version 5: Transposed Dot Product with Loop Unrolling (Factor 4).
	* Unrolls the innermost dot product loop (k-loop) by a factor of 4 to reduce
	* loop overhead and potentially expose more ILP (Instruction-Level Parallelism).
	* This is the version you requested to include!
	*/
	void multiply_transposed_unrolled(const float __restrict A, const float __restrict B_T, float *__restrict C, int N, int K, int M, int tile_size) {
	(void)tile_size; // Suppress unused parameter warning

	// K_mod is the number of iterations that can be unrolled (must be a multiple of 4)
	int K_mod = K - (K % 4);

	for (int i = 0; i < N; i++) {
	for (int j = 0; j < M; j++) {
	float sum = 0.0f;
	int a_offset = i * K;
	int b_t_offset = j * K;

	int k = 0;

	// 1. Unrolled loop (k = 0 to K_mod - 1)
	for (; k < K_mod; k += 4) {
	// Calculate four multiplications and accumulations in one loop iteration
	sum += A[a_offset + k] * B_T[b_t_offset + k];
	sum += A[a_offset + k + 1] * B_T[b_t_offset + k + 1];
	sum += A[a_offset + k + 2] * B_T[b_t_offset + k + 2];
	sum += A[a_offset + k + 3] * B_T[b_t_offset + k + 3];
	}

	// 2. Cleanup loop (k = K_mod to K - 1) for remaining elements
	for (; k < K; k++) {
	sum += A[a_offset + k] * B_T[b_t_offset + k];
	}

	C[i * M + j] = sum;
	}
	}
	}

	/**
	* @brief Version 6: Apple Accelerate/BLAS Optimized Matmul (cblas_sgemm).
	* This is the vendor-optimized method for peak performance on M1/M2/M3 chips.
	* This function performs C = alpha * A * B + beta * C.
	*/
	void multiply_cblas_sgemm(const float __restrict A, const float __restrict B, float *__restrict C, int N, int K, int M, int tile_size) {
	(void)tile_size; // Suppress unused parameter warning

	const float alpha = 1.0f; // Scale factor for A * B
	const float beta = 0.0f; // Scale factor for C (0.0 since C is calloc'd)

	// CblasRowMajor: Matrices A, B, and C are stored in row-major order (C-style)
	// CblasNoTrans: Neither A nor B is transposed
	cblas_sgemm(CblasRowMajor,
	CblasNoTrans,
	CblasNoTrans,
	// M, N, K refer to the dimensions in the CblasRowMajor context:
	N, // M: rows of A and C
	M, // N: columns of B and C
	K, // K: inner dimension (columns of A / rows of B)
	alpha,
	A,
	K, // lda (leading dimension of A): K columns
	B,
	M, // ldb (leading dimension of B): M columns
	beta,
	C,
	M); // ldc (leading dimension of C): M columns
	}


	// --- Benchmark Runner ---

	/**
	* @brief Benchmarks a matrix multiplication function.
	* Runs 3 warmup iterations and 5 measured iterations, returning the best time.
	* @param name The name of the test.
	* @param func The matrix multiplication function to test.
	* @param A, B The input matrices.
	* @param N, K, M The dimensions.
	* @param tile_size The tile size to pass (used only by tiled version).
	* @return The best time measured in seconds.
	*/
	double benchmark_runner(const char name, mm_func func, const float A, const float *B, int N, int K, int M, int tile_size) {

	// Silence 'unused parameter' warning, as printing is handled in main()
	(void)name;

	// Warmup (3 times)
	for (int i = 0; i < 3; i++) {
	// calloc zeroes memory which is necessary for C += ... operations
	float C_dummy = (float )calloc(N * M, sizeof(float));
	if (C_dummy == NULL) {
	perror("Warmup memory allocation failed");
	exit(EXIT_FAILURE);
	}

	func(A, B, C_dummy, N, K, M, tile_size);

	free(C_dummy);
	}

	// Measure (take best of 5)
	double best_time = 0.0;

	for (int i = 0; i < 5; i++) {
	// Allocate and zero the result matrix C for each run
	float C = (float )calloc(N * M, sizeof(float));
	if (C == NULL) {
	perror("Measurement memory allocation failed");
	exit(EXIT_FAILURE);
	}

	// Use clock() for CPU time measurement
	clock_t start = clock();

	func(A, B, C, N, K, M, tile_size);

	clock_t end = clock();

	free(C);

	double elapsed_sec = (double)(end - start) / CLOCKS_PER_SEC;

	if (i == 0 \|\| elapsed_sec < best_time) {
	best_time = elapsed_sec;
	}
	}

	return best_time;
	}

	// --- Main Program ---

	int main() {
	// Seed the random number generator once
	srand((unsigned int)time(NULL));

	printf("C17 Matrix Multiplication Benchmark (M1 Pro Optimized)\n");
	printf("====================================================\n\n");

	int sizes[] = {256, 512, 1024, 2048}; // Added 2048 to push the M1's cache limits
	int num_sizes = sizeof(sizes) / sizeof(sizes[0]);
	int TILE_SIZE = 48; // Adjusted tile size to be slightly more aggressive for L1/L2 cache

	for (int s = 0; s < num_sizes; s++) {
	int size = sizes[s];

	// For square matrices, N=K=M=size
	int N = size;
	int K = size;
	int M = size;

	printf("======================================================================\n");
	printf("Matrix size: %d x %d (Total Data: %.1f MB)\n", size, size, (double)(3.0 * size * size * sizeof(float)) / (1024.0 * 1024.0));
	printf("======================================================================\n");

	// Generate matrices A and B
	float *A = generate_random_matrix(N, K);
	float *B = generate_random_matrix(K, M);

	// Allocate B_T once for transposed benchmarks
	printf("\nPreprocessing (Transpose B)... ");
	float *B_T = transpose_matrix(B, K, M);
	printf("Done.\n");

	printf("\n🔥 Warmup + Benchmark (3 warmup, 5 measured, best time):\n\n");

	double reordered_time = 0.0;
	double current_time = 0.0;
	double transposed_time = 0.0;
	double unrolled_time = 0.0;
	double cblas_time = 0.0;


	// 1. Naive (ijk) - Skip for large matrices
	if (size <= 512) {
	current_time = benchmark_runner("1. Naive (ijk)", multiply_naive_ijk, A, B, N, K, M, 0);
	printf("1. Naive (ijk): %.3fs\n", current_time);
	} else {
	printf("1. Naive (ijk): [skipped - too slow]\n");
	}


	// 2. Reordered (ikj) - BASELINE
	reordered_time = benchmark_runner("2. Reordered (ikj)", multiply_reordered_ikj, A, B, N, K, M, 0);
	printf("2. Reordered (ikj): %.3fs\n", reordered_time);

	// 3. Tiled/Blocked
	current_time = benchmark_runner("3. Tiled", multiply_tiled, A, B, N, K, M, TILE_SIZE);
	printf("3. Tiled (%d): %.3fs (%.2fx)\n",
	TILE_SIZE, current_time, reordered_time / current_time);

	// 4. Transposed Dot Product (using B_T)
	transposed_time = benchmark_runner("4. Transposed Dot Product", multiply_transposed_dot, A, B_T, N, K, M, 0);
	printf("4. Transposed Dot Product: %.3fs (%.2fx)\n",
	transposed_time, reordered_time / transposed_time);

	// 5. Transposed Dot Product with Unrolling (using B_T)
	unrolled_time = benchmark_runner("5. Transposed Unrolled Dot", multiply_transposed_unrolled, A, B_T, N, K, M, 0);
	printf("5. Transposed Unrolled Dot (x4): %.3fs (%.2fx)\n",
	unrolled_time, reordered_time / unrolled_time);

	// 6. M1 Native BLAS (cblas_sgemm)
	cblas_time = benchmark_runner("6. M1 Native BLAS (cblas_sgemm)", multiply_cblas_sgemm, A, B, N, K, M, 0);
	printf("6. M1 Native BLAS (cblas_sgemm): %.3fs (%.2fx) <--- NATIVE SPEED\n",
	cblas_time, reordered_time / cblas_time);


	// Clean up memory for the current size
	free(A);
	free(B);
	free(B_T); // Free the transposed matrix

	printf("\n");
	}

	printf("====================================================\n");
	printf("Benchmark Complete.\n");

	return 0;
	}
	import 'dart:math';
	import 'dart:typed_data';
	import 'dart:io';

	// A record to hold the name and time of a benchmark result.
	typedef BenchmarkResult = ({String name, double time});

	/// Generates a matrix of a given size with random double values.
	List<List<double>> randomMatrix(int rows, int cols) {
	final random = Random();
	return List.generate(
	rows,
	(_) => List.generate(cols, (_) => random.nextDouble()),
	);
	}

	/// Version 1: Naive (ijk order) matrix multiplication.
	List<List<double>> naiveMultiply(List<List<double>> A, List<List<double>> B) {
	final n = A.length;
	final k = A[0].length;
	final m = B[0].length;
	final C = List.generate(n, (_) => List.filled(m, 0.0));

	for (int i = 0; i < n; i++) {
	for (int j = 0; j < m; j++) {
	double sum = 0.0;
	for (int kIdx = 0; kIdx < k; kIdx++) {
	sum += A[i][kIdx] * B[kIdx][j];
	}
	C[i][j] = sum;
	}
	}
	return C;
	}

	/// Version 2: Reordered (ikj order) for better cache performance.
	List<List<double>> reorderedMultiply(
	List<List<double>> A,
	List<List<double>> B,
	) {
	final n = A.length;
	final k = A[0].length;
	final m = B[0].length;
	final C = List.generate(n, (_) => List.filled(m, 0.0));

	for (int i = 0; i < n; i++) {
	for (int kIdx = 0; kIdx < k; kIdx++) {
	final aVal = A[i][kIdx];
	for (int j = 0; j < m; j++) {
	C[i][j] += aVal * B[kIdx][j];
	}
	}
	}
	return C;
	}

	/// Version 3: Tiled/Blocked matrix multiplication.
	List<List<double>> tiledMultiply(
	List<List<double>> A,
	List<List<double>> B,
	int tileSize,
	) {
	final n = A.length;
	final k = A[0].length;
	final m = B[0].length;
	final C = List.generate(n, (_) => List.filled(m, 0.0));

	for (int iTile = 0; iTile < n; iTile += tileSize) {
	for (int jTile = 0; jTile < m; jTile += tileSize) {
	for (int kTile = 0; kTile < k; kTile += tileSize) {
	final iEnd = min(iTile + tileSize, n);
	final jEnd = min(jTile + tileSize, m);
	final kEnd = min(kTile + tileSize, k);

	for (int i = iTile; i < iEnd; i++) {
	for (int kIdx = kTile; kIdx < kEnd; kIdx++) {
	final aVal = A[i][kIdx];
	for (int j = jTile; j < jEnd; j++) {
	C[i][j] += aVal * B[kIdx][j];
	}
	}
	}
	}
	}
	}
	return C;
	}

	/// Version 4: Manual loop unrolling by a factor of 4.
	List<List<double>> unrolled4Multiply(
	List<List<double>> A,
	List<List<double>> B,
	) {
	final n = A.length;
	final k = A[0].length;
	final m = B[0].length;
	final C = List.generate(n, (_) => List.filled(m, 0.0));

	for (int i = 0; i < n; i++) {
	for (int kIdx = 0; kIdx < k; kIdx++) {
	final aVal = A[i][kIdx];
	int j = 0;

	// Process 4 elements at a time
	for (; j <= m - 4; j += 4) {
	C[i][j] += aVal * B[kIdx][j];
	C[i][j + 1] += aVal * B[kIdx][j + 1];
	C[i][j + 2] += aVal * B[kIdx][j + 2];
	C[i][j + 3] += aVal * B[kIdx][j + 3];
	}

	// Handle the remainder
	for (; j < m; j++) {
	C[i][j] += aVal * B[kIdx][j];
	}
	}
	}
	return C;
	}

	/// Version 5: Uses flat Float32List to avoid 2D list overhead.
	Float32List flatArrayMultiply(
	Float32List A,
	Float32List B,
	int n,
	int k,
	int m,
	) {
	final C = Float32List(n * m); // Automatically initialized to 0

	for (int i = 0; i < n; i++) {
	for (int kIdx = 0; kIdx < k; kIdx++) {
	final aVal = A[i * k + kIdx];
	final bRowOffset = kIdx * m;
	final cRowOffset = i * m;

	for (int j = 0; j < m; j++) {
	C[cRowOffset + j] += aVal * B[bRowOffset + j];
	}
	}
	}
	return C;
	}

	/// Helper to convert a 2D list to a flat Float32List.
	Float32List toFlatList(List<List<double>> matrix) {
	final rows = matrix.length;
	if (rows == 0) return Float32List(0);
	final cols = matrix[0].length;
	final flat = Float32List(rows * cols);

	for (int i = 0; i < rows; i++) {
	for (int j = 0; j < cols; j++) {
	flat[i * cols + j] = matrix[i][j];
	}
	}
	return flat;
	}

	/// Runs a function multiple times to allow the JIT compiler to warm up.
	void warmupFunction(String name, void Function() fn) {
	print(' Warming up: $name...');
	const iterations = 10;
	for (int i = 0; i < iterations; i++) {
	fn();
	}
	}

	/// Benchmarks a function by running it multiple times and taking the minimum time.
	BenchmarkResult benchmark(String name, Function fn) {
	const iterations = 5;
	final times = <double>[];
	final stopwatch = Stopwatch();

	for (int i = 0; i < iterations; i++) {
	stopwatch.start();
	fn();
	stopwatch.stop();
	// Convert microseconds to seconds
	times.add(stopwatch.elapsedMicroseconds / 1000000.0);
	stopwatch.reset();
	}

	// Take the minimum time (best run = least interference)
	final time = times.reduce(min);

	return (name: name, time: time);
	}

	/// Main test runner with proper warmup.
	void runTests() {
	print('Dart Matrix Multiplication - PROPER WARMUP VERSION\n');
	print('Runtime: Dart ${Platform.version.split(' ').first}');
	print('Platform: ${Platform.operatingSystem}\n');

	final sizes = [256, 512, 1024];

	for (final size in sizes) {
	print('='.padRight(70, '='));
	final mem = (3 * size * size * 4) / (1024 * 1024);
	print('Matrix size: ${size} x ${size} (${mem.toStringAsFixed(1)} MB)');
	print('='.padRight(70, '='));

	final A = randomMatrix(size, size);
	final B = randomMatrix(size, size);
	final A_flat = toFlatList(A);
	final B_flat = toFlatList(B);

	print('\n🔥 WARMUP PHASE (10 iterations each):');

	// Warmup all functions
	if (size <= 512) {
	warmupFunction('naiveMultiply', () => naiveMultiply(A, B));
	}
	warmupFunction('reorderedMultiply', () => reorderedMultiply(A, B));
	warmupFunction('tiledMultiply', () => tiledMultiply(A, B, 64));
	warmupFunction('unrolled4Multiply', () => unrolled4Multiply(A, B));
	warmupFunction(
	'flatArrayMultiply',
	() => flatArrayMultiply(A_flat, B_flat, size, size, size),
	);

	print('\n⚡ BENCHMARKING (5 runs, taking best):');

	final results = <BenchmarkResult>[];

	// Test 1: Naive
	if (size <= 512) {
	final r1 = benchmark('Naive (ijk)', () => naiveMultiply(A, B));
	results.add(r1);
	print('1. Naive (ijk): ${r1.time.toStringAsFixed(3)}s');
	} else {
	print('1. Naive (ijk): [skipped - too slow]');
	}

	// Test 2: Reordered (Baseline)
	final r2 = benchmark('Reordered (ikj)', () => reorderedMultiply(A, B));
	results.add(r2);
	print('2. Reordered (ikj): ${r2.time.toStringAsFixed(3)}s');
	final baseTime = r2.time;

	// Test 3: Tiled
	const tileSize = 64;
	final r3 = benchmark(
	'Tiled (${tileSize}x$tileSize)',
	() => tiledMultiply(A, B, tileSize),
	);
	results.add(r3);
	print(
	'3. Tiled ($tileSize): ${r3.time.toStringAsFixed(3)}s (${(baseTime / r3.time).toStringAsFixed(2)}x)',
	);

	// Test 4: Unrolled
	final r4 = benchmark('Unrolled 4x', () => unrolled4Multiply(A, B));
	results.add(r4);
	print(
	'4. Unrolled 4x: ${r4.time.toStringAsFixed(3)}s (${(baseTime / r4.time).toStringAsFixed(2)}x)',
	);

	// Test 5: Flat arrays (TypedData)
	final r5 = benchmark(
	'Flat Float32List',
	() => flatArrayMultiply(A_flat, B_flat, size, size, size),
	);
	results.add(r5);
	print(
	'5. Flat Float32List: ${r5.time.toStringAsFixed(3)}s (${(baseTime / r5.time).toStringAsFixed(2)}x)',
	);

	// Find the best performing version
	final best = results.reduce((a, b) => a.time < b.time ? a : b);
	final baseline = results.firstWhere((r) => r.name == 'Reordered (ikj)');

	print('\n✨ Winner: ${best.name} (${best.time.toStringAsFixed(3)}s)');
	print(
	'📊 Speedup vs Reordered: ${(baseline.time / best.time).toStringAsFixed(2)}x\n',
	);
	}
	}

	void main() {
	runTests();
	}
	public class Matmul {

	public static void main(String[] args) {
	System.out.println("Matrix Multiplication - Optimized for M1 (128-byte cache lines)\n");

	// Cache line holds 128 bytes / 4 bytes per float = 32 floats
	final int FLOATS_PER_CACHE_LINE = 32;

	System.out.println("M1 Architecture:");
	System.out.println(" Cache line size: 128 bytes");
	System.out.println(" Floats per cache line: " + FLOATS_PER_CACHE_LINE);
	System.out.println();

	int[] sizes = { 512, 1024, 2048 };

	for (int size : sizes) {
	System.out.println("=".repeat(70));
	System.out.printf("Matrix size: %d x %d (%.1f MB total)\n", size, size, (3.0 * size * size * 4) / (1024 * 1024));
	System.out.println("=".repeat(70));

	float[][] A = randomMatrix(size, size);
	float[][] B = randomMatrix(size, size);

	// 1. Naive (skip for large matrices)
	if (size <= 512) {
	long start = System.nanoTime();
	float[][] C1 = naiveMultiply(A, B);
	double naiveTime = (System.nanoTime() - start) / 1e9;
	System.out.printf("1. Naive: %.3f seconds (baseline)\n", naiveTime);
	} else {
	System.out.println("1. Naive: [skipped - too slow!]");
	}

	// 2. Reordered
	long start = System.nanoTime();
	float[][] C2 = reorderedMultiply(A, B);
	double reorderedTime = (System.nanoTime() - start) / 1e9;
	System.out.printf("2. Reordered: %.3f seconds\n", reorderedTime);

	// 3. Cache-line aligned (process 32 floats at a time)
	start = System.nanoTime();
	float[][] C3 = cacheLineAligned(A, B);
	double alignedTime = (System.nanoTime() - start) / 1e9;
	System.out.printf("3. Cache-line aligned: %.3f seconds (%.2fx vs reordered)\n", alignedTime, reorderedTime / alignedTime);

	// 4. Try different tile sizes (M1 has huge caches!)
	System.out.println("4. Tiled versions:");
	int[] tileSizes = { 64, 128, 256, 512 };
	double bestTime = Double.MAX_VALUE;
	int bestTile = 0;

	for (int tileSize : tileSizes) {
	start = System.nanoTime();
	float[][] C4 = tiledMultiply(A, B, tileSize);
	double tiledTime = (System.nanoTime() - start) / 1e9;
	System.out.printf(" Tile size %4d: %.3f seconds (%.2fx vs reordered)\n", tileSize, tiledTime, reorderedTime / tiledTime);

	if (tiledTime < bestTime) {
	bestTime = tiledTime;
	bestTile = tileSize;
	}
	}

	System.out.printf("\n✨ Best configuration: Tile size %d (%.2fx faster than reordered)\n\n", bestTile, reorderedTime / bestTime);
	}
	}

	// NAIVE
	static float[][] naiveMultiply(float[][] A, float[][] B) {
	int n = A.length,
	k = A[0].length,
	m = B[0].length;
	float[][] C = new float[n][m];
	for (int row = 0; row < n; row++) {
	for (int col = 0; col < m; col++) {
	for (int i = 0; i < k; i++) {
	C[row][col] += A[row][i] * B[i][col];
	}
	}
	}
	return C;
	}

	// REORDERED
	static float[][] reorderedMultiply(float[][] A, float[][] B) {
	int n = A.length,
	k = A[0].length,
	m = B[0].length;
	float[][] C = new float[n][m];
	for (int row = 0; row < n; row++) {
	for (int i = 0; i < k; i++) {
	float aValue = A[row][i];
	for (int col = 0; col < m; col++) {
	C[row][col] += aValue * B[i][col];
	}
	}
	}
	return C;
	}

	// CACHE-LINE ALIGNED: Process 32 floats at once (128 bytes)
	static float[][] cacheLineAligned(float[][] A, float[][] B) {
	int n = A.length,
	k = A[0].length,
	m = B[0].length;
	float[][] C = new float[n][m];

	final int BLOCK = 32; // 32 floats = 128 bytes = 1 cache line on M1

	for (int row = 0; row < n; row++) {
	for (int i = 0; i < k; i++) {
	float aValue = A[row][i];

	// Process 32 columns at a time (1 cache line)
	int col = 0;
	for (; col <= m - BLOCK; col += BLOCK) {
	// These 32 accesses hit the same cache line!
	for (int j = 0; j < BLOCK; j++) {
	C[row][col + j] += aValue * B[i][col + j];
	}
	}

	// Handle remaining columns
	for (; col < m; col++) {
	C[row][col] += aValue * B[i][col];
	}
	}
	}
	return C;
	}

	// TILED
	static float[][] tiledMultiply(float[][] A, float[][] B, int tileSize) {
	int n = A.length,
	k = A[0].length,
	m = B[0].length;
	float[][] C = new float[n][m];

	for (int rowTile = 0; rowTile < n; rowTile += tileSize) {
	for (int colTile = 0; colTile < m; colTile += tileSize) {
	for (int innerTile = 0; innerTile < k; innerTile += tileSize) {
	int rowEnd = Math.min(rowTile + tileSize, n);
	int colEnd = Math.min(colTile + tileSize, m);
	int innerEnd = Math.min(innerTile + tileSize, k);

	for (int row = rowTile; row < rowEnd; row++) {
	for (int i = innerTile; i < innerEnd; i++) {
	float aValue = A[row][i];
	for (int col = colTile; col < colEnd; col++) {
	C[row][col] += aValue * B[i][col];
	}
	}
	}
	}
	}
	}
	return C;
	}

	static float[][] randomMatrix(int rows, int cols) {
	float[][] matrix = new float[rows][cols];
	for (int i = 0; i < rows; i++) {
	for (int j = 0; j < cols; j++) {
	matrix[i][j] = (float) Math.random();
	}
	}
	return matrix;
	}
	}
	// Matrix Multiplication with PROPER WARMUP

	function randomMatrix(rows, cols) {
	const matrix = [];
	for (let i = 0; i < rows; i++) {
	matrix[i] = [];
	for (let j = 0; j < cols; j++) {
	matrix[i][j] = Math.random();
	}
	}
	return matrix;
	}

	// Version 1: Naive (ijk order)
	function naiveMultiply(A, B) {
	const n = A.length;
	const k = A[0].length;
	const m = B[0].length;
	const C = Array(n)
	.fill(0)
	.map(() => Array(m).fill(0));

	for (let i = 0; i < n; i++) {
	for (let j = 0; j < m; j++) {
	let sum = 0;
	for (let k_idx = 0; k_idx < k; k_idx++) {
	sum += A[i][k_idx] * B[k_idx][j];
	}
	C[i][j] = sum;
	}
	}
	return C;
	}

	// Version 2: Reordered (ikj order - cache friendly)
	function reorderedMultiply(A, B) {
	const n = A.length;
	const k = A[0].length;
	const m = B[0].length;
	const C = Array(n)
	.fill(0)
	.map(() => Array(m).fill(0));

	for (let i = 0; i < n; i++) {
	for (let k_idx = 0; k_idx < k; k_idx++) {
	const aVal = A[i][k_idx];
	for (let j = 0; j < m; j++) {
	C[i][j] += aVal * B[k_idx][j];
	}
	}
	}
	return C;
	}

	// Version 3: Tiled/Blocked
	function tiledMultiply(A, B, tileSize) {
	const n = A.length;
	const k = A[0].length;
	const m = B[0].length;
	const C = Array(n)
	.fill(0)
	.map(() => Array(m).fill(0));

	for (let iTile = 0; iTile < n; iTile += tileSize) {
	for (let jTile = 0; jTile < m; jTile += tileSize) {
	for (let kTile = 0; kTile < k; kTile += tileSize) {
	const iEnd = Math.min(iTile + tileSize, n);
	const jEnd = Math.min(jTile + tileSize, m);
	const kEnd = Math.min(kTile + tileSize, k);

	for (let i = iTile; i < iEnd; i++) {
	for (let k_idx = kTile; k_idx < kEnd; k_idx++) {
	const aVal = A[i][k_idx];
	for (let j = jTile; j < jEnd; j++) {
	C[i][j] += aVal * B[k_idx][j];
	}
	}
	}
	}
	}
	}
	return C;
	}

	// Version 4: Manual unroll 4x
	function unrolled4Multiply(A, B) {
	const n = A.length;
	const k = A[0].length;
	const m = B[0].length;
	const C = Array(n)
	.fill(0)
	.map(() => Array(m).fill(0));

	for (let i = 0; i < n; i++) {
	for (let k_idx = 0; k_idx < k; k_idx++) {
	const aVal = A[i][k_idx];
	let j = 0;

	// Process 4 at a time
	for (; j <= m - 4; j += 4) {
	C[i][j] += aVal * B[k_idx][j];
	C[i][j + 1] += aVal * B[k_idx][j + 1];
	C[i][j + 2] += aVal * B[k_idx][j + 2];
	C[i][j + 3] += aVal * B[k_idx][j + 3];
	}

	// Handle remainder
	for (; j < m; j++) {
	C[i][j] += aVal * B[k_idx][j];
	}
	}
	}
	return C;
	}

	// Version 5: Flat arrays (avoid 2D array overhead)
	function flatArrayMultiply(A, B, n, k, m) {
	const C = new Float32Array(n * m);

	for (let i = 0; i < n; i++) {
	for (let k_idx = 0; k_idx < k; k_idx++) {
	const aVal = A[i * k + k_idx];
	const bRowOffset = k_idx * m;
	const cRowOffset = i * m;

	for (let j = 0; j < m; j++) {
	C[cRowOffset + j] += aVal * B[bRowOffset + j];
	}
	}
	}
	return C;
	}

	// Helper to convert 2D to flat
	function toFlatArray(matrix) {
	const rows = matrix.length;
	const cols = matrix[0].length;
	const flat = new Float32Array(rows * cols);

	for (let i = 0; i < rows; i++) {
	for (let j = 0; j < cols; j++) {
	flat[i * cols + j] = matrix[i][j];
	}
	}
	return flat;
	}

	// PROPER warmup function
	function warmupFunction(fn, ...args) {
	console.log(` Warming up: ${fn.name}...`);
	const iterations = 10;
	for (let i = 0; i < iterations; i++) {
	fn(...args);
	}
	}

	// Benchmark function with minimal overhead
	function benchmark(name, fn, ...args) {
	const iterations = 5;
	const times = [];

	for (let i = 0; i < iterations; i++) {
	const start = performance.now();
	const result = fn(...args);
	const end = performance.now();
	times.push((end - start) / 1000);
	}

	// Take the minimum time (best run = least interference)
	const time = Math.min(...times);

	return { name, time };
	}

	// Main test with proper warmup
	function runTests() {
	console.log('JavaScript Matrix Multiplication - PROPER WARMUP VERSION\n');
	console.log(`Runtime: ${typeof process !== 'undefined' ? 'Node.js ' + process.version : 'Browser'}`);
	console.log(`Platform: ${typeof process !== 'undefined' ? process.platform + ' ' + process.arch : 'Browser'}\n`);

	const sizes = [256, 512, 1024];

	for (const size of sizes) {
	console.log('='.repeat(70));
	console.log(`Matrix size: ${size} x ${size} (${((3 * size * size * 4) / (1024 * 1024)).toFixed(1)} MB)`);
	console.log('='.repeat(70));

	const A = randomMatrix(size, size);
	const B = randomMatrix(size, size);
	const A_flat = toFlatArray(A);
	const B_flat = toFlatArray(B);

	console.log('\n🔥 WARMUP PHASE (10 iterations each):');

	// Warmup all functions
	if (size <= 512) {
	warmupFunction(naiveMultiply, A, B);
	}
	warmupFunction(reorderedMultiply, A, B);
	warmupFunction(tiledMultiply, A, B, 64);
	warmupFunction(unrolled4Multiply, A, B);
	warmupFunction(flatArrayMultiply, A_flat, B_flat, size, size, size);

	console.log('\n⚡ BENCHMARKING (5 runs, taking best):');

	const results = [];

	// Test 1: Naive
	if (size <= 512) {
	const r1 = benchmark('Naive (ijk)', naiveMultiply, A, B);
	results.push(r1);
	console.log(`1. Naive (ijk): ${r1.time.toFixed(3)}s`);
	} else {
	console.log(`1. Naive (ijk): [skipped - too slow]`);
	}

	// Test 2: Reordered
	const r2 = benchmark('Reordered (ikj)', reorderedMultiply, A, B);
	results.push(r2);
	console.log(`2. Reordered (ikj): ${r2.time.toFixed(3)}s`);
	const baseTime = r2.time;

	// Test 3: Tiled
	const tileSize = 64;
	const r3 = benchmark(`Tiled (${tileSize}x${tileSize})`, tiledMultiply, A, B, tileSize);
	results.push(r3);
	console.log(`3. Tiled (${tileSize}): ${r3.time.toFixed(3)}s (${(baseTime / r3.time).toFixed(2)}x)`);

	// Test 4: Unrolled
	const r4 = benchmark('Unrolled 4x', unrolled4Multiply, A, B);
	results.push(r4);
	console.log(`4. Unrolled 4x: ${r4.time.toFixed(3)}s (${(baseTime / r4.time).toFixed(2)}x)`);

	// Test 5: Flat arrays (TypedArray)
	const r5 = benchmark('Flat Float32Array', flatArrayMultiply, A_flat, B_flat, size, size, size);
	results.push(r5);
	console.log(`5. Flat Float32Array: ${r5.time.toFixed(3)}s (${(baseTime / r5.time).toFixed(2)}x)`);

	// Find best
	const best = results.reduce((a, b) => (a.time < b.time ? a : b));
	const baseline = results.find(r => r.name === 'Reordered (ikj)');

	console.log(`\n✨ Winner: ${best.name} (${best.time.toFixed(3)}s)`);
	console.log(`📊 Speedup vs Reordered: ${(baseline.time / best.time).toFixed(2)}x\n`);
	}
	}

	// Run the tests
	runTests();
	public class VectorizationTest {

	public static void main(String[] args) {
	System.out.println("Testing Java Auto-Vectorization on M1\n");

	int size = 2048;
	float[][] A = randomMatrix(size, size);
	float[][] B = randomMatrix(size, size);

	// Warmup - let JIT do its magic
	System.out.println("Warming up JIT compiler...");
	for (int i = 0; i < 5; i++) {
	scalarVersion(A, B);
	vectorFriendly(A, B);
	}
	System.out.println("Warmup complete!\n");

	// Test 1: Scalar-friendly (hard to vectorize)
	long start = System.nanoTime();
	float[][] C1 = scalarVersion(A, B);
	double scalarTime = (System.nanoTime() - start) / 1e9;
	System.out.printf("1. Scalar-friendly version: %.3f seconds\n", scalarTime);

	// Test 2: Vector-friendly (easy to vectorize)
	start = System.nanoTime();
	float[][] C2 = vectorFriendly(A, B);
	double vectorTime = (System.nanoTime() - start) / 1e9;
	System.out.printf("2. Vector-friendly version: %.3f seconds\n", vectorTime);

	System.out.printf("\nSpeedup: %.2fx\n", scalarTime / vectorTime);

	if (vectorTime < scalarTime * 0.9) {
	System.out.println("✅ Likely vectorized! JIT is working!");
	} else {
	System.out.println("❌ No significant vectorization detected");
	}

	// Test 3: Manual unrolling
	start = System.nanoTime();
	float[][] C3 = manualUnroll4(A, B);
	double unrollTime = (System.nanoTime() - start) / 1e9;
	System.out.printf("\n3. Manual unroll (4x): %.3f seconds (%.2fx)\n", unrollTime, scalarTime / unrollTime);

	// Test 4: Aggressive unrolling
	start = System.nanoTime();
	float[][] C4 = manualUnroll8(A, B);
	double unroll8Time = (System.nanoTime() - start) / 1e9;
	System.out.printf("4. Manual unroll (8x): %.3f seconds (%.2fx)\n", unroll8Time, scalarTime / unroll8Time);
	}

	// Version 1: Hard to vectorize (dependencies)
	static float[][] scalarVersion(float[][] A, float[][] B) {
	int n = A.length;
	float[][] C = new float[n][n];

	for (int i = 0; i < n; i++) {
	for (int j = 0; j < n; j++) {
	float sum = 0;
	for (int k = 0; k < n; k++) {
	sum += A[i][k] * B[k][j]; // Reduces to single value
	}
	C[i][j] = sum;
	}
	}
	return C;
	}

	// Version 2: Easy to vectorize (independent operations)
	static float[][] vectorFriendly(float[][] A, float[][] B) {
	int n = A.length;
	float[][] C = new float[n][n];

	for (int i = 0; i < n; i++) {
	for (int k = 0; k < n; k++) {
	float aVal = A[i][k];
	// This inner loop is vectorizable!
	// Each C[i][j] update is independent
	for (int j = 0; j < n; j++) {
	C[i][j] += aVal * B[k][j];
	}
	}
	}
	return C;
	}

	// Version 3: Manual unroll 4x
	static float[][] manualUnroll4(float[][] A, float[][] B) {
	int n = A.length;
	float[][] C = new float[n][n];

	for (int i = 0; i < n; i++) {
	for (int k = 0; k < n; k++) {
	float aVal = A[i][k];
	int j = 0;

	// Process 4 at a time
	for (; j <= n - 4; j += 4) {
	C[i][j] += aVal * B[k][j];
	C[i][j + 1] += aVal * B[k][j + 1];
	C[i][j + 2] += aVal * B[k][j + 2];
	C[i][j + 3] += aVal * B[k][j + 3];
	}

	// Remainder
	for (; j < n; j++) {
	C[i][j] += aVal * B[k][j];
	}
	}
	}
	return C;
	}

	// Version 4: Manual unroll 8x
	static float[][] manualUnroll8(float[][] A, float[][] B) {
	int n = A.length;
	float[][] C = new float[n][n];

	for (int i = 0; i < n; i++) {
	for (int k = 0; k < n; k++) {
	float aVal = A[i][k];
	int j = 0;

	// Process 8 at a time
	for (; j <= n - 8; j += 8) {
	C[i][j] += aVal * B[k][j];
	C[i][j + 1] += aVal * B[k][j + 1];
	C[i][j + 2] += aVal * B[k][j + 2];
	C[i][j + 3] += aVal * B[k][j + 3];
	C[i][j + 4] += aVal * B[k][j + 4];
	C[i][j + 5] += aVal * B[k][j + 5];
	C[i][j + 6] += aVal * B[k][j + 6];
	C[i][j + 7] += aVal * B[k][j + 7];
	}

	// Remainder
	for (; j < n; j++) {
	C[i][j] += aVal * B[k][j];
	}
	}
	}
	return C;
	}

	static float[][] randomMatrix(int rows, int cols) {
	float[][] matrix = new float[rows][cols];
	for (int i = 0; i < rows; i++) {
	for (int j = 0; j < cols; j++) {
	matrix[i][j] = (float) Math.random();
	}
	}
	return matrix;
	}
	}