Skip to content

Instantly share code, notes, and snippets.

@corporatepiyush
Last active October 16, 2025 08:11
Show Gist options
  • Save corporatepiyush/12cb6b5d7864ce7a4f06361db3ea1298 to your computer and use it in GitHub Desktop.
Save corporatepiyush/12cb6b5d7864ce7a4f06361db3ea1298 to your computer and use it in GitHub Desktop.
Matrix Multiplication
// blas.dart
import 'dart:ffi' as ffi;
import 'dart:typed_data';
import 'package:ffi/ffi.dart'; // Need this package!
// Define the cblas_sgemm signature
typedef CblasSgemmNative =
ffi.Void Function(
ffi.Int32 Order,
ffi.Int32 TransA,
ffi.Int32 TransB,
ffi.Int32 M,
ffi.Int32 N,
ffi.Int32 K,
ffi.Float alpha,
ffi.Pointer<ffi.Float> A,
ffi.Int32 lda,
ffi.Pointer<ffi.Float> B,
ffi.Int32 ldb,
ffi.Float beta,
ffi.Pointer<ffi.Float> C,
ffi.Int32 ldc,
);
typedef CblasSgemmDart =
void Function(
int Order,
int TransA,
int TransB,
int M,
int N,
int K,
double alpha,
ffi.Pointer<ffi.Float> A,
int lda,
ffi.Pointer<ffi.Float> B,
int ldb,
double beta,
ffi.Pointer<ffi.Float> C,
int ldc,
);
void main() {
// Load Accelerate framework
final accelerate = ffi.DynamicLibrary.open(
'/System/Library/Frameworks/Accelerate.framework/Accelerate',
);
// Lookup cblas_sgemm
final cblas_sgemm = accelerate
.lookup<ffi.NativeFunction<CblasSgemmNative>>('cblas_sgemm')
.asFunction<CblasSgemmDart>();
const n = 1024;
const size = n * n;
// Allocate memory using calloc (from package:ffi)
final A = calloc<ffi.Float>(size);
final B = calloc<ffi.Float>(size);
final C = calloc<ffi.Float>(size);
// Fill with random data
for (int i = 0; i < size; i++) {
A[i] = 0.5;
B[i] = 0.5;
}
final stopwatch = Stopwatch()..start();
cblas_sgemm(
101, // CblasRowMajor
111, // CblasNoTrans
111, // CblasNoTrans
n,
n,
n,
1.0,
A,
n,
B,
n,
0.0,
C,
n,
);
stopwatch.stop();
print(
'Dart + Accelerate: ${(stopwatch.elapsedMicroseconds / 1000000).toStringAsFixed(3)}s',
);
// Cleanup
calloc.free(A);
calloc.free(B);
calloc.free(C);
}
package main
/*
#cgo LDFLAGS: -framework Accelerate
#include <Accelerate/Accelerate.h>
*/
import "C"
import (
"fmt"
"math/rand"
"time"
"unsafe"
)
func multiplyBLAS(A, B, C []float32, n int) {
// Convert Go slices to C pointers
pA := (*C.float)(unsafe.Pointer(&A[0]))
pB := (*C.float)(unsafe.Pointer(&B[0]))
pC := (*C.float)(unsafe.Pointer(&C[0]))
// Call cblas_sgemm from Accelerate
C.cblas_sgemm(
C.CblasRowMajor,
C.CblasNoTrans,
C.CblasNoTrans,
C.int(n),
C.int(n),
C.int(n),
1.0,
pA,
C.int(n),
pB,
C.int(n),
0.0,
pC,
C.int(n),
)
}
func main() {
n := 1024
A := make([]float32, n*n)
B := make([]float32, n*n)
C := make([]float32, n*n)
// Fill with random data
for i := range A {
A[i] = rand.Float32()
B[i] = rand.Float32()
}
start := time.Now()
multiplyBLAS(A, B, C, n)
elapsed := time.Since(start)
fmt.Printf("Go + Accelerate: %.3fs\n", elapsed.Seconds())
}
import { dlopen, FFIType, suffix } from 'bun:ffi';
// Load Accelerate framework
const lib = dlopen('/System/Library/Frameworks/Accelerate.framework/Accelerate', {
cblas_sgemm: {
args: [
FFIType.i32, // Order
FFIType.i32, // TransA
FFIType.i32, // TransB
FFIType.i32, // M
FFIType.i32, // N
FFIType.i32, // K
FFIType.f32, // alpha
FFIType.ptr, // A
FFIType.i32, // lda
FFIType.ptr, // B
FFIType.i32, // ldb
FFIType.f32, // beta
FFIType.ptr, // C
FFIType.i32, // ldc
],
returns: FFIType.void,
},
});
function multiplyBLAS(n) {
const size = n * n;
// Allocate aligned memory
const A = new Float32Array(size);
const B = new Float32Array(size);
const C = new Float32Array(size);
// Fill with random data
for (let i = 0; i < size; i++) {
A[i] = Math.random();
B[i] = Math.random();
}
const start = performance.now();
lib.symbols.cblas_sgemm(
101, // CblasRowMajor
111, // CblasNoTrans
111, // CblasNoTrans
n,
n,
n,
1.0,
A,
n,
B,
n,
0.0,
C,
n,
);
const end = performance.now();
console.log(`Bun + Accelerate: ${((end - start) / 1000).toFixed(3)}s`);
}
multiplyBLAS(1024);
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
// REQUIRED for the M1/Apple Silicon native BLAS implementation (Accelerate Framework)
// gcc -O3 -Wall -Wextra Matmul.c -o Matmul -lm -framework Accelerate
#include "cblas.h"
// --- Utility Functions ---
/**
* @brief Finds the minimum of two integers.
*/
int min(int a, int b) {
return (a < b) ? a : b;
}
/**
* @brief Generates a flat, row-major N x M matrix with random floats (0.0 to 1.0).
* @param rows The number of rows (N).
* @param cols The number of columns (M).
* @return A pointer to the allocated float array. Must be freed by the caller.
*/
float *generate_random_matrix(int rows, int cols) {
float *matrix = (float *)malloc(rows * cols * sizeof(float));
if (matrix == NULL) {
perror("Memory allocation failed for matrix");
exit(EXIT_FAILURE);
}
for (int i = 0; i < rows * cols; i++) {
// Generate random float between 0.0 and 1.0
matrix[i] = (float)rand() / (float)RAND_MAX;
}
return matrix;
}
/**
* @brief Creates the transpose of a matrix (B_T from B).
* @param B The original K x M matrix.
* @param K, M The dimensions of B.
* @return A pointer to the allocated M x K transposed float array (B_T).
*/
float *transpose_matrix(const float *B, int K, int M) {
float *B_T = (float *)malloc(M * K * sizeof(float));
if (B_T == NULL) {
perror("Memory allocation failed for transposed matrix");
exit(EXIT_FAILURE);
}
for (int k = 0; k < K; k++) {
for (int m = 0; m < M; m++) {
// B[k][m] goes to B_T[m][k]
B_T[m * K + k] = B[k * M + m];
}
}
return B_T;
}
// --- Matrix Multiplication Versions ---
// Define the function pointer type for matrix multiplication.
// Use '__restrict' keyword for all pointers to assist the M1's compiler (clang).
typedef void (*mm_func)(const float *__restrict, const float *__restrict, float *__restrict, int, int, int, int);
/**
* @brief Version 1: Naive (ijk order) matrix multiplication.
*/
void multiply_naive_ijk(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int tile_size) {
(void)tile_size; // Suppress unused parameter warning
for (int i = 0; i < N; i++) {
for (int j = 0; j < M; j++) {
float sum = 0.0f;
for (int k = 0; k < K; k++) {
// A[i][k] * B[k][j]. B access is column-major (cache-unfriendly)
sum += A[i * K + k] * B[k * M + j];
}
C[i * M + j] = sum;
}
}
}
/**
* @brief Version 2: Reordered (ikj order - cache friendly for A and C).
*/
void multiply_reordered_ikj(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int tile_size) {
(void)tile_size; // Suppress unused parameter warning
for (int i = 0; i < N; i++) {
int c_row_offset = i * M;
for (int k = 0; k < K; k++) {
// Load A[i][k] once per inner loop
float a_val = A[i * K + k];
int b_row_offset = k * M;
// Inner loop iterates over B row and C row (cache-friendly strides)
for (int j = 0; j < M; j++) {
C[c_row_offset + j] += a_val * B[b_row_offset + j];
}
}
}
}
/**
* @brief Version 3: Tiled/Blocked matrix multiplication (ikj order inside blocks).
* Exploits L1/L2 cache locality with block size.
* @param TILE_SIZE The size of the square tile to use.
*/
void multiply_tiled(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int TILE_SIZE) {
for (int i_tile = 0; i_tile < N; i_tile += TILE_SIZE) {
for (int j_tile = 0; j_tile < M; j_tile += TILE_SIZE) {
for (int k_tile = 0; k_tile < K; k_tile += TILE_SIZE) {
int i_end = min(i_tile + TILE_SIZE, N);
int j_end = min(j_tile + TILE_SIZE, M);
int k_end = min(k_tile + TILE_SIZE, K);
for (int i = i_tile; i < i_end; i++) {
int c_row_offset = i * M;
for (int k = k_tile; k < k_end; k++) {
float a_val = A[i * K + k];
int b_row_offset = k * M;
for (int j = j_tile; j < j_end; j++) {
C[c_row_offset + j] += a_val * B[b_row_offset + j];
}
}
}
}
}
}
}
/**
* @brief Version 4: Transposed Dot Product (A * B_T). M1 Optimized.
* This function expects B to be pre-transposed and K/M to be swapped for B_T.
* @param A Matrix N x K.
* @param B_T Matrix M x K (transposed B).
* @param C Matrix N x M.
* @param N, K, M Dimensions.
*/
void multiply_transposed_dot(const float *__restrict A, const float *__restrict B_T, float *__restrict C, int N, int K, int M, int tile_size) {
(void)tile_size; // Suppress unused parameter warning
// This turns the inner loop into a highly cache-friendly dot product of:
// Row i of A (length K) and Row j of B_T (which is Column j of B, length K)
for (int i = 0; i < N; i++) {
for (int j = 0; j < M; j++) {
float sum = 0.0f;
int a_offset = i * K;
int b_t_offset = j * K;
// Inner loop iterates over K (Dot Product) - all accesses are sequential
for (int k = 0; k < K; k++) {
// A[i][k] * B_T[j][k]
sum += A[a_offset + k] * B_T[b_t_offset + k];
}
C[i * M + j] = sum;
}
}
}
/**
* @brief Version 5: Transposed Dot Product with Loop Unrolling (Factor 4).
* Unrolls the innermost dot product loop (k-loop) by a factor of 4 to reduce
* loop overhead and potentially expose more ILP (Instruction-Level Parallelism).
* This is the version you requested to include!
*/
void multiply_transposed_unrolled(const float *__restrict A, const float *__restrict B_T, float *__restrict C, int N, int K, int M, int tile_size) {
(void)tile_size; // Suppress unused parameter warning
// K_mod is the number of iterations that can be unrolled (must be a multiple of 4)
int K_mod = K - (K % 4);
for (int i = 0; i < N; i++) {
for (int j = 0; j < M; j++) {
float sum = 0.0f;
int a_offset = i * K;
int b_t_offset = j * K;
int k = 0;
// 1. Unrolled loop (k = 0 to K_mod - 1)
for (; k < K_mod; k += 4) {
// Calculate four multiplications and accumulations in one loop iteration
sum += A[a_offset + k] * B_T[b_t_offset + k];
sum += A[a_offset + k + 1] * B_T[b_t_offset + k + 1];
sum += A[a_offset + k + 2] * B_T[b_t_offset + k + 2];
sum += A[a_offset + k + 3] * B_T[b_t_offset + k + 3];
}
// 2. Cleanup loop (k = K_mod to K - 1) for remaining elements
for (; k < K; k++) {
sum += A[a_offset + k] * B_T[b_t_offset + k];
}
C[i * M + j] = sum;
}
}
}
/**
* @brief Version 6: Apple Accelerate/BLAS Optimized Matmul (cblas_sgemm).
* This is the vendor-optimized method for peak performance on M1/M2/M3 chips.
* This function performs C = alpha * A * B + beta * C.
*/
void multiply_cblas_sgemm(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int tile_size) {
(void)tile_size; // Suppress unused parameter warning
const float alpha = 1.0f; // Scale factor for A * B
const float beta = 0.0f; // Scale factor for C (0.0 since C is calloc'd)
// CblasRowMajor: Matrices A, B, and C are stored in row-major order (C-style)
// CblasNoTrans: Neither A nor B is transposed
cblas_sgemm(CblasRowMajor,
CblasNoTrans,
CblasNoTrans,
// M, N, K refer to the dimensions in the CblasRowMajor context:
N, // M: rows of A and C
M, // N: columns of B and C
K, // K: inner dimension (columns of A / rows of B)
alpha,
A,
K, // lda (leading dimension of A): K columns
B,
M, // ldb (leading dimension of B): M columns
beta,
C,
M); // ldc (leading dimension of C): M columns
}
// --- Benchmark Runner ---
/**
* @brief Benchmarks a matrix multiplication function.
* Runs 3 warmup iterations and 5 measured iterations, returning the best time.
* @param name The name of the test.
* @param func The matrix multiplication function to test.
* @param A, B The input matrices.
* @param N, K, M The dimensions.
* @param tile_size The tile size to pass (used only by tiled version).
* @return The best time measured in seconds.
*/
double benchmark_runner(const char *name, mm_func func, const float *A, const float *B, int N, int K, int M, int tile_size) {
// Silence 'unused parameter' warning, as printing is handled in main()
(void)name;
// Warmup (3 times)
for (int i = 0; i < 3; i++) {
// calloc zeroes memory which is necessary for C += ... operations
float *C_dummy = (float *)calloc(N * M, sizeof(float));
if (C_dummy == NULL) {
perror("Warmup memory allocation failed");
exit(EXIT_FAILURE);
}
func(A, B, C_dummy, N, K, M, tile_size);
free(C_dummy);
}
// Measure (take best of 5)
double best_time = 0.0;
for (int i = 0; i < 5; i++) {
// Allocate and zero the result matrix C for each run
float *C = (float *)calloc(N * M, sizeof(float));
if (C == NULL) {
perror("Measurement memory allocation failed");
exit(EXIT_FAILURE);
}
// Use clock() for CPU time measurement
clock_t start = clock();
func(A, B, C, N, K, M, tile_size);
clock_t end = clock();
free(C);
double elapsed_sec = (double)(end - start) / CLOCKS_PER_SEC;
if (i == 0 || elapsed_sec < best_time) {
best_time = elapsed_sec;
}
}
return best_time;
}
// --- Main Program ---
int main() {
// Seed the random number generator once
srand((unsigned int)time(NULL));
printf("C17 Matrix Multiplication Benchmark (M1 Pro Optimized)\n");
printf("====================================================\n\n");
int sizes[] = {256, 512, 1024, 2048}; // Added 2048 to push the M1's cache limits
int num_sizes = sizeof(sizes) / sizeof(sizes[0]);
int TILE_SIZE = 48; // Adjusted tile size to be slightly more aggressive for L1/L2 cache
for (int s = 0; s < num_sizes; s++) {
int size = sizes[s];
// For square matrices, N=K=M=size
int N = size;
int K = size;
int M = size;
printf("======================================================================\n");
printf("Matrix size: %d x %d (Total Data: %.1f MB)\n", size, size, (double)(3.0 * size * size * sizeof(float)) / (1024.0 * 1024.0));
printf("======================================================================\n");
// Generate matrices A and B
float *A = generate_random_matrix(N, K);
float *B = generate_random_matrix(K, M);
// Allocate B_T once for transposed benchmarks
printf("\nPreprocessing (Transpose B)... ");
float *B_T = transpose_matrix(B, K, M);
printf("Done.\n");
printf("\n🔥 Warmup + Benchmark (3 warmup, 5 measured, best time):\n\n");
double reordered_time = 0.0;
double current_time = 0.0;
double transposed_time = 0.0;
double unrolled_time = 0.0;
double cblas_time = 0.0;
// 1. Naive (ijk) - Skip for large matrices
if (size <= 512) {
current_time = benchmark_runner("1. Naive (ijk)", multiply_naive_ijk, A, B, N, K, M, 0);
printf("1. Naive (ijk): %.3fs\n", current_time);
} else {
printf("1. Naive (ijk): [skipped - too slow]\n");
}
// 2. Reordered (ikj) - BASELINE
reordered_time = benchmark_runner("2. Reordered (ikj)", multiply_reordered_ikj, A, B, N, K, M, 0);
printf("2. Reordered (ikj): %.3fs\n", reordered_time);
// 3. Tiled/Blocked
current_time = benchmark_runner("3. Tiled", multiply_tiled, A, B, N, K, M, TILE_SIZE);
printf("3. Tiled (%d): %.3fs (%.2fx)\n",
TILE_SIZE, current_time, reordered_time / current_time);
// 4. Transposed Dot Product (using B_T)
transposed_time = benchmark_runner("4. Transposed Dot Product", multiply_transposed_dot, A, B_T, N, K, M, 0);
printf("4. Transposed Dot Product: %.3fs (%.2fx)\n",
transposed_time, reordered_time / transposed_time);
// 5. Transposed Dot Product with Unrolling (using B_T)
unrolled_time = benchmark_runner("5. Transposed Unrolled Dot", multiply_transposed_unrolled, A, B_T, N, K, M, 0);
printf("5. Transposed Unrolled Dot (x4): %.3fs (%.2fx)\n",
unrolled_time, reordered_time / unrolled_time);
// 6. M1 Native BLAS (cblas_sgemm)
cblas_time = benchmark_runner("6. M1 Native BLAS (cblas_sgemm)", multiply_cblas_sgemm, A, B, N, K, M, 0);
printf("6. M1 Native BLAS (cblas_sgemm): %.3fs (%.2fx) <--- NATIVE SPEED\n",
cblas_time, reordered_time / cblas_time);
// Clean up memory for the current size
free(A);
free(B);
free(B_T); // Free the transposed matrix
printf("\n");
}
printf("====================================================\n");
printf("Benchmark Complete.\n");
return 0;
}
import 'dart:math';
import 'dart:typed_data';
import 'dart:io';
// A record to hold the name and time of a benchmark result.
typedef BenchmarkResult = ({String name, double time});
/// Generates a matrix of a given size with random double values.
List<List<double>> randomMatrix(int rows, int cols) {
final random = Random();
return List.generate(
rows,
(_) => List.generate(cols, (_) => random.nextDouble()),
);
}
/// Version 1: Naive (ijk order) matrix multiplication.
List<List<double>> naiveMultiply(List<List<double>> A, List<List<double>> B) {
final n = A.length;
final k = A[0].length;
final m = B[0].length;
final C = List.generate(n, (_) => List.filled(m, 0.0));
for (int i = 0; i < n; i++) {
for (int j = 0; j < m; j++) {
double sum = 0.0;
for (int kIdx = 0; kIdx < k; kIdx++) {
sum += A[i][kIdx] * B[kIdx][j];
}
C[i][j] = sum;
}
}
return C;
}
/// Version 2: Reordered (ikj order) for better cache performance.
List<List<double>> reorderedMultiply(
List<List<double>> A,
List<List<double>> B,
) {
final n = A.length;
final k = A[0].length;
final m = B[0].length;
final C = List.generate(n, (_) => List.filled(m, 0.0));
for (int i = 0; i < n; i++) {
for (int kIdx = 0; kIdx < k; kIdx++) {
final aVal = A[i][kIdx];
for (int j = 0; j < m; j++) {
C[i][j] += aVal * B[kIdx][j];
}
}
}
return C;
}
/// Version 3: Tiled/Blocked matrix multiplication.
List<List<double>> tiledMultiply(
List<List<double>> A,
List<List<double>> B,
int tileSize,
) {
final n = A.length;
final k = A[0].length;
final m = B[0].length;
final C = List.generate(n, (_) => List.filled(m, 0.0));
for (int iTile = 0; iTile < n; iTile += tileSize) {
for (int jTile = 0; jTile < m; jTile += tileSize) {
for (int kTile = 0; kTile < k; kTile += tileSize) {
final iEnd = min(iTile + tileSize, n);
final jEnd = min(jTile + tileSize, m);
final kEnd = min(kTile + tileSize, k);
for (int i = iTile; i < iEnd; i++) {
for (int kIdx = kTile; kIdx < kEnd; kIdx++) {
final aVal = A[i][kIdx];
for (int j = jTile; j < jEnd; j++) {
C[i][j] += aVal * B[kIdx][j];
}
}
}
}
}
}
return C;
}
/// Version 4: Manual loop unrolling by a factor of 4.
List<List<double>> unrolled4Multiply(
List<List<double>> A,
List<List<double>> B,
) {
final n = A.length;
final k = A[0].length;
final m = B[0].length;
final C = List.generate(n, (_) => List.filled(m, 0.0));
for (int i = 0; i < n; i++) {
for (int kIdx = 0; kIdx < k; kIdx++) {
final aVal = A[i][kIdx];
int j = 0;
// Process 4 elements at a time
for (; j <= m - 4; j += 4) {
C[i][j] += aVal * B[kIdx][j];
C[i][j + 1] += aVal * B[kIdx][j + 1];
C[i][j + 2] += aVal * B[kIdx][j + 2];
C[i][j + 3] += aVal * B[kIdx][j + 3];
}
// Handle the remainder
for (; j < m; j++) {
C[i][j] += aVal * B[kIdx][j];
}
}
}
return C;
}
/// Version 5: Uses flat Float32List to avoid 2D list overhead.
Float32List flatArrayMultiply(
Float32List A,
Float32List B,
int n,
int k,
int m,
) {
final C = Float32List(n * m); // Automatically initialized to 0
for (int i = 0; i < n; i++) {
for (int kIdx = 0; kIdx < k; kIdx++) {
final aVal = A[i * k + kIdx];
final bRowOffset = kIdx * m;
final cRowOffset = i * m;
for (int j = 0; j < m; j++) {
C[cRowOffset + j] += aVal * B[bRowOffset + j];
}
}
}
return C;
}
/// Helper to convert a 2D list to a flat Float32List.
Float32List toFlatList(List<List<double>> matrix) {
final rows = matrix.length;
if (rows == 0) return Float32List(0);
final cols = matrix[0].length;
final flat = Float32List(rows * cols);
for (int i = 0; i < rows; i++) {
for (int j = 0; j < cols; j++) {
flat[i * cols + j] = matrix[i][j];
}
}
return flat;
}
/// Runs a function multiple times to allow the JIT compiler to warm up.
void warmupFunction(String name, void Function() fn) {
print(' Warming up: $name...');
const iterations = 10;
for (int i = 0; i < iterations; i++) {
fn();
}
}
/// Benchmarks a function by running it multiple times and taking the minimum time.
BenchmarkResult benchmark(String name, Function fn) {
const iterations = 5;
final times = <double>[];
final stopwatch = Stopwatch();
for (int i = 0; i < iterations; i++) {
stopwatch.start();
fn();
stopwatch.stop();
// Convert microseconds to seconds
times.add(stopwatch.elapsedMicroseconds / 1000000.0);
stopwatch.reset();
}
// Take the minimum time (best run = least interference)
final time = times.reduce(min);
return (name: name, time: time);
}
/// Main test runner with proper warmup.
void runTests() {
print('Dart Matrix Multiplication - PROPER WARMUP VERSION\n');
print('Runtime: Dart ${Platform.version.split(' ').first}');
print('Platform: ${Platform.operatingSystem}\n');
final sizes = [256, 512, 1024];
for (final size in sizes) {
print('='.padRight(70, '='));
final mem = (3 * size * size * 4) / (1024 * 1024);
print('Matrix size: ${size} x ${size} (${mem.toStringAsFixed(1)} MB)');
print('='.padRight(70, '='));
final A = randomMatrix(size, size);
final B = randomMatrix(size, size);
final A_flat = toFlatList(A);
final B_flat = toFlatList(B);
print('\n🔥 WARMUP PHASE (10 iterations each):');
// Warmup all functions
if (size <= 512) {
warmupFunction('naiveMultiply', () => naiveMultiply(A, B));
}
warmupFunction('reorderedMultiply', () => reorderedMultiply(A, B));
warmupFunction('tiledMultiply', () => tiledMultiply(A, B, 64));
warmupFunction('unrolled4Multiply', () => unrolled4Multiply(A, B));
warmupFunction(
'flatArrayMultiply',
() => flatArrayMultiply(A_flat, B_flat, size, size, size),
);
print('\n⚡ BENCHMARKING (5 runs, taking best):');
final results = <BenchmarkResult>[];
// Test 1: Naive
if (size <= 512) {
final r1 = benchmark('Naive (ijk)', () => naiveMultiply(A, B));
results.add(r1);
print('1. Naive (ijk): ${r1.time.toStringAsFixed(3)}s');
} else {
print('1. Naive (ijk): [skipped - too slow]');
}
// Test 2: Reordered (Baseline)
final r2 = benchmark('Reordered (ikj)', () => reorderedMultiply(A, B));
results.add(r2);
print('2. Reordered (ikj): ${r2.time.toStringAsFixed(3)}s');
final baseTime = r2.time;
// Test 3: Tiled
const tileSize = 64;
final r3 = benchmark(
'Tiled (${tileSize}x$tileSize)',
() => tiledMultiply(A, B, tileSize),
);
results.add(r3);
print(
'3. Tiled ($tileSize): ${r3.time.toStringAsFixed(3)}s (${(baseTime / r3.time).toStringAsFixed(2)}x)',
);
// Test 4: Unrolled
final r4 = benchmark('Unrolled 4x', () => unrolled4Multiply(A, B));
results.add(r4);
print(
'4. Unrolled 4x: ${r4.time.toStringAsFixed(3)}s (${(baseTime / r4.time).toStringAsFixed(2)}x)',
);
// Test 5: Flat arrays (TypedData)
final r5 = benchmark(
'Flat Float32List',
() => flatArrayMultiply(A_flat, B_flat, size, size, size),
);
results.add(r5);
print(
'5. Flat Float32List: ${r5.time.toStringAsFixed(3)}s (${(baseTime / r5.time).toStringAsFixed(2)}x)',
);
// Find the best performing version
final best = results.reduce((a, b) => a.time < b.time ? a : b);
final baseline = results.firstWhere((r) => r.name == 'Reordered (ikj)');
print('\n✨ Winner: ${best.name} (${best.time.toStringAsFixed(3)}s)');
print(
'📊 Speedup vs Reordered: ${(baseline.time / best.time).toStringAsFixed(2)}x\n',
);
}
}
void main() {
runTests();
}
package main
import (
"fmt"
"math/rand"
"strings"
"time"
)
// Generate random matrix
func randomMatrix(rows, cols int) [][]float32 {
matrix := make([][]float32, rows)
for i := range matrix {
matrix[i] = make([]float32, cols)
for j := range matrix[i] {
matrix[i][j] = rand.Float32()
}
}
return matrix
}
// Version 1: Naive (ijk order)
func naiveMultiply(A, B [][]float32) [][]float32 {
n := len(A)
k := len(A[0])
m := len(B[0])
C := make([][]float32, n)
for i := range C {
C[i] = make([]float32, m)
}
for i := 0; i < n; i++ {
for j := 0; j < m; j++ {
var sum float32
for kIdx := 0; kIdx < k; kIdx++ {
sum += A[i][kIdx] * B[kIdx][j]
}
C[i][j] = sum
}
}
return C
}
// Version 2: Reordered (ikj order - cache friendly)
func reorderedMultiply(A, B [][]float32) [][]float32 {
n := len(A)
k := len(A[0])
m := len(B[0])
C := make([][]float32, n)
for i := range C {
C[i] = make([]float32, m)
}
for i := 0; i < n; i++ {
for kIdx := 0; kIdx < k; kIdx++ {
aVal := A[i][kIdx]
for j := 0; j < m; j++ {
C[i][j] += aVal * B[kIdx][j]
}
}
}
return C
}
// Version 3: Tiled/Blocked
func tiledMultiply(A, B [][]float32, tileSize int) [][]float32 {
n := len(A)
k := len(A[0])
m := len(B[0])
C := make([][]float32, n)
for i := range C {
C[i] = make([]float32, m)
}
for iTile := 0; iTile < n; iTile += tileSize {
for jTile := 0; jTile < m; jTile += tileSize {
for kTile := 0; kTile < k; kTile += tileSize {
iEnd := min(iTile+tileSize, n)
jEnd := min(jTile+tileSize, m)
kEnd := min(kTile+tileSize, k)
for i := iTile; i < iEnd; i++ {
for kIdx := kTile; kIdx < kEnd; kIdx++ {
aVal := A[i][kIdx]
for j := jTile; j < jEnd; j++ {
C[i][j] += aVal * B[kIdx][j]
}
}
}
}
}
}
return C
}
// Version 4: Manual unroll 4x
func unrolled4Multiply(A, B [][]float32) [][]float32 {
n := len(A)
k := len(A[0])
m := len(B[0])
C := make([][]float32, n)
for i := range C {
C[i] = make([]float32, m)
}
for i := 0; i < n; i++ {
for kIdx := 0; kIdx < k; kIdx++ {
aVal := A[i][kIdx]
j := 0
// Process 4 at a time
for ; j <= m-4; j += 4 {
C[i][j] += aVal * B[kIdx][j]
C[i][j+1] += aVal * B[kIdx][j+1]
C[i][j+2] += aVal * B[kIdx][j+2]
C[i][j+3] += aVal * B[kIdx][j+3]
}
// Handle remainder
for ; j < m; j++ {
C[i][j] += aVal * B[kIdx][j]
}
}
}
return C
}
// Version 5: Flat slice (single allocation)
func flatSliceMultiply(A, B []float32, n, k, m int) []float32 {
C := make([]float32, n*m)
for i := 0; i < n; i++ {
for kIdx := 0; kIdx < k; kIdx++ {
aVal := A[i*k+kIdx]
bRowOffset := kIdx * m
cRowOffset := i * m
for j := 0; j < m; j++ {
C[cRowOffset+j] += aVal * B[bRowOffset+j]
}
}
}
return C
}
// Version 6: Flat slice with unrolling
func flatSliceUnrolled(A, B []float32, n, k, m int) []float32 {
C := make([]float32, n*m)
for i := 0; i < n; i++ {
for kIdx := 0; kIdx < k; kIdx++ {
aVal := A[i*k+kIdx]
bRowOffset := kIdx * m
cRowOffset := i * m
j := 0
// Unroll by 8
for ; j <= m-8; j += 8 {
C[cRowOffset+j] += aVal * B[bRowOffset+j]
C[cRowOffset+j+1] += aVal * B[bRowOffset+j+1]
C[cRowOffset+j+2] += aVal * B[bRowOffset+j+2]
C[cRowOffset+j+3] += aVal * B[bRowOffset+j+3]
C[cRowOffset+j+4] += aVal * B[bRowOffset+j+4]
C[cRowOffset+j+5] += aVal * B[bRowOffset+j+5]
C[cRowOffset+j+6] += aVal * B[bRowOffset+j+6]
C[cRowOffset+j+7] += aVal * B[bRowOffset+j+7]
}
// Handle remainder
for ; j < m; j++ {
C[cRowOffset+j] += aVal * B[bRowOffset+j]
}
}
}
return C
}
// Helper to convert 2D to flat
func toFlat(matrix [][]float32) []float32 {
rows := len(matrix)
cols := len(matrix[0])
flat := make([]float32, rows*cols)
for i := 0; i < rows; i++ {
for j := 0; j < cols; j++ {
flat[i*cols+j] = matrix[i][j]
}
}
return flat
}
// Benchmark helper that returns time
func benchmark(name string, fn func()) time.Duration {
// Warmup
for i := 0; i < 3; i++ {
fn()
}
// Measure (take best of 5)
var bestTime time.Duration
for i := 0; i < 5; i++ {
start := time.Now()
fn()
elapsed := time.Since(start)
if bestTime == 0 || elapsed < bestTime {
bestTime = elapsed
}
}
return bestTime
}
func main() {
fmt.Println("Go 1.25 Matrix Multiplication Benchmark")
fmt.Println("========================================\n")
sizes := []int{256, 512, 1024}
for _, size := range sizes {
fmt.Println(strings.Repeat("=", 70))
fmt.Printf("Matrix size: %d x %d (%.1f MB)\n", size, size, float64(3*size*size*4)/(1024*1024))
fmt.Println(strings.Repeat("=", 70))
A := randomMatrix(size, size)
B := randomMatrix(size, size)
AFlat := toFlat(A)
BFlat := toFlat(B)
fmt.Println("\n🔥 Warmup + Benchmark (3 warmup, 5 measured, best time):")
fmt.Println()
// Test naive (skip for large)
if size <= 512 {
t := benchmark("1. Naive (ijk)", func() {
_ = naiveMultiply(A, B)
})
fmt.Printf("1. Naive (ijk): %.3fs\n", t.Seconds())
} else {
fmt.Println("1. Naive (ijk): [skipped - too slow]")
}
// Test reordered (baseline)
reorderedTime := benchmark("2. Reordered (ikj)", func() {
_ = reorderedMultiply(A, B)
})
fmt.Printf("2. Reordered (ikj): %.3fs\n", reorderedTime.Seconds())
// Test tiled
tileSize := 64
tiledTime := benchmark(fmt.Sprintf("3. Tiled (%d)", tileSize), func() {
_ = tiledMultiply(A, B, tileSize)
})
fmt.Printf("3. Tiled (%d): %.3fs (%.2fx)\n",
tileSize, tiledTime.Seconds(), reorderedTime.Seconds()/tiledTime.Seconds())
// Test unrolled 4x
unrolled4Time := benchmark("4. Unrolled 4x (2D slices)", func() {
_ = unrolled4Multiply(A, B)
})
fmt.Printf("4. Unrolled 4x (2D slices): %.3fs (%.2fx)\n",
unrolled4Time.Seconds(), reorderedTime.Seconds()/unrolled4Time.Seconds())
// Test flat slice
flatTime := benchmark("5. Flat slice", func() {
_ = flatSliceMultiply(AFlat, BFlat, size, size, size)
})
fmt.Printf("5. Flat slice: %.3fs (%.2fx)\n",
flatTime.Seconds(), reorderedTime.Seconds()/flatTime.Seconds())
// Test flat slice unrolled
flatUnrolledTime := benchmark("6. Flat slice + unroll 8x", func() {
_ = flatSliceUnrolled(AFlat, BFlat, size, size, size)
})
fmt.Printf("6. Flat slice + unroll 8x: %.3fs (%.2fx)\n",
flatUnrolledTime.Seconds(), reorderedTime.Seconds()/flatUnrolledTime.Seconds())
// Find winner
times := map[string]time.Duration{
"Reordered": reorderedTime,
"Tiled": tiledTime,
"Unrolled 4x": unrolled4Time,
"Flat slice": flatTime,
"Flat + unroll 8x": flatUnrolledTime,
}
var bestName string
var bestTime time.Duration
for name, t := range times {
if bestTime == 0 || t < bestTime {
bestTime = t
bestName = name
}
}
fmt.Printf("\n✨ Winner: %s (%.3fs)\n", bestName, bestTime.Seconds())
fmt.Printf("📊 Speedup vs Reordered: %.2fx\n\n", reorderedTime.Seconds()/bestTime.Seconds())
}
}
public class Matmul {
public static void main(String[] args) {
System.out.println("Matrix Multiplication - Optimized for M1 (128-byte cache lines)\n");
// Cache line holds 128 bytes / 4 bytes per float = 32 floats
final int FLOATS_PER_CACHE_LINE = 32;
System.out.println("M1 Architecture:");
System.out.println(" Cache line size: 128 bytes");
System.out.println(" Floats per cache line: " + FLOATS_PER_CACHE_LINE);
System.out.println();
int[] sizes = { 512, 1024, 2048 };
for (int size : sizes) {
System.out.println("=".repeat(70));
System.out.printf("Matrix size: %d x %d (%.1f MB total)\n", size, size, (3.0 * size * size * 4) / (1024 * 1024));
System.out.println("=".repeat(70));
float[][] A = randomMatrix(size, size);
float[][] B = randomMatrix(size, size);
// 1. Naive (skip for large matrices)
if (size <= 512) {
long start = System.nanoTime();
float[][] C1 = naiveMultiply(A, B);
double naiveTime = (System.nanoTime() - start) / 1e9;
System.out.printf("1. Naive: %.3f seconds (baseline)\n", naiveTime);
} else {
System.out.println("1. Naive: [skipped - too slow!]");
}
// 2. Reordered
long start = System.nanoTime();
float[][] C2 = reorderedMultiply(A, B);
double reorderedTime = (System.nanoTime() - start) / 1e9;
System.out.printf("2. Reordered: %.3f seconds\n", reorderedTime);
// 3. Cache-line aligned (process 32 floats at a time)
start = System.nanoTime();
float[][] C3 = cacheLineAligned(A, B);
double alignedTime = (System.nanoTime() - start) / 1e9;
System.out.printf("3. Cache-line aligned: %.3f seconds (%.2fx vs reordered)\n", alignedTime, reorderedTime / alignedTime);
// 4. Try different tile sizes (M1 has huge caches!)
System.out.println("4. Tiled versions:");
int[] tileSizes = { 64, 128, 256, 512 };
double bestTime = Double.MAX_VALUE;
int bestTile = 0;
for (int tileSize : tileSizes) {
start = System.nanoTime();
float[][] C4 = tiledMultiply(A, B, tileSize);
double tiledTime = (System.nanoTime() - start) / 1e9;
System.out.printf(" Tile size %4d: %.3f seconds (%.2fx vs reordered)\n", tileSize, tiledTime, reorderedTime / tiledTime);
if (tiledTime < bestTime) {
bestTime = tiledTime;
bestTile = tileSize;
}
}
System.out.printf("\n✨ Best configuration: Tile size %d (%.2fx faster than reordered)\n\n", bestTile, reorderedTime / bestTime);
}
}
// NAIVE
static float[][] naiveMultiply(float[][] A, float[][] B) {
int n = A.length,
k = A[0].length,
m = B[0].length;
float[][] C = new float[n][m];
for (int row = 0; row < n; row++) {
for (int col = 0; col < m; col++) {
for (int i = 0; i < k; i++) {
C[row][col] += A[row][i] * B[i][col];
}
}
}
return C;
}
// REORDERED
static float[][] reorderedMultiply(float[][] A, float[][] B) {
int n = A.length,
k = A[0].length,
m = B[0].length;
float[][] C = new float[n][m];
for (int row = 0; row < n; row++) {
for (int i = 0; i < k; i++) {
float aValue = A[row][i];
for (int col = 0; col < m; col++) {
C[row][col] += aValue * B[i][col];
}
}
}
return C;
}
// CACHE-LINE ALIGNED: Process 32 floats at once (128 bytes)
static float[][] cacheLineAligned(float[][] A, float[][] B) {
int n = A.length,
k = A[0].length,
m = B[0].length;
float[][] C = new float[n][m];
final int BLOCK = 32; // 32 floats = 128 bytes = 1 cache line on M1
for (int row = 0; row < n; row++) {
for (int i = 0; i < k; i++) {
float aValue = A[row][i];
// Process 32 columns at a time (1 cache line)
int col = 0;
for (; col <= m - BLOCK; col += BLOCK) {
// These 32 accesses hit the same cache line!
for (int j = 0; j < BLOCK; j++) {
C[row][col + j] += aValue * B[i][col + j];
}
}
// Handle remaining columns
for (; col < m; col++) {
C[row][col] += aValue * B[i][col];
}
}
}
return C;
}
// TILED
static float[][] tiledMultiply(float[][] A, float[][] B, int tileSize) {
int n = A.length,
k = A[0].length,
m = B[0].length;
float[][] C = new float[n][m];
for (int rowTile = 0; rowTile < n; rowTile += tileSize) {
for (int colTile = 0; colTile < m; colTile += tileSize) {
for (int innerTile = 0; innerTile < k; innerTile += tileSize) {
int rowEnd = Math.min(rowTile + tileSize, n);
int colEnd = Math.min(colTile + tileSize, m);
int innerEnd = Math.min(innerTile + tileSize, k);
for (int row = rowTile; row < rowEnd; row++) {
for (int i = innerTile; i < innerEnd; i++) {
float aValue = A[row][i];
for (int col = colTile; col < colEnd; col++) {
C[row][col] += aValue * B[i][col];
}
}
}
}
}
}
return C;
}
static float[][] randomMatrix(int rows, int cols) {
float[][] matrix = new float[rows][cols];
for (int i = 0; i < rows; i++) {
for (int j = 0; j < cols; j++) {
matrix[i][j] = (float) Math.random();
}
}
return matrix;
}
}
// Matrix Multiplication with PROPER WARMUP
function randomMatrix(rows, cols) {
const matrix = [];
for (let i = 0; i < rows; i++) {
matrix[i] = [];
for (let j = 0; j < cols; j++) {
matrix[i][j] = Math.random();
}
}
return matrix;
}
// Version 1: Naive (ijk order)
function naiveMultiply(A, B) {
const n = A.length;
const k = A[0].length;
const m = B[0].length;
const C = Array(n)
.fill(0)
.map(() => Array(m).fill(0));
for (let i = 0; i < n; i++) {
for (let j = 0; j < m; j++) {
let sum = 0;
for (let k_idx = 0; k_idx < k; k_idx++) {
sum += A[i][k_idx] * B[k_idx][j];
}
C[i][j] = sum;
}
}
return C;
}
// Version 2: Reordered (ikj order - cache friendly)
function reorderedMultiply(A, B) {
const n = A.length;
const k = A[0].length;
const m = B[0].length;
const C = Array(n)
.fill(0)
.map(() => Array(m).fill(0));
for (let i = 0; i < n; i++) {
for (let k_idx = 0; k_idx < k; k_idx++) {
const aVal = A[i][k_idx];
for (let j = 0; j < m; j++) {
C[i][j] += aVal * B[k_idx][j];
}
}
}
return C;
}
// Version 3: Tiled/Blocked
function tiledMultiply(A, B, tileSize) {
const n = A.length;
const k = A[0].length;
const m = B[0].length;
const C = Array(n)
.fill(0)
.map(() => Array(m).fill(0));
for (let iTile = 0; iTile < n; iTile += tileSize) {
for (let jTile = 0; jTile < m; jTile += tileSize) {
for (let kTile = 0; kTile < k; kTile += tileSize) {
const iEnd = Math.min(iTile + tileSize, n);
const jEnd = Math.min(jTile + tileSize, m);
const kEnd = Math.min(kTile + tileSize, k);
for (let i = iTile; i < iEnd; i++) {
for (let k_idx = kTile; k_idx < kEnd; k_idx++) {
const aVal = A[i][k_idx];
for (let j = jTile; j < jEnd; j++) {
C[i][j] += aVal * B[k_idx][j];
}
}
}
}
}
}
return C;
}
// Version 4: Manual unroll 4x
function unrolled4Multiply(A, B) {
const n = A.length;
const k = A[0].length;
const m = B[0].length;
const C = Array(n)
.fill(0)
.map(() => Array(m).fill(0));
for (let i = 0; i < n; i++) {
for (let k_idx = 0; k_idx < k; k_idx++) {
const aVal = A[i][k_idx];
let j = 0;
// Process 4 at a time
for (; j <= m - 4; j += 4) {
C[i][j] += aVal * B[k_idx][j];
C[i][j + 1] += aVal * B[k_idx][j + 1];
C[i][j + 2] += aVal * B[k_idx][j + 2];
C[i][j + 3] += aVal * B[k_idx][j + 3];
}
// Handle remainder
for (; j < m; j++) {
C[i][j] += aVal * B[k_idx][j];
}
}
}
return C;
}
// Version 5: Flat arrays (avoid 2D array overhead)
function flatArrayMultiply(A, B, n, k, m) {
const C = new Float32Array(n * m);
for (let i = 0; i < n; i++) {
for (let k_idx = 0; k_idx < k; k_idx++) {
const aVal = A[i * k + k_idx];
const bRowOffset = k_idx * m;
const cRowOffset = i * m;
for (let j = 0; j < m; j++) {
C[cRowOffset + j] += aVal * B[bRowOffset + j];
}
}
}
return C;
}
// Helper to convert 2D to flat
function toFlatArray(matrix) {
const rows = matrix.length;
const cols = matrix[0].length;
const flat = new Float32Array(rows * cols);
for (let i = 0; i < rows; i++) {
for (let j = 0; j < cols; j++) {
flat[i * cols + j] = matrix[i][j];
}
}
return flat;
}
// PROPER warmup function
function warmupFunction(fn, ...args) {
console.log(` Warming up: ${fn.name}...`);
const iterations = 10;
for (let i = 0; i < iterations; i++) {
fn(...args);
}
}
// Benchmark function with minimal overhead
function benchmark(name, fn, ...args) {
const iterations = 5;
const times = [];
for (let i = 0; i < iterations; i++) {
const start = performance.now();
const result = fn(...args);
const end = performance.now();
times.push((end - start) / 1000);
}
// Take the minimum time (best run = least interference)
const time = Math.min(...times);
return { name, time };
}
// Main test with proper warmup
function runTests() {
console.log('JavaScript Matrix Multiplication - PROPER WARMUP VERSION\n');
console.log(`Runtime: ${typeof process !== 'undefined' ? 'Node.js ' + process.version : 'Browser'}`);
console.log(`Platform: ${typeof process !== 'undefined' ? process.platform + ' ' + process.arch : 'Browser'}\n`);
const sizes = [256, 512, 1024];
for (const size of sizes) {
console.log('='.repeat(70));
console.log(`Matrix size: ${size} x ${size} (${((3 * size * size * 4) / (1024 * 1024)).toFixed(1)} MB)`);
console.log('='.repeat(70));
const A = randomMatrix(size, size);
const B = randomMatrix(size, size);
const A_flat = toFlatArray(A);
const B_flat = toFlatArray(B);
console.log('\n🔥 WARMUP PHASE (10 iterations each):');
// Warmup all functions
if (size <= 512) {
warmupFunction(naiveMultiply, A, B);
}
warmupFunction(reorderedMultiply, A, B);
warmupFunction(tiledMultiply, A, B, 64);
warmupFunction(unrolled4Multiply, A, B);
warmupFunction(flatArrayMultiply, A_flat, B_flat, size, size, size);
console.log('\n⚡ BENCHMARKING (5 runs, taking best):');
const results = [];
// Test 1: Naive
if (size <= 512) {
const r1 = benchmark('Naive (ijk)', naiveMultiply, A, B);
results.push(r1);
console.log(`1. Naive (ijk): ${r1.time.toFixed(3)}s`);
} else {
console.log(`1. Naive (ijk): [skipped - too slow]`);
}
// Test 2: Reordered
const r2 = benchmark('Reordered (ikj)', reorderedMultiply, A, B);
results.push(r2);
console.log(`2. Reordered (ikj): ${r2.time.toFixed(3)}s`);
const baseTime = r2.time;
// Test 3: Tiled
const tileSize = 64;
const r3 = benchmark(`Tiled (${tileSize}x${tileSize})`, tiledMultiply, A, B, tileSize);
results.push(r3);
console.log(`3. Tiled (${tileSize}): ${r3.time.toFixed(3)}s (${(baseTime / r3.time).toFixed(2)}x)`);
// Test 4: Unrolled
const r4 = benchmark('Unrolled 4x', unrolled4Multiply, A, B);
results.push(r4);
console.log(`4. Unrolled 4x: ${r4.time.toFixed(3)}s (${(baseTime / r4.time).toFixed(2)}x)`);
// Test 5: Flat arrays (TypedArray)
const r5 = benchmark('Flat Float32Array', flatArrayMultiply, A_flat, B_flat, size, size, size);
results.push(r5);
console.log(`5. Flat Float32Array: ${r5.time.toFixed(3)}s (${(baseTime / r5.time).toFixed(2)}x)`);
// Find best
const best = results.reduce((a, b) => (a.time < b.time ? a : b));
const baseline = results.find(r => r.name === 'Reordered (ikj)');
console.log(`\n✨ Winner: ${best.name} (${best.time.toFixed(3)}s)`);
console.log(`📊 Speedup vs Reordered: ${(baseline.time / best.time).toFixed(2)}x\n`);
}
}
// Run the tests
runTests();
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <pthread.h>
#include <stdbool.h>
#include <string.h>
#include <stdatomic.h> // Minimal synchronization: Atomic counter for job completion
#include <sched.h> // For sched_yield()
#include "cblas.h"
// Check for NEON SIMD
#ifdef __ARM_NEON
#include <arm_neon.h>
#define USE_NEON_SIMD 1
#else
#define USE_NEON_SIMD 0
#endif
// --- Constants and Global Settings ---
#define TILE_SIZE 64
#define STRASSEN_CUTOFF 64
#define MAX_THREADS 8
#define VECTOR_LANE_COUNT 4 // Floats per NEON register
// --- Type Definitions ---
// Function signature for the execution block of work (used by workers)
typedef void (*mm_block_func)(const float *A, const float *B, float *C, int N, int K, int M, int row_start, int row_end, int tile_size);
// Function signature for the top-level benchmark runner (used by main)
typedef void (*mm_func)(const float *__restrict, const float *__restrict, float *__restrict, int, int, int, int, const float *__restrict);
/**
* @brief Arguments passed to each persistent worker thread.
*/
typedef struct MatmulTaskArgs {
int thread_id;
const float *A;
const float *B;
float *C;
int N, K, M;
int row_start;
int row_end;
int tile_size;
mm_block_func base_func;
} MatmulTaskArgs;
// --- Global Thread Pool (Minimal Synchronization using Atomicity) ---
pthread_t g_workers[MAX_THREADS];
MatmulTaskArgs g_args[MAX_THREADS];
// Shared state for all workers to check for a new job or shutdown.
atomic_bool g_running; // Set to false for shutdown
atomic_int g_workers_ready; // Initial count: workers are ready to enter the job loop
atomic_int g_workers_completed; // Counts workers finished the current job
atomic_int g_job_counter; // THE FIX: Incremented by main thread to signal a NEW job
int g_num_workers = 0; // Actual number of threads created
// --- Utility Functions (Declarations) ---
int min(int a, int b);
float *generate_random_matrix(int rows, int cols);
float *transpose_matrix(const float *B, int K, int M);
double benchmark_runner(const char *name, mm_func mm_function, const float *A, const float *B, const float *B_T, int N, int K, int M, int tile_size);
void multiply_naive_ijk_block(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int row_start, int row_end, int tile_size);
void multiply_reordered_ikj_block(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int row_start, int row_end, int tile_size);
void multiply_tiled_block(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int row_start, int row_end, int TILE_SIZE_ARG);
void multiply_optimized_ikj_block(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int row_start, int row_end, int tile_size);
void multiply_simd_transposed_block(const float *__restrict A, const float *__restrict B_T, float *__restrict C, int N, int K, int M, int row_start, int row_end, int tile_size);
// Sequential wrappers (Declarations)
void multiply_v2_sequential_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T);
void multiply_v3_sequential_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T);
void multiply_v11_sequential_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T);
void multiply_cblas_sgemm_wrapper(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int tile_size, const float *B_T);
void multiply_strassen_wrapper(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int tile_size, const float *B_T);
// --- Thread Pool Worker Function ---
/**
* @brief The persistent worker thread loop. Waits for g_job_counter to increment.
*/
void* thread_pool_worker(void *arg) {
MatmulTaskArgs *w_args = (MatmulTaskArgs *)arg;
int local_job_id = 0; // Tracks the last job ID this worker processed
// Initial signal: Tell the main thread this worker is ready to wait.
atomic_fetch_add(&g_workers_ready, 1);
while (atomic_load(&g_running)) {
// --- 1. WAIT FOR NEW JOB SIGNAL ---
// Wait until the global job counter is higher than the last job we processed.
while (local_job_id == atomic_load(&g_job_counter) && atomic_load(&g_running)) {
sched_yield(); // Yield CPU time
}
if (!atomic_load(&g_running)) break;
// Get the new job ID BEFORE executing the task
local_job_id = atomic_load(&g_job_counter);
// --- 2. EXECUTE TASK (Arguments are guaranteed to be updated by main thread) ---
if (w_args->base_func) {
w_args->base_func(w_args->A, w_args->B, w_args->C, w_args->N, w_args->K, w_args->M,
w_args->row_start, w_args->row_end, w_args->tile_size);
}
// --- 3. SIGNAL COMPLETION ---
atomic_fetch_add(&g_workers_completed, 1);
}
return NULL;
}
// --- Thread Pool Management Functions ---
/**
* @brief Initializes the thread pool: creates and starts all worker threads.
*/
void initialize_thread_pool(int N) {
g_num_workers = min(N, MAX_THREADS);
// Initialize atomic variables
atomic_store(&g_running, true);
atomic_init(&g_workers_ready, 0);
atomic_init(&g_workers_completed, 0);
atomic_init(&g_job_counter, 0);
for (int i = 0; i < g_num_workers; i++) {
g_args[i].thread_id = i;
g_args[i].base_func = NULL;
if (pthread_create(&g_workers[i], NULL, thread_pool_worker, &g_args[i]) != 0) {
perror("Failed to create worker thread");
g_num_workers = i;
break;
}
}
// Wait for all workers to reach their initial waiting state
while (atomic_load(&g_workers_ready) < g_num_workers) {
sched_yield();
}
}
/**
* @brief Cleans up the thread pool: signals shutdown and joins all workers.
*/
void cleanup_thread_pool() {
// Signal all workers to shut down
atomic_store(&g_running, false);
// Increment job counter one last time to wake any sleeping threads so they can read the shutdown signal
atomic_fetch_add(&g_job_counter, 1);
for (int i = 0; i < g_num_workers; i++) {
pthread_join(g_workers[i], NULL);
}
g_num_workers = 0;
}
/**
* @brief Dispatches a job to the cached thread pool and waits for completion.
*/
void dispatch_job(mm_block_func block_func, const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int tile_size) {
if (g_num_workers == 0) return;
// --- 1. SETUP JOB & WORK PARTITIONS ---
int current_row = 0;
int rows_per_thread = N / g_num_workers;
int remaining_rows = N % g_num_workers;
for (int i = 0; i < g_num_workers; i++) {
MatmulTaskArgs *args = &g_args[i];
int row_start = current_row;
int rows_to_process = rows_per_thread + (i < remaining_rows ? 1 : 0);
int row_end = row_start + rows_to_process;
// Update the arguments for the running worker threads
args->A = A;
args->B = B;
args->C = C;
args->N = N;
args->K = K;
args->M = M;
args->row_start = row_start;
args->row_end = row_end;
args->tile_size = tile_size;
args->base_func = block_func;
current_row = row_end;
}
// --- 2. SIGNAL WORK START ---
// Reset the completion counter BEFORE signaling the workers
atomic_store(&g_workers_completed, 0);
// Increment the job counter. This is the atomic signal that wakes the workers.
atomic_fetch_add(&g_job_counter, 1);
// --- 3. WAIT FOR COMPLETION ---
// Wait until the completion counter reaches the total worker count
while (atomic_load(&g_workers_completed) < g_num_workers) {
sched_yield();
}
}
// --- Wrapper Functions (Warnings suppressed) ---
void shared_parallel_wrapper(mm_block_func block_func, const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) {
(void)B_T;
dispatch_job(block_func, A, B, C, N, K, M, tile_size);
}
void multiply_v1_parallel_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) {
(void)tile_size;
shared_parallel_wrapper(multiply_naive_ijk_block, A, B, C, N, K, M, 0, B_T);
}
void multiply_v2_parallel_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) {
(void)tile_size;
shared_parallel_wrapper(multiply_reordered_ikj_block, A, B, C, N, K, M, 0, B_T);
}
void multiply_v3_parallel_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) {
shared_parallel_wrapper(multiply_tiled_block, A, B, C, N, K, M, tile_size, B_T);
}
void multiply_v7_parallel_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) {
(void)tile_size;
shared_parallel_wrapper(multiply_optimized_ikj_block, A, B, C, N, K, M, 0, B_T);
}
void multiply_v11_parallel_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) {
(void)B;
(void)tile_size;
// Pass B_T as the B argument for the transposed block function
dispatch_job(multiply_simd_transposed_block, A, B_T, C, N, K, M, 0);
}
/**
* @brief Wrapper for OpenBLAS cblas_sgemm (V9 in the overall set).
*/
void multiply_cblas_sgemm_wrapper(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int tile_size, const float *B_T) {
(void)tile_size; (void)B_T;
// Set to RowMajor order: C(N, M) = A(N, K) * B(K, M)
// Transpose operations are handled internally by cblas_sgemm,
// we assume standard row-major storage (CblasRowMajor).
cblas_sgemm(CblasRowMajor, // Layout
CblasNoTrans, // Transpose A? No.
CblasNoTrans, // Transpose B? No.
N, // Number of rows of A and C (N)
M, // Number of columns of B and C (M)
K, // Number of columns of A and rows of B (K)
1.0f, // Alpha (Scaling factor for A * B)
A, // Matrix A
K, // Leading dimension of A (K)
B, // Matrix B
M, // Leading dimension of B (M)
0.0f, // Beta (Scaling factor for C)
C, // Matrix C
M); // Leading dimension of C (M)
}
// --- Main Program (Driver) ---
int main() {
srand((unsigned int)time(NULL));
printf("C17 Matrix Multiplication Benchmark (11 Versions)\n");
printf("====================================================\n");
printf(" *** FIXED: Lock-Free Race Condition Eliminated ***\n");
printf(" *** Using an Atomic Job Counter for Reliable Synchronization ***\n");
printf("====================================================\n");
int sizes[] = {256, 512, 1024, 2048};
int num_sizes = sizeof(sizes) / sizeof(sizes[0]);
for (int s = 0; s < num_sizes; s++) {
int size = sizes[s];
int N = size;
int K = size;
int M = size;
initialize_thread_pool(N);
printf("======================================================================\n");
printf("Matrix size: %d x %d | Workers Used: %d | TILE_SIZE: %d\n",
size, size, g_num_workers, TILE_SIZE);
printf("SIMD Status: %s\n", USE_NEON_SIMD ? "NEON Active (Manual Intrinsic)" : "SIMD Disabled (C fallback)");
printf("======================================================================\n");
float *A = generate_random_matrix(N, K);
float *B = generate_random_matrix(K, M);
float *B_T = transpose_matrix(B, K, M);
printf("\n🔥 Benchmark (5 measured, best time):\n\n");
double sequential_baseline_time = 0.0;
// 2. Reordered (ikj) - Sequential (V2) - BASELINE
sequential_baseline_time = benchmark_runner("2. Reordered (ikj) Sequential", multiply_v2_sequential_wrapper, A, B, B_T, N, K, M, 0);
printf(" 2. Reordered (ikj) Sequential: %.3fs <--- BASELINE\n", sequential_baseline_time);
// --- Parallel Implementations ---
// 7. Reordered (ikj) Parallel
double v7_time = benchmark_runner("7. Reordered (ikj) Parallel: ", multiply_v2_parallel_wrapper, A, B, B_T, N, K, M, 0);
printf(" 7. Reordered (ikj) Parallel: %.3fs (%.2fx)\n", v7_time, sequential_baseline_time / v7_time);
// 8. Tiled (ijk) Parallel (V3)
double v8_time = benchmark_runner("8. Tiled (ijk) Parallel: ", multiply_v3_parallel_wrapper, A, B, B_T, N, K, M, TILE_SIZE);
printf(" 8. Tiled (ijk) Parallel: %.3fs (%.2fx)\n", v8_time, sequential_baseline_time / v8_time);
// 9. OpenBLAS cblas_sgemm (Library Standard)
double v9_time = benchmark_runner("9. OpenBLAS cblas_sgemm: ", multiply_cblas_sgemm_wrapper, A, B, B_T, N, K, M, 0);
printf(" 9. OpenBLAS cblas_sgemm: %.3fs (%.2fx) - LIBRARY\n", v9_time, sequential_baseline_time / v9_time);
// 11. SIMD + Transposed Parallel (V11)
double v11_time = benchmark_runner("11. SIMD + Transposed Parallel: ", multiply_v11_parallel_wrapper, A, B, B_T, N, K, M, 0);
printf("11. SIMD + Transposed Parallel: %.3fs (%.2fx) - NEON\n", v11_time, sequential_baseline_time / v11_time);
// Clean up memory
free(A);
free(B);
free(B_T);
cleanup_thread_pool();
printf("\n");
}
printf("====================================================\n");
printf("Benchmark Complete.\n");
return 0;
}
// --- Implementation details for Block Functions (Warnings suppressed) ---
void multiply_naive_ijk_block(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int row_start, int row_end, int tile_size) {
(void)N;
(void)tile_size;
for (int i = row_start; i < row_end; i++) {
for (int j = 0; j < M; j++) {
float sum = 0.0f;
for (int k = 0; k < K; k++) { sum += A[i * K + k] * B[k * M + j]; }
C[i * M + j] = sum;
}
}
}
void multiply_reordered_ikj_block(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int row_start, int row_end, int tile_size) {
(void)N;
(void)tile_size;
for (int i = row_start; i < row_end; i++) {
int c_row_offset = i * M;
for (int k = 0; k < K; k++) {
float a_val = A[i * K + k];
int b_row_offset = k * M;
for (int j = 0; j < M; j++) { C[c_row_offset + j] += a_val * B[b_row_offset + j]; }
}
}
}
void multiply_tiled_block(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int row_start, int row_end, int TILE_SIZE_ARG) {
(void)N;
for (int i_tile = row_start; i_tile < row_end; i_tile += TILE_SIZE_ARG) {
for (int j_tile = 0; j_tile < M; j_tile += TILE_SIZE_ARG) {
for (int k_tile = 0; k_tile < K; k_tile += TILE_SIZE_ARG) {
int i_end = min(i_tile + TILE_SIZE_ARG, row_end);
int j_end = min(j_tile + TILE_SIZE_ARG, M);
int k_end = min(k_tile + TILE_SIZE_ARG, K);
for (int i = i_tile; i < i_end; i++) {
int c_row_offset = i * M;
for (int k = k_tile; k < k_end; k++) {
float a_val = A[i * K + k];
int b_row_offset = k * M;
for (int j = j_tile; j < j_end; j++) { C[c_row_offset + j] += a_val * B[b_row_offset + j]; }
}
}
}
}
}
}
void multiply_optimized_ikj_block(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int row_start, int row_end, int tile_size) {
(void)N;
(void)tile_size;
for (int i = row_start; i < row_end; i++) {
int c_row_offset = i * M;
for (int k = 0; k < K; k++) {
float a_val = A[i * K + k];
int b_row_offset = k * M;
int j = 0;
// Unrolled loop (V7 uses this)
for (; j + 4 <= M; j += 4) {
C[c_row_offset + j + 0] += a_val * B[b_row_offset + j + 0];
C[c_row_offset + j + 1] += a_val * B[b_row_offset + j + 1];
C[c_row_offset + j + 2] += a_val * B[b_row_offset + j + 2];
C[c_row_offset + j + 3] += a_val * B[b_row_offset + j + 3];
}
for (; j < M; j++) { C[c_row_offset + j] += a_val * B[b_row_offset + j]; }
}
}
}
void multiply_simd_transposed_block(const float *__restrict A, const float *__restrict B_T, float *__restrict C, int N, int K, int M, int row_start, int row_end, int tile_size) {
(void)N;
(void)tile_size;
for (int i = row_start; i < row_end; i++) {
const float *A_row = A + i * K;
float *C_row = C + i * M;
for (int j = 0; j < M; j++) {
const float *B_T_row = B_T + j * K;
float sum = 0.0f;
int k = 0;
#if USE_NEON_SIMD
// NEON Vectorization (4 floats at a time)
float32x4_t dot_sum_v = vdupq_n_f32(0.0f);
for (; k + VECTOR_LANE_COUNT <= K; k += VECTOR_LANE_COUNT) {
// Load 4 floats from A row
float32x4_t a_v = vld1q_f32(A_row + k);
// Load 4 floats from B_T row (which is contiguous after transpose)
float32x4_t b_t_v = vld1q_f32(B_T_row + k);
// Multiply-accumulate: dot_sum_v = (a_v * b_t_v) + dot_sum_v
dot_sum_v = vmlaq_f32(dot_sum_v, a_v, b_t_v);
}
// Horizontal sum: Sum up the 4 elements in the accumulator vector
// Use vpadd, vpadd, vgetq_lane, vget_low/high.
// Simplified for modern compilers/intrinsics:
float32x2_t sum_pair = vpadd_f32(vget_low_f32(dot_sum_v), vget_high_f32(dot_sum_v));
sum_pair = vpadd_f32(sum_pair, sum_pair); // Sum the two elements
sum = vget_lane_f32(sum_pair, 0);
#endif // USE_NEON_SIMD
// Scalar Cleanup Loop (Handles remaining elements if K is not a multiple of 4)
for (; k < K; k++) {
sum += A_row[k] * B_T_row[k];
}
C_row[j] = sum;
}
}
}
// --- Implementation details for Utility Functions (Sequential wrappers and others) ---
void multiply_v1_sequential_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) {
(void)tile_size; (void)B_T; multiply_naive_ijk_block(A, B, C, N, K, M, 0, N, 0);
}
void multiply_v2_sequential_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) {
(void)tile_size; (void)B_T; multiply_reordered_ikj_block(A, B, C, N, K, M, 0, N, 0);
}
void multiply_v3_sequential_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) {
(void)B_T; multiply_tiled_block(A, B, C, N, K, M, 0, N, tile_size);
}
void multiply_v7_sequential_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) {
(void)tile_size; (void)B_T; multiply_optimized_ikj_block(A, B, C, N, K, M, 0, N, 0);
}
void multiply_v11_sequential_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) {
(void)tile_size; (void)B; multiply_simd_transposed_block(A, B_T, C, N, K, M, 0, N, 0);
}
void multiply_strassen_wrapper(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int tile_size, const float *B_T) {
(void)A; (void)B; (void)C; (void)N; (void)K; (void)M; (void)tile_size; (void)B_T;
}
int min(int a, int b) { return (a < b) ? a : b; }
float *generate_random_matrix(int rows, int cols) {
// Ensuring 16-byte alignment for NEON
float *matrix = NULL;
size_t total_bytes = (size_t)rows * cols * sizeof(float);
// Use posix_memalign for 16-byte alignment, required for efficient SIMD loads
if (posix_memalign((void**)&matrix, 16, total_bytes) != 0) {
matrix = (float *)malloc(total_bytes);
if (matrix != NULL) fprintf(stderr, "Warning: posix_memalign failed, using malloc (alignment may be suboptimal).\n");
}
if (matrix == NULL) { perror("Memory allocation failed"); exit(EXIT_FAILURE); }
for (size_t i = 0; i < (size_t)rows * cols; i++) { matrix[i] = (float)rand() / (float)RAND_MAX; }
return matrix;
}
float *transpose_matrix(const float *B, int K, int M) {
// Ensuring 16-byte alignment for NEON
float *B_T = NULL;
size_t total_bytes = (size_t)M * K * sizeof(float);
if (posix_memalign((void**)&B_T, 16, total_bytes) != 0) {
B_T = (float *)malloc(total_bytes);
if (B_T != NULL) fprintf(stderr, "Warning: posix_memalign failed, using malloc for transpose (alignment may be suboptimal).\n");
}
if (B_T == NULL) { perror("Memory allocation failed"); exit(EXIT_FAILURE); }
for (int i = 0; i < K; i++) {
for (int j = 0; j < M; j++) { B_T[j * K + i] = B[i * M + j]; }
}
return B_T;
}
double benchmark_runner(const char *name, mm_func mm_function, const float *A, const float *B, const float *B_T, int N, int K, int M, int tile_size) {
(void)name;
double best_time = 0.0;
float *C = NULL;
size_t total_bytes = (size_t)N * M * sizeof(float);
if (posix_memalign((void**)&C, 16, total_bytes) != 0) {
C = (float *)malloc(total_bytes);
if (C != NULL) fprintf(stderr, "Warning: posix_memalign failed, using malloc for C (alignment may be suboptimal).\n");
}
if (C == NULL) { perror("Measurement memory allocation failed"); exit(EXIT_FAILURE); }
for (int i = 0; i < 5; i++) {
// Must clear C matrix before each run
memset(C, 0, total_bytes);
clock_t start = clock();
mm_function(A, B, C, N, K, M, tile_size, B_T);
clock_t end = clock();
double elapsed_sec = (double)(end - start) / CLOCKS_PER_SEC;
if (i == 0 || elapsed_sec < best_time) {
best_time = elapsed_sec;
}
}
free(C);
return best_time;
}
public class VectorizationTest {
public static void main(String[] args) {
System.out.println("Testing Java Auto-Vectorization on M1\n");
int size = 2048;
float[][] A = randomMatrix(size, size);
float[][] B = randomMatrix(size, size);
// Warmup - let JIT do its magic
System.out.println("Warming up JIT compiler...");
for (int i = 0; i < 5; i++) {
scalarVersion(A, B);
vectorFriendly(A, B);
}
System.out.println("Warmup complete!\n");
// Test 1: Scalar-friendly (hard to vectorize)
long start = System.nanoTime();
float[][] C1 = scalarVersion(A, B);
double scalarTime = (System.nanoTime() - start) / 1e9;
System.out.printf("1. Scalar-friendly version: %.3f seconds\n", scalarTime);
// Test 2: Vector-friendly (easy to vectorize)
start = System.nanoTime();
float[][] C2 = vectorFriendly(A, B);
double vectorTime = (System.nanoTime() - start) / 1e9;
System.out.printf("2. Vector-friendly version: %.3f seconds\n", vectorTime);
System.out.printf("\nSpeedup: %.2fx\n", scalarTime / vectorTime);
if (vectorTime < scalarTime * 0.9) {
System.out.println("✅ Likely vectorized! JIT is working!");
} else {
System.out.println("❌ No significant vectorization detected");
}
// Test 3: Manual unrolling
start = System.nanoTime();
float[][] C3 = manualUnroll4(A, B);
double unrollTime = (System.nanoTime() - start) / 1e9;
System.out.printf("\n3. Manual unroll (4x): %.3f seconds (%.2fx)\n", unrollTime, scalarTime / unrollTime);
// Test 4: Aggressive unrolling
start = System.nanoTime();
float[][] C4 = manualUnroll8(A, B);
double unroll8Time = (System.nanoTime() - start) / 1e9;
System.out.printf("4. Manual unroll (8x): %.3f seconds (%.2fx)\n", unroll8Time, scalarTime / unroll8Time);
}
// Version 1: Hard to vectorize (dependencies)
static float[][] scalarVersion(float[][] A, float[][] B) {
int n = A.length;
float[][] C = new float[n][n];
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
float sum = 0;
for (int k = 0; k < n; k++) {
sum += A[i][k] * B[k][j]; // Reduces to single value
}
C[i][j] = sum;
}
}
return C;
}
// Version 2: Easy to vectorize (independent operations)
static float[][] vectorFriendly(float[][] A, float[][] B) {
int n = A.length;
float[][] C = new float[n][n];
for (int i = 0; i < n; i++) {
for (int k = 0; k < n; k++) {
float aVal = A[i][k];
// This inner loop is vectorizable!
// Each C[i][j] update is independent
for (int j = 0; j < n; j++) {
C[i][j] += aVal * B[k][j];
}
}
}
return C;
}
// Version 3: Manual unroll 4x
static float[][] manualUnroll4(float[][] A, float[][] B) {
int n = A.length;
float[][] C = new float[n][n];
for (int i = 0; i < n; i++) {
for (int k = 0; k < n; k++) {
float aVal = A[i][k];
int j = 0;
// Process 4 at a time
for (; j <= n - 4; j += 4) {
C[i][j] += aVal * B[k][j];
C[i][j + 1] += aVal * B[k][j + 1];
C[i][j + 2] += aVal * B[k][j + 2];
C[i][j + 3] += aVal * B[k][j + 3];
}
// Remainder
for (; j < n; j++) {
C[i][j] += aVal * B[k][j];
}
}
}
return C;
}
// Version 4: Manual unroll 8x
static float[][] manualUnroll8(float[][] A, float[][] B) {
int n = A.length;
float[][] C = new float[n][n];
for (int i = 0; i < n; i++) {
for (int k = 0; k < n; k++) {
float aVal = A[i][k];
int j = 0;
// Process 8 at a time
for (; j <= n - 8; j += 8) {
C[i][j] += aVal * B[k][j];
C[i][j + 1] += aVal * B[k][j + 1];
C[i][j + 2] += aVal * B[k][j + 2];
C[i][j + 3] += aVal * B[k][j + 3];
C[i][j + 4] += aVal * B[k][j + 4];
C[i][j + 5] += aVal * B[k][j + 5];
C[i][j + 6] += aVal * B[k][j + 6];
C[i][j + 7] += aVal * B[k][j + 7];
}
// Remainder
for (; j < n; j++) {
C[i][j] += aVal * B[k][j];
}
}
}
return C;
}
static float[][] randomMatrix(int rows, int cols) {
float[][] matrix = new float[rows][cols];
for (int i = 0; i < rows; i++) {
for (int j = 0; j < cols; j++) {
matrix[i][j] = (float) Math.random();
}
}
return matrix;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment