Last active
October 16, 2025 08:11
-
-
Save corporatepiyush/12cb6b5d7864ce7a4f06361db3ea1298 to your computer and use it in GitHub Desktop.
Matrix Multiplication
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // blas.dart | |
| import 'dart:ffi' as ffi; | |
| import 'dart:typed_data'; | |
| import 'package:ffi/ffi.dart'; // Need this package! | |
| // Define the cblas_sgemm signature | |
| typedef CblasSgemmNative = | |
| ffi.Void Function( | |
| ffi.Int32 Order, | |
| ffi.Int32 TransA, | |
| ffi.Int32 TransB, | |
| ffi.Int32 M, | |
| ffi.Int32 N, | |
| ffi.Int32 K, | |
| ffi.Float alpha, | |
| ffi.Pointer<ffi.Float> A, | |
| ffi.Int32 lda, | |
| ffi.Pointer<ffi.Float> B, | |
| ffi.Int32 ldb, | |
| ffi.Float beta, | |
| ffi.Pointer<ffi.Float> C, | |
| ffi.Int32 ldc, | |
| ); | |
| typedef CblasSgemmDart = | |
| void Function( | |
| int Order, | |
| int TransA, | |
| int TransB, | |
| int M, | |
| int N, | |
| int K, | |
| double alpha, | |
| ffi.Pointer<ffi.Float> A, | |
| int lda, | |
| ffi.Pointer<ffi.Float> B, | |
| int ldb, | |
| double beta, | |
| ffi.Pointer<ffi.Float> C, | |
| int ldc, | |
| ); | |
| void main() { | |
| // Load Accelerate framework | |
| final accelerate = ffi.DynamicLibrary.open( | |
| '/System/Library/Frameworks/Accelerate.framework/Accelerate', | |
| ); | |
| // Lookup cblas_sgemm | |
| final cblas_sgemm = accelerate | |
| .lookup<ffi.NativeFunction<CblasSgemmNative>>('cblas_sgemm') | |
| .asFunction<CblasSgemmDart>(); | |
| const n = 1024; | |
| const size = n * n; | |
| // Allocate memory using calloc (from package:ffi) | |
| final A = calloc<ffi.Float>(size); | |
| final B = calloc<ffi.Float>(size); | |
| final C = calloc<ffi.Float>(size); | |
| // Fill with random data | |
| for (int i = 0; i < size; i++) { | |
| A[i] = 0.5; | |
| B[i] = 0.5; | |
| } | |
| final stopwatch = Stopwatch()..start(); | |
| cblas_sgemm( | |
| 101, // CblasRowMajor | |
| 111, // CblasNoTrans | |
| 111, // CblasNoTrans | |
| n, | |
| n, | |
| n, | |
| 1.0, | |
| A, | |
| n, | |
| B, | |
| n, | |
| 0.0, | |
| C, | |
| n, | |
| ); | |
| stopwatch.stop(); | |
| print( | |
| 'Dart + Accelerate: ${(stopwatch.elapsedMicroseconds / 1000000).toStringAsFixed(3)}s', | |
| ); | |
| // Cleanup | |
| calloc.free(A); | |
| calloc.free(B); | |
| calloc.free(C); | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| package main | |
| /* | |
| #cgo LDFLAGS: -framework Accelerate | |
| #include <Accelerate/Accelerate.h> | |
| */ | |
| import "C" | |
| import ( | |
| "fmt" | |
| "math/rand" | |
| "time" | |
| "unsafe" | |
| ) | |
| func multiplyBLAS(A, B, C []float32, n int) { | |
| // Convert Go slices to C pointers | |
| pA := (*C.float)(unsafe.Pointer(&A[0])) | |
| pB := (*C.float)(unsafe.Pointer(&B[0])) | |
| pC := (*C.float)(unsafe.Pointer(&C[0])) | |
| // Call cblas_sgemm from Accelerate | |
| C.cblas_sgemm( | |
| C.CblasRowMajor, | |
| C.CblasNoTrans, | |
| C.CblasNoTrans, | |
| C.int(n), | |
| C.int(n), | |
| C.int(n), | |
| 1.0, | |
| pA, | |
| C.int(n), | |
| pB, | |
| C.int(n), | |
| 0.0, | |
| pC, | |
| C.int(n), | |
| ) | |
| } | |
| func main() { | |
| n := 1024 | |
| A := make([]float32, n*n) | |
| B := make([]float32, n*n) | |
| C := make([]float32, n*n) | |
| // Fill with random data | |
| for i := range A { | |
| A[i] = rand.Float32() | |
| B[i] = rand.Float32() | |
| } | |
| start := time.Now() | |
| multiplyBLAS(A, B, C, n) | |
| elapsed := time.Since(start) | |
| fmt.Printf("Go + Accelerate: %.3fs\n", elapsed.Seconds()) | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import { dlopen, FFIType, suffix } from 'bun:ffi'; | |
| // Load Accelerate framework | |
| const lib = dlopen('/System/Library/Frameworks/Accelerate.framework/Accelerate', { | |
| cblas_sgemm: { | |
| args: [ | |
| FFIType.i32, // Order | |
| FFIType.i32, // TransA | |
| FFIType.i32, // TransB | |
| FFIType.i32, // M | |
| FFIType.i32, // N | |
| FFIType.i32, // K | |
| FFIType.f32, // alpha | |
| FFIType.ptr, // A | |
| FFIType.i32, // lda | |
| FFIType.ptr, // B | |
| FFIType.i32, // ldb | |
| FFIType.f32, // beta | |
| FFIType.ptr, // C | |
| FFIType.i32, // ldc | |
| ], | |
| returns: FFIType.void, | |
| }, | |
| }); | |
| function multiplyBLAS(n) { | |
| const size = n * n; | |
| // Allocate aligned memory | |
| const A = new Float32Array(size); | |
| const B = new Float32Array(size); | |
| const C = new Float32Array(size); | |
| // Fill with random data | |
| for (let i = 0; i < size; i++) { | |
| A[i] = Math.random(); | |
| B[i] = Math.random(); | |
| } | |
| const start = performance.now(); | |
| lib.symbols.cblas_sgemm( | |
| 101, // CblasRowMajor | |
| 111, // CblasNoTrans | |
| 111, // CblasNoTrans | |
| n, | |
| n, | |
| n, | |
| 1.0, | |
| A, | |
| n, | |
| B, | |
| n, | |
| 0.0, | |
| C, | |
| n, | |
| ); | |
| const end = performance.now(); | |
| console.log(`Bun + Accelerate: ${((end - start) / 1000).toFixed(3)}s`); | |
| } | |
| multiplyBLAS(1024); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #include <time.h> | |
| #include <math.h> | |
| // REQUIRED for the M1/Apple Silicon native BLAS implementation (Accelerate Framework) | |
| // gcc -O3 -Wall -Wextra Matmul.c -o Matmul -lm -framework Accelerate | |
| #include "cblas.h" | |
| // --- Utility Functions --- | |
| /** | |
| * @brief Finds the minimum of two integers. | |
| */ | |
| int min(int a, int b) { | |
| return (a < b) ? a : b; | |
| } | |
| /** | |
| * @brief Generates a flat, row-major N x M matrix with random floats (0.0 to 1.0). | |
| * @param rows The number of rows (N). | |
| * @param cols The number of columns (M). | |
| * @return A pointer to the allocated float array. Must be freed by the caller. | |
| */ | |
| float *generate_random_matrix(int rows, int cols) { | |
| float *matrix = (float *)malloc(rows * cols * sizeof(float)); | |
| if (matrix == NULL) { | |
| perror("Memory allocation failed for matrix"); | |
| exit(EXIT_FAILURE); | |
| } | |
| for (int i = 0; i < rows * cols; i++) { | |
| // Generate random float between 0.0 and 1.0 | |
| matrix[i] = (float)rand() / (float)RAND_MAX; | |
| } | |
| return matrix; | |
| } | |
| /** | |
| * @brief Creates the transpose of a matrix (B_T from B). | |
| * @param B The original K x M matrix. | |
| * @param K, M The dimensions of B. | |
| * @return A pointer to the allocated M x K transposed float array (B_T). | |
| */ | |
| float *transpose_matrix(const float *B, int K, int M) { | |
| float *B_T = (float *)malloc(M * K * sizeof(float)); | |
| if (B_T == NULL) { | |
| perror("Memory allocation failed for transposed matrix"); | |
| exit(EXIT_FAILURE); | |
| } | |
| for (int k = 0; k < K; k++) { | |
| for (int m = 0; m < M; m++) { | |
| // B[k][m] goes to B_T[m][k] | |
| B_T[m * K + k] = B[k * M + m]; | |
| } | |
| } | |
| return B_T; | |
| } | |
| // --- Matrix Multiplication Versions --- | |
| // Define the function pointer type for matrix multiplication. | |
| // Use '__restrict' keyword for all pointers to assist the M1's compiler (clang). | |
| typedef void (*mm_func)(const float *__restrict, const float *__restrict, float *__restrict, int, int, int, int); | |
| /** | |
| * @brief Version 1: Naive (ijk order) matrix multiplication. | |
| */ | |
| void multiply_naive_ijk(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int tile_size) { | |
| (void)tile_size; // Suppress unused parameter warning | |
| for (int i = 0; i < N; i++) { | |
| for (int j = 0; j < M; j++) { | |
| float sum = 0.0f; | |
| for (int k = 0; k < K; k++) { | |
| // A[i][k] * B[k][j]. B access is column-major (cache-unfriendly) | |
| sum += A[i * K + k] * B[k * M + j]; | |
| } | |
| C[i * M + j] = sum; | |
| } | |
| } | |
| } | |
| /** | |
| * @brief Version 2: Reordered (ikj order - cache friendly for A and C). | |
| */ | |
| void multiply_reordered_ikj(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int tile_size) { | |
| (void)tile_size; // Suppress unused parameter warning | |
| for (int i = 0; i < N; i++) { | |
| int c_row_offset = i * M; | |
| for (int k = 0; k < K; k++) { | |
| // Load A[i][k] once per inner loop | |
| float a_val = A[i * K + k]; | |
| int b_row_offset = k * M; | |
| // Inner loop iterates over B row and C row (cache-friendly strides) | |
| for (int j = 0; j < M; j++) { | |
| C[c_row_offset + j] += a_val * B[b_row_offset + j]; | |
| } | |
| } | |
| } | |
| } | |
| /** | |
| * @brief Version 3: Tiled/Blocked matrix multiplication (ikj order inside blocks). | |
| * Exploits L1/L2 cache locality with block size. | |
| * @param TILE_SIZE The size of the square tile to use. | |
| */ | |
| void multiply_tiled(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int TILE_SIZE) { | |
| for (int i_tile = 0; i_tile < N; i_tile += TILE_SIZE) { | |
| for (int j_tile = 0; j_tile < M; j_tile += TILE_SIZE) { | |
| for (int k_tile = 0; k_tile < K; k_tile += TILE_SIZE) { | |
| int i_end = min(i_tile + TILE_SIZE, N); | |
| int j_end = min(j_tile + TILE_SIZE, M); | |
| int k_end = min(k_tile + TILE_SIZE, K); | |
| for (int i = i_tile; i < i_end; i++) { | |
| int c_row_offset = i * M; | |
| for (int k = k_tile; k < k_end; k++) { | |
| float a_val = A[i * K + k]; | |
| int b_row_offset = k * M; | |
| for (int j = j_tile; j < j_end; j++) { | |
| C[c_row_offset + j] += a_val * B[b_row_offset + j]; | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| /** | |
| * @brief Version 4: Transposed Dot Product (A * B_T). M1 Optimized. | |
| * This function expects B to be pre-transposed and K/M to be swapped for B_T. | |
| * @param A Matrix N x K. | |
| * @param B_T Matrix M x K (transposed B). | |
| * @param C Matrix N x M. | |
| * @param N, K, M Dimensions. | |
| */ | |
| void multiply_transposed_dot(const float *__restrict A, const float *__restrict B_T, float *__restrict C, int N, int K, int M, int tile_size) { | |
| (void)tile_size; // Suppress unused parameter warning | |
| // This turns the inner loop into a highly cache-friendly dot product of: | |
| // Row i of A (length K) and Row j of B_T (which is Column j of B, length K) | |
| for (int i = 0; i < N; i++) { | |
| for (int j = 0; j < M; j++) { | |
| float sum = 0.0f; | |
| int a_offset = i * K; | |
| int b_t_offset = j * K; | |
| // Inner loop iterates over K (Dot Product) - all accesses are sequential | |
| for (int k = 0; k < K; k++) { | |
| // A[i][k] * B_T[j][k] | |
| sum += A[a_offset + k] * B_T[b_t_offset + k]; | |
| } | |
| C[i * M + j] = sum; | |
| } | |
| } | |
| } | |
| /** | |
| * @brief Version 5: Transposed Dot Product with Loop Unrolling (Factor 4). | |
| * Unrolls the innermost dot product loop (k-loop) by a factor of 4 to reduce | |
| * loop overhead and potentially expose more ILP (Instruction-Level Parallelism). | |
| * This is the version you requested to include! | |
| */ | |
| void multiply_transposed_unrolled(const float *__restrict A, const float *__restrict B_T, float *__restrict C, int N, int K, int M, int tile_size) { | |
| (void)tile_size; // Suppress unused parameter warning | |
| // K_mod is the number of iterations that can be unrolled (must be a multiple of 4) | |
| int K_mod = K - (K % 4); | |
| for (int i = 0; i < N; i++) { | |
| for (int j = 0; j < M; j++) { | |
| float sum = 0.0f; | |
| int a_offset = i * K; | |
| int b_t_offset = j * K; | |
| int k = 0; | |
| // 1. Unrolled loop (k = 0 to K_mod - 1) | |
| for (; k < K_mod; k += 4) { | |
| // Calculate four multiplications and accumulations in one loop iteration | |
| sum += A[a_offset + k] * B_T[b_t_offset + k]; | |
| sum += A[a_offset + k + 1] * B_T[b_t_offset + k + 1]; | |
| sum += A[a_offset + k + 2] * B_T[b_t_offset + k + 2]; | |
| sum += A[a_offset + k + 3] * B_T[b_t_offset + k + 3]; | |
| } | |
| // 2. Cleanup loop (k = K_mod to K - 1) for remaining elements | |
| for (; k < K; k++) { | |
| sum += A[a_offset + k] * B_T[b_t_offset + k]; | |
| } | |
| C[i * M + j] = sum; | |
| } | |
| } | |
| } | |
| /** | |
| * @brief Version 6: Apple Accelerate/BLAS Optimized Matmul (cblas_sgemm). | |
| * This is the vendor-optimized method for peak performance on M1/M2/M3 chips. | |
| * This function performs C = alpha * A * B + beta * C. | |
| */ | |
| void multiply_cblas_sgemm(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int tile_size) { | |
| (void)tile_size; // Suppress unused parameter warning | |
| const float alpha = 1.0f; // Scale factor for A * B | |
| const float beta = 0.0f; // Scale factor for C (0.0 since C is calloc'd) | |
| // CblasRowMajor: Matrices A, B, and C are stored in row-major order (C-style) | |
| // CblasNoTrans: Neither A nor B is transposed | |
| cblas_sgemm(CblasRowMajor, | |
| CblasNoTrans, | |
| CblasNoTrans, | |
| // M, N, K refer to the dimensions in the CblasRowMajor context: | |
| N, // M: rows of A and C | |
| M, // N: columns of B and C | |
| K, // K: inner dimension (columns of A / rows of B) | |
| alpha, | |
| A, | |
| K, // lda (leading dimension of A): K columns | |
| B, | |
| M, // ldb (leading dimension of B): M columns | |
| beta, | |
| C, | |
| M); // ldc (leading dimension of C): M columns | |
| } | |
| // --- Benchmark Runner --- | |
| /** | |
| * @brief Benchmarks a matrix multiplication function. | |
| * Runs 3 warmup iterations and 5 measured iterations, returning the best time. | |
| * @param name The name of the test. | |
| * @param func The matrix multiplication function to test. | |
| * @param A, B The input matrices. | |
| * @param N, K, M The dimensions. | |
| * @param tile_size The tile size to pass (used only by tiled version). | |
| * @return The best time measured in seconds. | |
| */ | |
| double benchmark_runner(const char *name, mm_func func, const float *A, const float *B, int N, int K, int M, int tile_size) { | |
| // Silence 'unused parameter' warning, as printing is handled in main() | |
| (void)name; | |
| // Warmup (3 times) | |
| for (int i = 0; i < 3; i++) { | |
| // calloc zeroes memory which is necessary for C += ... operations | |
| float *C_dummy = (float *)calloc(N * M, sizeof(float)); | |
| if (C_dummy == NULL) { | |
| perror("Warmup memory allocation failed"); | |
| exit(EXIT_FAILURE); | |
| } | |
| func(A, B, C_dummy, N, K, M, tile_size); | |
| free(C_dummy); | |
| } | |
| // Measure (take best of 5) | |
| double best_time = 0.0; | |
| for (int i = 0; i < 5; i++) { | |
| // Allocate and zero the result matrix C for each run | |
| float *C = (float *)calloc(N * M, sizeof(float)); | |
| if (C == NULL) { | |
| perror("Measurement memory allocation failed"); | |
| exit(EXIT_FAILURE); | |
| } | |
| // Use clock() for CPU time measurement | |
| clock_t start = clock(); | |
| func(A, B, C, N, K, M, tile_size); | |
| clock_t end = clock(); | |
| free(C); | |
| double elapsed_sec = (double)(end - start) / CLOCKS_PER_SEC; | |
| if (i == 0 || elapsed_sec < best_time) { | |
| best_time = elapsed_sec; | |
| } | |
| } | |
| return best_time; | |
| } | |
| // --- Main Program --- | |
| int main() { | |
| // Seed the random number generator once | |
| srand((unsigned int)time(NULL)); | |
| printf("C17 Matrix Multiplication Benchmark (M1 Pro Optimized)\n"); | |
| printf("====================================================\n\n"); | |
| int sizes[] = {256, 512, 1024, 2048}; // Added 2048 to push the M1's cache limits | |
| int num_sizes = sizeof(sizes) / sizeof(sizes[0]); | |
| int TILE_SIZE = 48; // Adjusted tile size to be slightly more aggressive for L1/L2 cache | |
| for (int s = 0; s < num_sizes; s++) { | |
| int size = sizes[s]; | |
| // For square matrices, N=K=M=size | |
| int N = size; | |
| int K = size; | |
| int M = size; | |
| printf("======================================================================\n"); | |
| printf("Matrix size: %d x %d (Total Data: %.1f MB)\n", size, size, (double)(3.0 * size * size * sizeof(float)) / (1024.0 * 1024.0)); | |
| printf("======================================================================\n"); | |
| // Generate matrices A and B | |
| float *A = generate_random_matrix(N, K); | |
| float *B = generate_random_matrix(K, M); | |
| // Allocate B_T once for transposed benchmarks | |
| printf("\nPreprocessing (Transpose B)... "); | |
| float *B_T = transpose_matrix(B, K, M); | |
| printf("Done.\n"); | |
| printf("\n🔥 Warmup + Benchmark (3 warmup, 5 measured, best time):\n\n"); | |
| double reordered_time = 0.0; | |
| double current_time = 0.0; | |
| double transposed_time = 0.0; | |
| double unrolled_time = 0.0; | |
| double cblas_time = 0.0; | |
| // 1. Naive (ijk) - Skip for large matrices | |
| if (size <= 512) { | |
| current_time = benchmark_runner("1. Naive (ijk)", multiply_naive_ijk, A, B, N, K, M, 0); | |
| printf("1. Naive (ijk): %.3fs\n", current_time); | |
| } else { | |
| printf("1. Naive (ijk): [skipped - too slow]\n"); | |
| } | |
| // 2. Reordered (ikj) - BASELINE | |
| reordered_time = benchmark_runner("2. Reordered (ikj)", multiply_reordered_ikj, A, B, N, K, M, 0); | |
| printf("2. Reordered (ikj): %.3fs\n", reordered_time); | |
| // 3. Tiled/Blocked | |
| current_time = benchmark_runner("3. Tiled", multiply_tiled, A, B, N, K, M, TILE_SIZE); | |
| printf("3. Tiled (%d): %.3fs (%.2fx)\n", | |
| TILE_SIZE, current_time, reordered_time / current_time); | |
| // 4. Transposed Dot Product (using B_T) | |
| transposed_time = benchmark_runner("4. Transposed Dot Product", multiply_transposed_dot, A, B_T, N, K, M, 0); | |
| printf("4. Transposed Dot Product: %.3fs (%.2fx)\n", | |
| transposed_time, reordered_time / transposed_time); | |
| // 5. Transposed Dot Product with Unrolling (using B_T) | |
| unrolled_time = benchmark_runner("5. Transposed Unrolled Dot", multiply_transposed_unrolled, A, B_T, N, K, M, 0); | |
| printf("5. Transposed Unrolled Dot (x4): %.3fs (%.2fx)\n", | |
| unrolled_time, reordered_time / unrolled_time); | |
| // 6. M1 Native BLAS (cblas_sgemm) | |
| cblas_time = benchmark_runner("6. M1 Native BLAS (cblas_sgemm)", multiply_cblas_sgemm, A, B, N, K, M, 0); | |
| printf("6. M1 Native BLAS (cblas_sgemm): %.3fs (%.2fx) <--- NATIVE SPEED\n", | |
| cblas_time, reordered_time / cblas_time); | |
| // Clean up memory for the current size | |
| free(A); | |
| free(B); | |
| free(B_T); // Free the transposed matrix | |
| printf("\n"); | |
| } | |
| printf("====================================================\n"); | |
| printf("Benchmark Complete.\n"); | |
| return 0; | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import 'dart:math'; | |
| import 'dart:typed_data'; | |
| import 'dart:io'; | |
| // A record to hold the name and time of a benchmark result. | |
| typedef BenchmarkResult = ({String name, double time}); | |
| /// Generates a matrix of a given size with random double values. | |
| List<List<double>> randomMatrix(int rows, int cols) { | |
| final random = Random(); | |
| return List.generate( | |
| rows, | |
| (_) => List.generate(cols, (_) => random.nextDouble()), | |
| ); | |
| } | |
| /// Version 1: Naive (ijk order) matrix multiplication. | |
| List<List<double>> naiveMultiply(List<List<double>> A, List<List<double>> B) { | |
| final n = A.length; | |
| final k = A[0].length; | |
| final m = B[0].length; | |
| final C = List.generate(n, (_) => List.filled(m, 0.0)); | |
| for (int i = 0; i < n; i++) { | |
| for (int j = 0; j < m; j++) { | |
| double sum = 0.0; | |
| for (int kIdx = 0; kIdx < k; kIdx++) { | |
| sum += A[i][kIdx] * B[kIdx][j]; | |
| } | |
| C[i][j] = sum; | |
| } | |
| } | |
| return C; | |
| } | |
| /// Version 2: Reordered (ikj order) for better cache performance. | |
| List<List<double>> reorderedMultiply( | |
| List<List<double>> A, | |
| List<List<double>> B, | |
| ) { | |
| final n = A.length; | |
| final k = A[0].length; | |
| final m = B[0].length; | |
| final C = List.generate(n, (_) => List.filled(m, 0.0)); | |
| for (int i = 0; i < n; i++) { | |
| for (int kIdx = 0; kIdx < k; kIdx++) { | |
| final aVal = A[i][kIdx]; | |
| for (int j = 0; j < m; j++) { | |
| C[i][j] += aVal * B[kIdx][j]; | |
| } | |
| } | |
| } | |
| return C; | |
| } | |
| /// Version 3: Tiled/Blocked matrix multiplication. | |
| List<List<double>> tiledMultiply( | |
| List<List<double>> A, | |
| List<List<double>> B, | |
| int tileSize, | |
| ) { | |
| final n = A.length; | |
| final k = A[0].length; | |
| final m = B[0].length; | |
| final C = List.generate(n, (_) => List.filled(m, 0.0)); | |
| for (int iTile = 0; iTile < n; iTile += tileSize) { | |
| for (int jTile = 0; jTile < m; jTile += tileSize) { | |
| for (int kTile = 0; kTile < k; kTile += tileSize) { | |
| final iEnd = min(iTile + tileSize, n); | |
| final jEnd = min(jTile + tileSize, m); | |
| final kEnd = min(kTile + tileSize, k); | |
| for (int i = iTile; i < iEnd; i++) { | |
| for (int kIdx = kTile; kIdx < kEnd; kIdx++) { | |
| final aVal = A[i][kIdx]; | |
| for (int j = jTile; j < jEnd; j++) { | |
| C[i][j] += aVal * B[kIdx][j]; | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| return C; | |
| } | |
| /// Version 4: Manual loop unrolling by a factor of 4. | |
| List<List<double>> unrolled4Multiply( | |
| List<List<double>> A, | |
| List<List<double>> B, | |
| ) { | |
| final n = A.length; | |
| final k = A[0].length; | |
| final m = B[0].length; | |
| final C = List.generate(n, (_) => List.filled(m, 0.0)); | |
| for (int i = 0; i < n; i++) { | |
| for (int kIdx = 0; kIdx < k; kIdx++) { | |
| final aVal = A[i][kIdx]; | |
| int j = 0; | |
| // Process 4 elements at a time | |
| for (; j <= m - 4; j += 4) { | |
| C[i][j] += aVal * B[kIdx][j]; | |
| C[i][j + 1] += aVal * B[kIdx][j + 1]; | |
| C[i][j + 2] += aVal * B[kIdx][j + 2]; | |
| C[i][j + 3] += aVal * B[kIdx][j + 3]; | |
| } | |
| // Handle the remainder | |
| for (; j < m; j++) { | |
| C[i][j] += aVal * B[kIdx][j]; | |
| } | |
| } | |
| } | |
| return C; | |
| } | |
| /// Version 5: Uses flat Float32List to avoid 2D list overhead. | |
| Float32List flatArrayMultiply( | |
| Float32List A, | |
| Float32List B, | |
| int n, | |
| int k, | |
| int m, | |
| ) { | |
| final C = Float32List(n * m); // Automatically initialized to 0 | |
| for (int i = 0; i < n; i++) { | |
| for (int kIdx = 0; kIdx < k; kIdx++) { | |
| final aVal = A[i * k + kIdx]; | |
| final bRowOffset = kIdx * m; | |
| final cRowOffset = i * m; | |
| for (int j = 0; j < m; j++) { | |
| C[cRowOffset + j] += aVal * B[bRowOffset + j]; | |
| } | |
| } | |
| } | |
| return C; | |
| } | |
| /// Helper to convert a 2D list to a flat Float32List. | |
| Float32List toFlatList(List<List<double>> matrix) { | |
| final rows = matrix.length; | |
| if (rows == 0) return Float32List(0); | |
| final cols = matrix[0].length; | |
| final flat = Float32List(rows * cols); | |
| for (int i = 0; i < rows; i++) { | |
| for (int j = 0; j < cols; j++) { | |
| flat[i * cols + j] = matrix[i][j]; | |
| } | |
| } | |
| return flat; | |
| } | |
| /// Runs a function multiple times to allow the JIT compiler to warm up. | |
| void warmupFunction(String name, void Function() fn) { | |
| print(' Warming up: $name...'); | |
| const iterations = 10; | |
| for (int i = 0; i < iterations; i++) { | |
| fn(); | |
| } | |
| } | |
| /// Benchmarks a function by running it multiple times and taking the minimum time. | |
| BenchmarkResult benchmark(String name, Function fn) { | |
| const iterations = 5; | |
| final times = <double>[]; | |
| final stopwatch = Stopwatch(); | |
| for (int i = 0; i < iterations; i++) { | |
| stopwatch.start(); | |
| fn(); | |
| stopwatch.stop(); | |
| // Convert microseconds to seconds | |
| times.add(stopwatch.elapsedMicroseconds / 1000000.0); | |
| stopwatch.reset(); | |
| } | |
| // Take the minimum time (best run = least interference) | |
| final time = times.reduce(min); | |
| return (name: name, time: time); | |
| } | |
| /// Main test runner with proper warmup. | |
| void runTests() { | |
| print('Dart Matrix Multiplication - PROPER WARMUP VERSION\n'); | |
| print('Runtime: Dart ${Platform.version.split(' ').first}'); | |
| print('Platform: ${Platform.operatingSystem}\n'); | |
| final sizes = [256, 512, 1024]; | |
| for (final size in sizes) { | |
| print('='.padRight(70, '=')); | |
| final mem = (3 * size * size * 4) / (1024 * 1024); | |
| print('Matrix size: ${size} x ${size} (${mem.toStringAsFixed(1)} MB)'); | |
| print('='.padRight(70, '=')); | |
| final A = randomMatrix(size, size); | |
| final B = randomMatrix(size, size); | |
| final A_flat = toFlatList(A); | |
| final B_flat = toFlatList(B); | |
| print('\n🔥 WARMUP PHASE (10 iterations each):'); | |
| // Warmup all functions | |
| if (size <= 512) { | |
| warmupFunction('naiveMultiply', () => naiveMultiply(A, B)); | |
| } | |
| warmupFunction('reorderedMultiply', () => reorderedMultiply(A, B)); | |
| warmupFunction('tiledMultiply', () => tiledMultiply(A, B, 64)); | |
| warmupFunction('unrolled4Multiply', () => unrolled4Multiply(A, B)); | |
| warmupFunction( | |
| 'flatArrayMultiply', | |
| () => flatArrayMultiply(A_flat, B_flat, size, size, size), | |
| ); | |
| print('\n⚡ BENCHMARKING (5 runs, taking best):'); | |
| final results = <BenchmarkResult>[]; | |
| // Test 1: Naive | |
| if (size <= 512) { | |
| final r1 = benchmark('Naive (ijk)', () => naiveMultiply(A, B)); | |
| results.add(r1); | |
| print('1. Naive (ijk): ${r1.time.toStringAsFixed(3)}s'); | |
| } else { | |
| print('1. Naive (ijk): [skipped - too slow]'); | |
| } | |
| // Test 2: Reordered (Baseline) | |
| final r2 = benchmark('Reordered (ikj)', () => reorderedMultiply(A, B)); | |
| results.add(r2); | |
| print('2. Reordered (ikj): ${r2.time.toStringAsFixed(3)}s'); | |
| final baseTime = r2.time; | |
| // Test 3: Tiled | |
| const tileSize = 64; | |
| final r3 = benchmark( | |
| 'Tiled (${tileSize}x$tileSize)', | |
| () => tiledMultiply(A, B, tileSize), | |
| ); | |
| results.add(r3); | |
| print( | |
| '3. Tiled ($tileSize): ${r3.time.toStringAsFixed(3)}s (${(baseTime / r3.time).toStringAsFixed(2)}x)', | |
| ); | |
| // Test 4: Unrolled | |
| final r4 = benchmark('Unrolled 4x', () => unrolled4Multiply(A, B)); | |
| results.add(r4); | |
| print( | |
| '4. Unrolled 4x: ${r4.time.toStringAsFixed(3)}s (${(baseTime / r4.time).toStringAsFixed(2)}x)', | |
| ); | |
| // Test 5: Flat arrays (TypedData) | |
| final r5 = benchmark( | |
| 'Flat Float32List', | |
| () => flatArrayMultiply(A_flat, B_flat, size, size, size), | |
| ); | |
| results.add(r5); | |
| print( | |
| '5. Flat Float32List: ${r5.time.toStringAsFixed(3)}s (${(baseTime / r5.time).toStringAsFixed(2)}x)', | |
| ); | |
| // Find the best performing version | |
| final best = results.reduce((a, b) => a.time < b.time ? a : b); | |
| final baseline = results.firstWhere((r) => r.name == 'Reordered (ikj)'); | |
| print('\n✨ Winner: ${best.name} (${best.time.toStringAsFixed(3)}s)'); | |
| print( | |
| '📊 Speedup vs Reordered: ${(baseline.time / best.time).toStringAsFixed(2)}x\n', | |
| ); | |
| } | |
| } | |
| void main() { | |
| runTests(); | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| package main | |
| import ( | |
| "fmt" | |
| "math/rand" | |
| "strings" | |
| "time" | |
| ) | |
| // Generate random matrix | |
| func randomMatrix(rows, cols int) [][]float32 { | |
| matrix := make([][]float32, rows) | |
| for i := range matrix { | |
| matrix[i] = make([]float32, cols) | |
| for j := range matrix[i] { | |
| matrix[i][j] = rand.Float32() | |
| } | |
| } | |
| return matrix | |
| } | |
| // Version 1: Naive (ijk order) | |
| func naiveMultiply(A, B [][]float32) [][]float32 { | |
| n := len(A) | |
| k := len(A[0]) | |
| m := len(B[0]) | |
| C := make([][]float32, n) | |
| for i := range C { | |
| C[i] = make([]float32, m) | |
| } | |
| for i := 0; i < n; i++ { | |
| for j := 0; j < m; j++ { | |
| var sum float32 | |
| for kIdx := 0; kIdx < k; kIdx++ { | |
| sum += A[i][kIdx] * B[kIdx][j] | |
| } | |
| C[i][j] = sum | |
| } | |
| } | |
| return C | |
| } | |
| // Version 2: Reordered (ikj order - cache friendly) | |
| func reorderedMultiply(A, B [][]float32) [][]float32 { | |
| n := len(A) | |
| k := len(A[0]) | |
| m := len(B[0]) | |
| C := make([][]float32, n) | |
| for i := range C { | |
| C[i] = make([]float32, m) | |
| } | |
| for i := 0; i < n; i++ { | |
| for kIdx := 0; kIdx < k; kIdx++ { | |
| aVal := A[i][kIdx] | |
| for j := 0; j < m; j++ { | |
| C[i][j] += aVal * B[kIdx][j] | |
| } | |
| } | |
| } | |
| return C | |
| } | |
| // Version 3: Tiled/Blocked | |
| func tiledMultiply(A, B [][]float32, tileSize int) [][]float32 { | |
| n := len(A) | |
| k := len(A[0]) | |
| m := len(B[0]) | |
| C := make([][]float32, n) | |
| for i := range C { | |
| C[i] = make([]float32, m) | |
| } | |
| for iTile := 0; iTile < n; iTile += tileSize { | |
| for jTile := 0; jTile < m; jTile += tileSize { | |
| for kTile := 0; kTile < k; kTile += tileSize { | |
| iEnd := min(iTile+tileSize, n) | |
| jEnd := min(jTile+tileSize, m) | |
| kEnd := min(kTile+tileSize, k) | |
| for i := iTile; i < iEnd; i++ { | |
| for kIdx := kTile; kIdx < kEnd; kIdx++ { | |
| aVal := A[i][kIdx] | |
| for j := jTile; j < jEnd; j++ { | |
| C[i][j] += aVal * B[kIdx][j] | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| return C | |
| } | |
| // Version 4: Manual unroll 4x | |
| func unrolled4Multiply(A, B [][]float32) [][]float32 { | |
| n := len(A) | |
| k := len(A[0]) | |
| m := len(B[0]) | |
| C := make([][]float32, n) | |
| for i := range C { | |
| C[i] = make([]float32, m) | |
| } | |
| for i := 0; i < n; i++ { | |
| for kIdx := 0; kIdx < k; kIdx++ { | |
| aVal := A[i][kIdx] | |
| j := 0 | |
| // Process 4 at a time | |
| for ; j <= m-4; j += 4 { | |
| C[i][j] += aVal * B[kIdx][j] | |
| C[i][j+1] += aVal * B[kIdx][j+1] | |
| C[i][j+2] += aVal * B[kIdx][j+2] | |
| C[i][j+3] += aVal * B[kIdx][j+3] | |
| } | |
| // Handle remainder | |
| for ; j < m; j++ { | |
| C[i][j] += aVal * B[kIdx][j] | |
| } | |
| } | |
| } | |
| return C | |
| } | |
| // Version 5: Flat slice (single allocation) | |
| func flatSliceMultiply(A, B []float32, n, k, m int) []float32 { | |
| C := make([]float32, n*m) | |
| for i := 0; i < n; i++ { | |
| for kIdx := 0; kIdx < k; kIdx++ { | |
| aVal := A[i*k+kIdx] | |
| bRowOffset := kIdx * m | |
| cRowOffset := i * m | |
| for j := 0; j < m; j++ { | |
| C[cRowOffset+j] += aVal * B[bRowOffset+j] | |
| } | |
| } | |
| } | |
| return C | |
| } | |
| // Version 6: Flat slice with unrolling | |
| func flatSliceUnrolled(A, B []float32, n, k, m int) []float32 { | |
| C := make([]float32, n*m) | |
| for i := 0; i < n; i++ { | |
| for kIdx := 0; kIdx < k; kIdx++ { | |
| aVal := A[i*k+kIdx] | |
| bRowOffset := kIdx * m | |
| cRowOffset := i * m | |
| j := 0 | |
| // Unroll by 8 | |
| for ; j <= m-8; j += 8 { | |
| C[cRowOffset+j] += aVal * B[bRowOffset+j] | |
| C[cRowOffset+j+1] += aVal * B[bRowOffset+j+1] | |
| C[cRowOffset+j+2] += aVal * B[bRowOffset+j+2] | |
| C[cRowOffset+j+3] += aVal * B[bRowOffset+j+3] | |
| C[cRowOffset+j+4] += aVal * B[bRowOffset+j+4] | |
| C[cRowOffset+j+5] += aVal * B[bRowOffset+j+5] | |
| C[cRowOffset+j+6] += aVal * B[bRowOffset+j+6] | |
| C[cRowOffset+j+7] += aVal * B[bRowOffset+j+7] | |
| } | |
| // Handle remainder | |
| for ; j < m; j++ { | |
| C[cRowOffset+j] += aVal * B[bRowOffset+j] | |
| } | |
| } | |
| } | |
| return C | |
| } | |
| // Helper to convert 2D to flat | |
| func toFlat(matrix [][]float32) []float32 { | |
| rows := len(matrix) | |
| cols := len(matrix[0]) | |
| flat := make([]float32, rows*cols) | |
| for i := 0; i < rows; i++ { | |
| for j := 0; j < cols; j++ { | |
| flat[i*cols+j] = matrix[i][j] | |
| } | |
| } | |
| return flat | |
| } | |
| // Benchmark helper that returns time | |
| func benchmark(name string, fn func()) time.Duration { | |
| // Warmup | |
| for i := 0; i < 3; i++ { | |
| fn() | |
| } | |
| // Measure (take best of 5) | |
| var bestTime time.Duration | |
| for i := 0; i < 5; i++ { | |
| start := time.Now() | |
| fn() | |
| elapsed := time.Since(start) | |
| if bestTime == 0 || elapsed < bestTime { | |
| bestTime = elapsed | |
| } | |
| } | |
| return bestTime | |
| } | |
| func main() { | |
| fmt.Println("Go 1.25 Matrix Multiplication Benchmark") | |
| fmt.Println("========================================\n") | |
| sizes := []int{256, 512, 1024} | |
| for _, size := range sizes { | |
| fmt.Println(strings.Repeat("=", 70)) | |
| fmt.Printf("Matrix size: %d x %d (%.1f MB)\n", size, size, float64(3*size*size*4)/(1024*1024)) | |
| fmt.Println(strings.Repeat("=", 70)) | |
| A := randomMatrix(size, size) | |
| B := randomMatrix(size, size) | |
| AFlat := toFlat(A) | |
| BFlat := toFlat(B) | |
| fmt.Println("\n🔥 Warmup + Benchmark (3 warmup, 5 measured, best time):") | |
| fmt.Println() | |
| // Test naive (skip for large) | |
| if size <= 512 { | |
| t := benchmark("1. Naive (ijk)", func() { | |
| _ = naiveMultiply(A, B) | |
| }) | |
| fmt.Printf("1. Naive (ijk): %.3fs\n", t.Seconds()) | |
| } else { | |
| fmt.Println("1. Naive (ijk): [skipped - too slow]") | |
| } | |
| // Test reordered (baseline) | |
| reorderedTime := benchmark("2. Reordered (ikj)", func() { | |
| _ = reorderedMultiply(A, B) | |
| }) | |
| fmt.Printf("2. Reordered (ikj): %.3fs\n", reorderedTime.Seconds()) | |
| // Test tiled | |
| tileSize := 64 | |
| tiledTime := benchmark(fmt.Sprintf("3. Tiled (%d)", tileSize), func() { | |
| _ = tiledMultiply(A, B, tileSize) | |
| }) | |
| fmt.Printf("3. Tiled (%d): %.3fs (%.2fx)\n", | |
| tileSize, tiledTime.Seconds(), reorderedTime.Seconds()/tiledTime.Seconds()) | |
| // Test unrolled 4x | |
| unrolled4Time := benchmark("4. Unrolled 4x (2D slices)", func() { | |
| _ = unrolled4Multiply(A, B) | |
| }) | |
| fmt.Printf("4. Unrolled 4x (2D slices): %.3fs (%.2fx)\n", | |
| unrolled4Time.Seconds(), reorderedTime.Seconds()/unrolled4Time.Seconds()) | |
| // Test flat slice | |
| flatTime := benchmark("5. Flat slice", func() { | |
| _ = flatSliceMultiply(AFlat, BFlat, size, size, size) | |
| }) | |
| fmt.Printf("5. Flat slice: %.3fs (%.2fx)\n", | |
| flatTime.Seconds(), reorderedTime.Seconds()/flatTime.Seconds()) | |
| // Test flat slice unrolled | |
| flatUnrolledTime := benchmark("6. Flat slice + unroll 8x", func() { | |
| _ = flatSliceUnrolled(AFlat, BFlat, size, size, size) | |
| }) | |
| fmt.Printf("6. Flat slice + unroll 8x: %.3fs (%.2fx)\n", | |
| flatUnrolledTime.Seconds(), reorderedTime.Seconds()/flatUnrolledTime.Seconds()) | |
| // Find winner | |
| times := map[string]time.Duration{ | |
| "Reordered": reorderedTime, | |
| "Tiled": tiledTime, | |
| "Unrolled 4x": unrolled4Time, | |
| "Flat slice": flatTime, | |
| "Flat + unroll 8x": flatUnrolledTime, | |
| } | |
| var bestName string | |
| var bestTime time.Duration | |
| for name, t := range times { | |
| if bestTime == 0 || t < bestTime { | |
| bestTime = t | |
| bestName = name | |
| } | |
| } | |
| fmt.Printf("\n✨ Winner: %s (%.3fs)\n", bestName, bestTime.Seconds()) | |
| fmt.Printf("📊 Speedup vs Reordered: %.2fx\n\n", reorderedTime.Seconds()/bestTime.Seconds()) | |
| } | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| public class Matmul { | |
| public static void main(String[] args) { | |
| System.out.println("Matrix Multiplication - Optimized for M1 (128-byte cache lines)\n"); | |
| // Cache line holds 128 bytes / 4 bytes per float = 32 floats | |
| final int FLOATS_PER_CACHE_LINE = 32; | |
| System.out.println("M1 Architecture:"); | |
| System.out.println(" Cache line size: 128 bytes"); | |
| System.out.println(" Floats per cache line: " + FLOATS_PER_CACHE_LINE); | |
| System.out.println(); | |
| int[] sizes = { 512, 1024, 2048 }; | |
| for (int size : sizes) { | |
| System.out.println("=".repeat(70)); | |
| System.out.printf("Matrix size: %d x %d (%.1f MB total)\n", size, size, (3.0 * size * size * 4) / (1024 * 1024)); | |
| System.out.println("=".repeat(70)); | |
| float[][] A = randomMatrix(size, size); | |
| float[][] B = randomMatrix(size, size); | |
| // 1. Naive (skip for large matrices) | |
| if (size <= 512) { | |
| long start = System.nanoTime(); | |
| float[][] C1 = naiveMultiply(A, B); | |
| double naiveTime = (System.nanoTime() - start) / 1e9; | |
| System.out.printf("1. Naive: %.3f seconds (baseline)\n", naiveTime); | |
| } else { | |
| System.out.println("1. Naive: [skipped - too slow!]"); | |
| } | |
| // 2. Reordered | |
| long start = System.nanoTime(); | |
| float[][] C2 = reorderedMultiply(A, B); | |
| double reorderedTime = (System.nanoTime() - start) / 1e9; | |
| System.out.printf("2. Reordered: %.3f seconds\n", reorderedTime); | |
| // 3. Cache-line aligned (process 32 floats at a time) | |
| start = System.nanoTime(); | |
| float[][] C3 = cacheLineAligned(A, B); | |
| double alignedTime = (System.nanoTime() - start) / 1e9; | |
| System.out.printf("3. Cache-line aligned: %.3f seconds (%.2fx vs reordered)\n", alignedTime, reorderedTime / alignedTime); | |
| // 4. Try different tile sizes (M1 has huge caches!) | |
| System.out.println("4. Tiled versions:"); | |
| int[] tileSizes = { 64, 128, 256, 512 }; | |
| double bestTime = Double.MAX_VALUE; | |
| int bestTile = 0; | |
| for (int tileSize : tileSizes) { | |
| start = System.nanoTime(); | |
| float[][] C4 = tiledMultiply(A, B, tileSize); | |
| double tiledTime = (System.nanoTime() - start) / 1e9; | |
| System.out.printf(" Tile size %4d: %.3f seconds (%.2fx vs reordered)\n", tileSize, tiledTime, reorderedTime / tiledTime); | |
| if (tiledTime < bestTime) { | |
| bestTime = tiledTime; | |
| bestTile = tileSize; | |
| } | |
| } | |
| System.out.printf("\n✨ Best configuration: Tile size %d (%.2fx faster than reordered)\n\n", bestTile, reorderedTime / bestTime); | |
| } | |
| } | |
| // NAIVE | |
| static float[][] naiveMultiply(float[][] A, float[][] B) { | |
| int n = A.length, | |
| k = A[0].length, | |
| m = B[0].length; | |
| float[][] C = new float[n][m]; | |
| for (int row = 0; row < n; row++) { | |
| for (int col = 0; col < m; col++) { | |
| for (int i = 0; i < k; i++) { | |
| C[row][col] += A[row][i] * B[i][col]; | |
| } | |
| } | |
| } | |
| return C; | |
| } | |
| // REORDERED | |
| static float[][] reorderedMultiply(float[][] A, float[][] B) { | |
| int n = A.length, | |
| k = A[0].length, | |
| m = B[0].length; | |
| float[][] C = new float[n][m]; | |
| for (int row = 0; row < n; row++) { | |
| for (int i = 0; i < k; i++) { | |
| float aValue = A[row][i]; | |
| for (int col = 0; col < m; col++) { | |
| C[row][col] += aValue * B[i][col]; | |
| } | |
| } | |
| } | |
| return C; | |
| } | |
| // CACHE-LINE ALIGNED: Process 32 floats at once (128 bytes) | |
| static float[][] cacheLineAligned(float[][] A, float[][] B) { | |
| int n = A.length, | |
| k = A[0].length, | |
| m = B[0].length; | |
| float[][] C = new float[n][m]; | |
| final int BLOCK = 32; // 32 floats = 128 bytes = 1 cache line on M1 | |
| for (int row = 0; row < n; row++) { | |
| for (int i = 0; i < k; i++) { | |
| float aValue = A[row][i]; | |
| // Process 32 columns at a time (1 cache line) | |
| int col = 0; | |
| for (; col <= m - BLOCK; col += BLOCK) { | |
| // These 32 accesses hit the same cache line! | |
| for (int j = 0; j < BLOCK; j++) { | |
| C[row][col + j] += aValue * B[i][col + j]; | |
| } | |
| } | |
| // Handle remaining columns | |
| for (; col < m; col++) { | |
| C[row][col] += aValue * B[i][col]; | |
| } | |
| } | |
| } | |
| return C; | |
| } | |
| // TILED | |
| static float[][] tiledMultiply(float[][] A, float[][] B, int tileSize) { | |
| int n = A.length, | |
| k = A[0].length, | |
| m = B[0].length; | |
| float[][] C = new float[n][m]; | |
| for (int rowTile = 0; rowTile < n; rowTile += tileSize) { | |
| for (int colTile = 0; colTile < m; colTile += tileSize) { | |
| for (int innerTile = 0; innerTile < k; innerTile += tileSize) { | |
| int rowEnd = Math.min(rowTile + tileSize, n); | |
| int colEnd = Math.min(colTile + tileSize, m); | |
| int innerEnd = Math.min(innerTile + tileSize, k); | |
| for (int row = rowTile; row < rowEnd; row++) { | |
| for (int i = innerTile; i < innerEnd; i++) { | |
| float aValue = A[row][i]; | |
| for (int col = colTile; col < colEnd; col++) { | |
| C[row][col] += aValue * B[i][col]; | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| return C; | |
| } | |
| static float[][] randomMatrix(int rows, int cols) { | |
| float[][] matrix = new float[rows][cols]; | |
| for (int i = 0; i < rows; i++) { | |
| for (int j = 0; j < cols; j++) { | |
| matrix[i][j] = (float) Math.random(); | |
| } | |
| } | |
| return matrix; | |
| } | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // Matrix Multiplication with PROPER WARMUP | |
| function randomMatrix(rows, cols) { | |
| const matrix = []; | |
| for (let i = 0; i < rows; i++) { | |
| matrix[i] = []; | |
| for (let j = 0; j < cols; j++) { | |
| matrix[i][j] = Math.random(); | |
| } | |
| } | |
| return matrix; | |
| } | |
| // Version 1: Naive (ijk order) | |
| function naiveMultiply(A, B) { | |
| const n = A.length; | |
| const k = A[0].length; | |
| const m = B[0].length; | |
| const C = Array(n) | |
| .fill(0) | |
| .map(() => Array(m).fill(0)); | |
| for (let i = 0; i < n; i++) { | |
| for (let j = 0; j < m; j++) { | |
| let sum = 0; | |
| for (let k_idx = 0; k_idx < k; k_idx++) { | |
| sum += A[i][k_idx] * B[k_idx][j]; | |
| } | |
| C[i][j] = sum; | |
| } | |
| } | |
| return C; | |
| } | |
| // Version 2: Reordered (ikj order - cache friendly) | |
| function reorderedMultiply(A, B) { | |
| const n = A.length; | |
| const k = A[0].length; | |
| const m = B[0].length; | |
| const C = Array(n) | |
| .fill(0) | |
| .map(() => Array(m).fill(0)); | |
| for (let i = 0; i < n; i++) { | |
| for (let k_idx = 0; k_idx < k; k_idx++) { | |
| const aVal = A[i][k_idx]; | |
| for (let j = 0; j < m; j++) { | |
| C[i][j] += aVal * B[k_idx][j]; | |
| } | |
| } | |
| } | |
| return C; | |
| } | |
| // Version 3: Tiled/Blocked | |
| function tiledMultiply(A, B, tileSize) { | |
| const n = A.length; | |
| const k = A[0].length; | |
| const m = B[0].length; | |
| const C = Array(n) | |
| .fill(0) | |
| .map(() => Array(m).fill(0)); | |
| for (let iTile = 0; iTile < n; iTile += tileSize) { | |
| for (let jTile = 0; jTile < m; jTile += tileSize) { | |
| for (let kTile = 0; kTile < k; kTile += tileSize) { | |
| const iEnd = Math.min(iTile + tileSize, n); | |
| const jEnd = Math.min(jTile + tileSize, m); | |
| const kEnd = Math.min(kTile + tileSize, k); | |
| for (let i = iTile; i < iEnd; i++) { | |
| for (let k_idx = kTile; k_idx < kEnd; k_idx++) { | |
| const aVal = A[i][k_idx]; | |
| for (let j = jTile; j < jEnd; j++) { | |
| C[i][j] += aVal * B[k_idx][j]; | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| return C; | |
| } | |
| // Version 4: Manual unroll 4x | |
| function unrolled4Multiply(A, B) { | |
| const n = A.length; | |
| const k = A[0].length; | |
| const m = B[0].length; | |
| const C = Array(n) | |
| .fill(0) | |
| .map(() => Array(m).fill(0)); | |
| for (let i = 0; i < n; i++) { | |
| for (let k_idx = 0; k_idx < k; k_idx++) { | |
| const aVal = A[i][k_idx]; | |
| let j = 0; | |
| // Process 4 at a time | |
| for (; j <= m - 4; j += 4) { | |
| C[i][j] += aVal * B[k_idx][j]; | |
| C[i][j + 1] += aVal * B[k_idx][j + 1]; | |
| C[i][j + 2] += aVal * B[k_idx][j + 2]; | |
| C[i][j + 3] += aVal * B[k_idx][j + 3]; | |
| } | |
| // Handle remainder | |
| for (; j < m; j++) { | |
| C[i][j] += aVal * B[k_idx][j]; | |
| } | |
| } | |
| } | |
| return C; | |
| } | |
| // Version 5: Flat arrays (avoid 2D array overhead) | |
| function flatArrayMultiply(A, B, n, k, m) { | |
| const C = new Float32Array(n * m); | |
| for (let i = 0; i < n; i++) { | |
| for (let k_idx = 0; k_idx < k; k_idx++) { | |
| const aVal = A[i * k + k_idx]; | |
| const bRowOffset = k_idx * m; | |
| const cRowOffset = i * m; | |
| for (let j = 0; j < m; j++) { | |
| C[cRowOffset + j] += aVal * B[bRowOffset + j]; | |
| } | |
| } | |
| } | |
| return C; | |
| } | |
| // Helper to convert 2D to flat | |
| function toFlatArray(matrix) { | |
| const rows = matrix.length; | |
| const cols = matrix[0].length; | |
| const flat = new Float32Array(rows * cols); | |
| for (let i = 0; i < rows; i++) { | |
| for (let j = 0; j < cols; j++) { | |
| flat[i * cols + j] = matrix[i][j]; | |
| } | |
| } | |
| return flat; | |
| } | |
| // PROPER warmup function | |
| function warmupFunction(fn, ...args) { | |
| console.log(` Warming up: ${fn.name}...`); | |
| const iterations = 10; | |
| for (let i = 0; i < iterations; i++) { | |
| fn(...args); | |
| } | |
| } | |
| // Benchmark function with minimal overhead | |
| function benchmark(name, fn, ...args) { | |
| const iterations = 5; | |
| const times = []; | |
| for (let i = 0; i < iterations; i++) { | |
| const start = performance.now(); | |
| const result = fn(...args); | |
| const end = performance.now(); | |
| times.push((end - start) / 1000); | |
| } | |
| // Take the minimum time (best run = least interference) | |
| const time = Math.min(...times); | |
| return { name, time }; | |
| } | |
| // Main test with proper warmup | |
| function runTests() { | |
| console.log('JavaScript Matrix Multiplication - PROPER WARMUP VERSION\n'); | |
| console.log(`Runtime: ${typeof process !== 'undefined' ? 'Node.js ' + process.version : 'Browser'}`); | |
| console.log(`Platform: ${typeof process !== 'undefined' ? process.platform + ' ' + process.arch : 'Browser'}\n`); | |
| const sizes = [256, 512, 1024]; | |
| for (const size of sizes) { | |
| console.log('='.repeat(70)); | |
| console.log(`Matrix size: ${size} x ${size} (${((3 * size * size * 4) / (1024 * 1024)).toFixed(1)} MB)`); | |
| console.log('='.repeat(70)); | |
| const A = randomMatrix(size, size); | |
| const B = randomMatrix(size, size); | |
| const A_flat = toFlatArray(A); | |
| const B_flat = toFlatArray(B); | |
| console.log('\n🔥 WARMUP PHASE (10 iterations each):'); | |
| // Warmup all functions | |
| if (size <= 512) { | |
| warmupFunction(naiveMultiply, A, B); | |
| } | |
| warmupFunction(reorderedMultiply, A, B); | |
| warmupFunction(tiledMultiply, A, B, 64); | |
| warmupFunction(unrolled4Multiply, A, B); | |
| warmupFunction(flatArrayMultiply, A_flat, B_flat, size, size, size); | |
| console.log('\n⚡ BENCHMARKING (5 runs, taking best):'); | |
| const results = []; | |
| // Test 1: Naive | |
| if (size <= 512) { | |
| const r1 = benchmark('Naive (ijk)', naiveMultiply, A, B); | |
| results.push(r1); | |
| console.log(`1. Naive (ijk): ${r1.time.toFixed(3)}s`); | |
| } else { | |
| console.log(`1. Naive (ijk): [skipped - too slow]`); | |
| } | |
| // Test 2: Reordered | |
| const r2 = benchmark('Reordered (ikj)', reorderedMultiply, A, B); | |
| results.push(r2); | |
| console.log(`2. Reordered (ikj): ${r2.time.toFixed(3)}s`); | |
| const baseTime = r2.time; | |
| // Test 3: Tiled | |
| const tileSize = 64; | |
| const r3 = benchmark(`Tiled (${tileSize}x${tileSize})`, tiledMultiply, A, B, tileSize); | |
| results.push(r3); | |
| console.log(`3. Tiled (${tileSize}): ${r3.time.toFixed(3)}s (${(baseTime / r3.time).toFixed(2)}x)`); | |
| // Test 4: Unrolled | |
| const r4 = benchmark('Unrolled 4x', unrolled4Multiply, A, B); | |
| results.push(r4); | |
| console.log(`4. Unrolled 4x: ${r4.time.toFixed(3)}s (${(baseTime / r4.time).toFixed(2)}x)`); | |
| // Test 5: Flat arrays (TypedArray) | |
| const r5 = benchmark('Flat Float32Array', flatArrayMultiply, A_flat, B_flat, size, size, size); | |
| results.push(r5); | |
| console.log(`5. Flat Float32Array: ${r5.time.toFixed(3)}s (${(baseTime / r5.time).toFixed(2)}x)`); | |
| // Find best | |
| const best = results.reduce((a, b) => (a.time < b.time ? a : b)); | |
| const baseline = results.find(r => r.name === 'Reordered (ikj)'); | |
| console.log(`\n✨ Winner: ${best.name} (${best.time.toFixed(3)}s)`); | |
| console.log(`📊 Speedup vs Reordered: ${(baseline.time / best.time).toFixed(2)}x\n`); | |
| } | |
| } | |
| // Run the tests | |
| runTests(); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #include <time.h> | |
| #include <math.h> | |
| #include <pthread.h> | |
| #include <stdbool.h> | |
| #include <string.h> | |
| #include <stdatomic.h> // Minimal synchronization: Atomic counter for job completion | |
| #include <sched.h> // For sched_yield() | |
| #include "cblas.h" | |
| // Check for NEON SIMD | |
| #ifdef __ARM_NEON | |
| #include <arm_neon.h> | |
| #define USE_NEON_SIMD 1 | |
| #else | |
| #define USE_NEON_SIMD 0 | |
| #endif | |
| // --- Constants and Global Settings --- | |
| #define TILE_SIZE 64 | |
| #define STRASSEN_CUTOFF 64 | |
| #define MAX_THREADS 8 | |
| #define VECTOR_LANE_COUNT 4 // Floats per NEON register | |
| // --- Type Definitions --- | |
| // Function signature for the execution block of work (used by workers) | |
| typedef void (*mm_block_func)(const float *A, const float *B, float *C, int N, int K, int M, int row_start, int row_end, int tile_size); | |
| // Function signature for the top-level benchmark runner (used by main) | |
| typedef void (*mm_func)(const float *__restrict, const float *__restrict, float *__restrict, int, int, int, int, const float *__restrict); | |
| /** | |
| * @brief Arguments passed to each persistent worker thread. | |
| */ | |
| typedef struct MatmulTaskArgs { | |
| int thread_id; | |
| const float *A; | |
| const float *B; | |
| float *C; | |
| int N, K, M; | |
| int row_start; | |
| int row_end; | |
| int tile_size; | |
| mm_block_func base_func; | |
| } MatmulTaskArgs; | |
| // --- Global Thread Pool (Minimal Synchronization using Atomicity) --- | |
| pthread_t g_workers[MAX_THREADS]; | |
| MatmulTaskArgs g_args[MAX_THREADS]; | |
| // Shared state for all workers to check for a new job or shutdown. | |
| atomic_bool g_running; // Set to false for shutdown | |
| atomic_int g_workers_ready; // Initial count: workers are ready to enter the job loop | |
| atomic_int g_workers_completed; // Counts workers finished the current job | |
| atomic_int g_job_counter; // THE FIX: Incremented by main thread to signal a NEW job | |
| int g_num_workers = 0; // Actual number of threads created | |
| // --- Utility Functions (Declarations) --- | |
| int min(int a, int b); | |
| float *generate_random_matrix(int rows, int cols); | |
| float *transpose_matrix(const float *B, int K, int M); | |
| double benchmark_runner(const char *name, mm_func mm_function, const float *A, const float *B, const float *B_T, int N, int K, int M, int tile_size); | |
| void multiply_naive_ijk_block(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int row_start, int row_end, int tile_size); | |
| void multiply_reordered_ikj_block(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int row_start, int row_end, int tile_size); | |
| void multiply_tiled_block(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int row_start, int row_end, int TILE_SIZE_ARG); | |
| void multiply_optimized_ikj_block(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int row_start, int row_end, int tile_size); | |
| void multiply_simd_transposed_block(const float *__restrict A, const float *__restrict B_T, float *__restrict C, int N, int K, int M, int row_start, int row_end, int tile_size); | |
| // Sequential wrappers (Declarations) | |
| void multiply_v2_sequential_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T); | |
| void multiply_v3_sequential_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T); | |
| void multiply_v11_sequential_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T); | |
| void multiply_cblas_sgemm_wrapper(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int tile_size, const float *B_T); | |
| void multiply_strassen_wrapper(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int tile_size, const float *B_T); | |
| // --- Thread Pool Worker Function --- | |
| /** | |
| * @brief The persistent worker thread loop. Waits for g_job_counter to increment. | |
| */ | |
| void* thread_pool_worker(void *arg) { | |
| MatmulTaskArgs *w_args = (MatmulTaskArgs *)arg; | |
| int local_job_id = 0; // Tracks the last job ID this worker processed | |
| // Initial signal: Tell the main thread this worker is ready to wait. | |
| atomic_fetch_add(&g_workers_ready, 1); | |
| while (atomic_load(&g_running)) { | |
| // --- 1. WAIT FOR NEW JOB SIGNAL --- | |
| // Wait until the global job counter is higher than the last job we processed. | |
| while (local_job_id == atomic_load(&g_job_counter) && atomic_load(&g_running)) { | |
| sched_yield(); // Yield CPU time | |
| } | |
| if (!atomic_load(&g_running)) break; | |
| // Get the new job ID BEFORE executing the task | |
| local_job_id = atomic_load(&g_job_counter); | |
| // --- 2. EXECUTE TASK (Arguments are guaranteed to be updated by main thread) --- | |
| if (w_args->base_func) { | |
| w_args->base_func(w_args->A, w_args->B, w_args->C, w_args->N, w_args->K, w_args->M, | |
| w_args->row_start, w_args->row_end, w_args->tile_size); | |
| } | |
| // --- 3. SIGNAL COMPLETION --- | |
| atomic_fetch_add(&g_workers_completed, 1); | |
| } | |
| return NULL; | |
| } | |
| // --- Thread Pool Management Functions --- | |
| /** | |
| * @brief Initializes the thread pool: creates and starts all worker threads. | |
| */ | |
| void initialize_thread_pool(int N) { | |
| g_num_workers = min(N, MAX_THREADS); | |
| // Initialize atomic variables | |
| atomic_store(&g_running, true); | |
| atomic_init(&g_workers_ready, 0); | |
| atomic_init(&g_workers_completed, 0); | |
| atomic_init(&g_job_counter, 0); | |
| for (int i = 0; i < g_num_workers; i++) { | |
| g_args[i].thread_id = i; | |
| g_args[i].base_func = NULL; | |
| if (pthread_create(&g_workers[i], NULL, thread_pool_worker, &g_args[i]) != 0) { | |
| perror("Failed to create worker thread"); | |
| g_num_workers = i; | |
| break; | |
| } | |
| } | |
| // Wait for all workers to reach their initial waiting state | |
| while (atomic_load(&g_workers_ready) < g_num_workers) { | |
| sched_yield(); | |
| } | |
| } | |
| /** | |
| * @brief Cleans up the thread pool: signals shutdown and joins all workers. | |
| */ | |
| void cleanup_thread_pool() { | |
| // Signal all workers to shut down | |
| atomic_store(&g_running, false); | |
| // Increment job counter one last time to wake any sleeping threads so they can read the shutdown signal | |
| atomic_fetch_add(&g_job_counter, 1); | |
| for (int i = 0; i < g_num_workers; i++) { | |
| pthread_join(g_workers[i], NULL); | |
| } | |
| g_num_workers = 0; | |
| } | |
| /** | |
| * @brief Dispatches a job to the cached thread pool and waits for completion. | |
| */ | |
| void dispatch_job(mm_block_func block_func, const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int tile_size) { | |
| if (g_num_workers == 0) return; | |
| // --- 1. SETUP JOB & WORK PARTITIONS --- | |
| int current_row = 0; | |
| int rows_per_thread = N / g_num_workers; | |
| int remaining_rows = N % g_num_workers; | |
| for (int i = 0; i < g_num_workers; i++) { | |
| MatmulTaskArgs *args = &g_args[i]; | |
| int row_start = current_row; | |
| int rows_to_process = rows_per_thread + (i < remaining_rows ? 1 : 0); | |
| int row_end = row_start + rows_to_process; | |
| // Update the arguments for the running worker threads | |
| args->A = A; | |
| args->B = B; | |
| args->C = C; | |
| args->N = N; | |
| args->K = K; | |
| args->M = M; | |
| args->row_start = row_start; | |
| args->row_end = row_end; | |
| args->tile_size = tile_size; | |
| args->base_func = block_func; | |
| current_row = row_end; | |
| } | |
| // --- 2. SIGNAL WORK START --- | |
| // Reset the completion counter BEFORE signaling the workers | |
| atomic_store(&g_workers_completed, 0); | |
| // Increment the job counter. This is the atomic signal that wakes the workers. | |
| atomic_fetch_add(&g_job_counter, 1); | |
| // --- 3. WAIT FOR COMPLETION --- | |
| // Wait until the completion counter reaches the total worker count | |
| while (atomic_load(&g_workers_completed) < g_num_workers) { | |
| sched_yield(); | |
| } | |
| } | |
| // --- Wrapper Functions (Warnings suppressed) --- | |
| void shared_parallel_wrapper(mm_block_func block_func, const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) { | |
| (void)B_T; | |
| dispatch_job(block_func, A, B, C, N, K, M, tile_size); | |
| } | |
| void multiply_v1_parallel_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) { | |
| (void)tile_size; | |
| shared_parallel_wrapper(multiply_naive_ijk_block, A, B, C, N, K, M, 0, B_T); | |
| } | |
| void multiply_v2_parallel_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) { | |
| (void)tile_size; | |
| shared_parallel_wrapper(multiply_reordered_ikj_block, A, B, C, N, K, M, 0, B_T); | |
| } | |
| void multiply_v3_parallel_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) { | |
| shared_parallel_wrapper(multiply_tiled_block, A, B, C, N, K, M, tile_size, B_T); | |
| } | |
| void multiply_v7_parallel_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) { | |
| (void)tile_size; | |
| shared_parallel_wrapper(multiply_optimized_ikj_block, A, B, C, N, K, M, 0, B_T); | |
| } | |
| void multiply_v11_parallel_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) { | |
| (void)B; | |
| (void)tile_size; | |
| // Pass B_T as the B argument for the transposed block function | |
| dispatch_job(multiply_simd_transposed_block, A, B_T, C, N, K, M, 0); | |
| } | |
| /** | |
| * @brief Wrapper for OpenBLAS cblas_sgemm (V9 in the overall set). | |
| */ | |
| void multiply_cblas_sgemm_wrapper(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int tile_size, const float *B_T) { | |
| (void)tile_size; (void)B_T; | |
| // Set to RowMajor order: C(N, M) = A(N, K) * B(K, M) | |
| // Transpose operations are handled internally by cblas_sgemm, | |
| // we assume standard row-major storage (CblasRowMajor). | |
| cblas_sgemm(CblasRowMajor, // Layout | |
| CblasNoTrans, // Transpose A? No. | |
| CblasNoTrans, // Transpose B? No. | |
| N, // Number of rows of A and C (N) | |
| M, // Number of columns of B and C (M) | |
| K, // Number of columns of A and rows of B (K) | |
| 1.0f, // Alpha (Scaling factor for A * B) | |
| A, // Matrix A | |
| K, // Leading dimension of A (K) | |
| B, // Matrix B | |
| M, // Leading dimension of B (M) | |
| 0.0f, // Beta (Scaling factor for C) | |
| C, // Matrix C | |
| M); // Leading dimension of C (M) | |
| } | |
| // --- Main Program (Driver) --- | |
| int main() { | |
| srand((unsigned int)time(NULL)); | |
| printf("C17 Matrix Multiplication Benchmark (11 Versions)\n"); | |
| printf("====================================================\n"); | |
| printf(" *** FIXED: Lock-Free Race Condition Eliminated ***\n"); | |
| printf(" *** Using an Atomic Job Counter for Reliable Synchronization ***\n"); | |
| printf("====================================================\n"); | |
| int sizes[] = {256, 512, 1024, 2048}; | |
| int num_sizes = sizeof(sizes) / sizeof(sizes[0]); | |
| for (int s = 0; s < num_sizes; s++) { | |
| int size = sizes[s]; | |
| int N = size; | |
| int K = size; | |
| int M = size; | |
| initialize_thread_pool(N); | |
| printf("======================================================================\n"); | |
| printf("Matrix size: %d x %d | Workers Used: %d | TILE_SIZE: %d\n", | |
| size, size, g_num_workers, TILE_SIZE); | |
| printf("SIMD Status: %s\n", USE_NEON_SIMD ? "NEON Active (Manual Intrinsic)" : "SIMD Disabled (C fallback)"); | |
| printf("======================================================================\n"); | |
| float *A = generate_random_matrix(N, K); | |
| float *B = generate_random_matrix(K, M); | |
| float *B_T = transpose_matrix(B, K, M); | |
| printf("\n🔥 Benchmark (5 measured, best time):\n\n"); | |
| double sequential_baseline_time = 0.0; | |
| // 2. Reordered (ikj) - Sequential (V2) - BASELINE | |
| sequential_baseline_time = benchmark_runner("2. Reordered (ikj) Sequential", multiply_v2_sequential_wrapper, A, B, B_T, N, K, M, 0); | |
| printf(" 2. Reordered (ikj) Sequential: %.3fs <--- BASELINE\n", sequential_baseline_time); | |
| // --- Parallel Implementations --- | |
| // 7. Reordered (ikj) Parallel | |
| double v7_time = benchmark_runner("7. Reordered (ikj) Parallel: ", multiply_v2_parallel_wrapper, A, B, B_T, N, K, M, 0); | |
| printf(" 7. Reordered (ikj) Parallel: %.3fs (%.2fx)\n", v7_time, sequential_baseline_time / v7_time); | |
| // 8. Tiled (ijk) Parallel (V3) | |
| double v8_time = benchmark_runner("8. Tiled (ijk) Parallel: ", multiply_v3_parallel_wrapper, A, B, B_T, N, K, M, TILE_SIZE); | |
| printf(" 8. Tiled (ijk) Parallel: %.3fs (%.2fx)\n", v8_time, sequential_baseline_time / v8_time); | |
| // 9. OpenBLAS cblas_sgemm (Library Standard) | |
| double v9_time = benchmark_runner("9. OpenBLAS cblas_sgemm: ", multiply_cblas_sgemm_wrapper, A, B, B_T, N, K, M, 0); | |
| printf(" 9. OpenBLAS cblas_sgemm: %.3fs (%.2fx) - LIBRARY\n", v9_time, sequential_baseline_time / v9_time); | |
| // 11. SIMD + Transposed Parallel (V11) | |
| double v11_time = benchmark_runner("11. SIMD + Transposed Parallel: ", multiply_v11_parallel_wrapper, A, B, B_T, N, K, M, 0); | |
| printf("11. SIMD + Transposed Parallel: %.3fs (%.2fx) - NEON\n", v11_time, sequential_baseline_time / v11_time); | |
| // Clean up memory | |
| free(A); | |
| free(B); | |
| free(B_T); | |
| cleanup_thread_pool(); | |
| printf("\n"); | |
| } | |
| printf("====================================================\n"); | |
| printf("Benchmark Complete.\n"); | |
| return 0; | |
| } | |
| // --- Implementation details for Block Functions (Warnings suppressed) --- | |
| void multiply_naive_ijk_block(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int row_start, int row_end, int tile_size) { | |
| (void)N; | |
| (void)tile_size; | |
| for (int i = row_start; i < row_end; i++) { | |
| for (int j = 0; j < M; j++) { | |
| float sum = 0.0f; | |
| for (int k = 0; k < K; k++) { sum += A[i * K + k] * B[k * M + j]; } | |
| C[i * M + j] = sum; | |
| } | |
| } | |
| } | |
| void multiply_reordered_ikj_block(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int row_start, int row_end, int tile_size) { | |
| (void)N; | |
| (void)tile_size; | |
| for (int i = row_start; i < row_end; i++) { | |
| int c_row_offset = i * M; | |
| for (int k = 0; k < K; k++) { | |
| float a_val = A[i * K + k]; | |
| int b_row_offset = k * M; | |
| for (int j = 0; j < M; j++) { C[c_row_offset + j] += a_val * B[b_row_offset + j]; } | |
| } | |
| } | |
| } | |
| void multiply_tiled_block(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int row_start, int row_end, int TILE_SIZE_ARG) { | |
| (void)N; | |
| for (int i_tile = row_start; i_tile < row_end; i_tile += TILE_SIZE_ARG) { | |
| for (int j_tile = 0; j_tile < M; j_tile += TILE_SIZE_ARG) { | |
| for (int k_tile = 0; k_tile < K; k_tile += TILE_SIZE_ARG) { | |
| int i_end = min(i_tile + TILE_SIZE_ARG, row_end); | |
| int j_end = min(j_tile + TILE_SIZE_ARG, M); | |
| int k_end = min(k_tile + TILE_SIZE_ARG, K); | |
| for (int i = i_tile; i < i_end; i++) { | |
| int c_row_offset = i * M; | |
| for (int k = k_tile; k < k_end; k++) { | |
| float a_val = A[i * K + k]; | |
| int b_row_offset = k * M; | |
| for (int j = j_tile; j < j_end; j++) { C[c_row_offset + j] += a_val * B[b_row_offset + j]; } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| void multiply_optimized_ikj_block(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int row_start, int row_end, int tile_size) { | |
| (void)N; | |
| (void)tile_size; | |
| for (int i = row_start; i < row_end; i++) { | |
| int c_row_offset = i * M; | |
| for (int k = 0; k < K; k++) { | |
| float a_val = A[i * K + k]; | |
| int b_row_offset = k * M; | |
| int j = 0; | |
| // Unrolled loop (V7 uses this) | |
| for (; j + 4 <= M; j += 4) { | |
| C[c_row_offset + j + 0] += a_val * B[b_row_offset + j + 0]; | |
| C[c_row_offset + j + 1] += a_val * B[b_row_offset + j + 1]; | |
| C[c_row_offset + j + 2] += a_val * B[b_row_offset + j + 2]; | |
| C[c_row_offset + j + 3] += a_val * B[b_row_offset + j + 3]; | |
| } | |
| for (; j < M; j++) { C[c_row_offset + j] += a_val * B[b_row_offset + j]; } | |
| } | |
| } | |
| } | |
| void multiply_simd_transposed_block(const float *__restrict A, const float *__restrict B_T, float *__restrict C, int N, int K, int M, int row_start, int row_end, int tile_size) { | |
| (void)N; | |
| (void)tile_size; | |
| for (int i = row_start; i < row_end; i++) { | |
| const float *A_row = A + i * K; | |
| float *C_row = C + i * M; | |
| for (int j = 0; j < M; j++) { | |
| const float *B_T_row = B_T + j * K; | |
| float sum = 0.0f; | |
| int k = 0; | |
| #if USE_NEON_SIMD | |
| // NEON Vectorization (4 floats at a time) | |
| float32x4_t dot_sum_v = vdupq_n_f32(0.0f); | |
| for (; k + VECTOR_LANE_COUNT <= K; k += VECTOR_LANE_COUNT) { | |
| // Load 4 floats from A row | |
| float32x4_t a_v = vld1q_f32(A_row + k); | |
| // Load 4 floats from B_T row (which is contiguous after transpose) | |
| float32x4_t b_t_v = vld1q_f32(B_T_row + k); | |
| // Multiply-accumulate: dot_sum_v = (a_v * b_t_v) + dot_sum_v | |
| dot_sum_v = vmlaq_f32(dot_sum_v, a_v, b_t_v); | |
| } | |
| // Horizontal sum: Sum up the 4 elements in the accumulator vector | |
| // Use vpadd, vpadd, vgetq_lane, vget_low/high. | |
| // Simplified for modern compilers/intrinsics: | |
| float32x2_t sum_pair = vpadd_f32(vget_low_f32(dot_sum_v), vget_high_f32(dot_sum_v)); | |
| sum_pair = vpadd_f32(sum_pair, sum_pair); // Sum the two elements | |
| sum = vget_lane_f32(sum_pair, 0); | |
| #endif // USE_NEON_SIMD | |
| // Scalar Cleanup Loop (Handles remaining elements if K is not a multiple of 4) | |
| for (; k < K; k++) { | |
| sum += A_row[k] * B_T_row[k]; | |
| } | |
| C_row[j] = sum; | |
| } | |
| } | |
| } | |
| // --- Implementation details for Utility Functions (Sequential wrappers and others) --- | |
| void multiply_v1_sequential_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) { | |
| (void)tile_size; (void)B_T; multiply_naive_ijk_block(A, B, C, N, K, M, 0, N, 0); | |
| } | |
| void multiply_v2_sequential_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) { | |
| (void)tile_size; (void)B_T; multiply_reordered_ikj_block(A, B, C, N, K, M, 0, N, 0); | |
| } | |
| void multiply_v3_sequential_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) { | |
| (void)B_T; multiply_tiled_block(A, B, C, N, K, M, 0, N, tile_size); | |
| } | |
| void multiply_v7_sequential_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) { | |
| (void)tile_size; (void)B_T; multiply_optimized_ikj_block(A, B, C, N, K, M, 0, N, 0); | |
| } | |
| void multiply_v11_sequential_wrapper(const float *A, const float *B, float *C, int N, int K, int M, int tile_size, const float *B_T) { | |
| (void)tile_size; (void)B; multiply_simd_transposed_block(A, B_T, C, N, K, M, 0, N, 0); | |
| } | |
| void multiply_strassen_wrapper(const float *__restrict A, const float *__restrict B, float *__restrict C, int N, int K, int M, int tile_size, const float *B_T) { | |
| (void)A; (void)B; (void)C; (void)N; (void)K; (void)M; (void)tile_size; (void)B_T; | |
| } | |
| int min(int a, int b) { return (a < b) ? a : b; } | |
| float *generate_random_matrix(int rows, int cols) { | |
| // Ensuring 16-byte alignment for NEON | |
| float *matrix = NULL; | |
| size_t total_bytes = (size_t)rows * cols * sizeof(float); | |
| // Use posix_memalign for 16-byte alignment, required for efficient SIMD loads | |
| if (posix_memalign((void**)&matrix, 16, total_bytes) != 0) { | |
| matrix = (float *)malloc(total_bytes); | |
| if (matrix != NULL) fprintf(stderr, "Warning: posix_memalign failed, using malloc (alignment may be suboptimal).\n"); | |
| } | |
| if (matrix == NULL) { perror("Memory allocation failed"); exit(EXIT_FAILURE); } | |
| for (size_t i = 0; i < (size_t)rows * cols; i++) { matrix[i] = (float)rand() / (float)RAND_MAX; } | |
| return matrix; | |
| } | |
| float *transpose_matrix(const float *B, int K, int M) { | |
| // Ensuring 16-byte alignment for NEON | |
| float *B_T = NULL; | |
| size_t total_bytes = (size_t)M * K * sizeof(float); | |
| if (posix_memalign((void**)&B_T, 16, total_bytes) != 0) { | |
| B_T = (float *)malloc(total_bytes); | |
| if (B_T != NULL) fprintf(stderr, "Warning: posix_memalign failed, using malloc for transpose (alignment may be suboptimal).\n"); | |
| } | |
| if (B_T == NULL) { perror("Memory allocation failed"); exit(EXIT_FAILURE); } | |
| for (int i = 0; i < K; i++) { | |
| for (int j = 0; j < M; j++) { B_T[j * K + i] = B[i * M + j]; } | |
| } | |
| return B_T; | |
| } | |
| double benchmark_runner(const char *name, mm_func mm_function, const float *A, const float *B, const float *B_T, int N, int K, int M, int tile_size) { | |
| (void)name; | |
| double best_time = 0.0; | |
| float *C = NULL; | |
| size_t total_bytes = (size_t)N * M * sizeof(float); | |
| if (posix_memalign((void**)&C, 16, total_bytes) != 0) { | |
| C = (float *)malloc(total_bytes); | |
| if (C != NULL) fprintf(stderr, "Warning: posix_memalign failed, using malloc for C (alignment may be suboptimal).\n"); | |
| } | |
| if (C == NULL) { perror("Measurement memory allocation failed"); exit(EXIT_FAILURE); } | |
| for (int i = 0; i < 5; i++) { | |
| // Must clear C matrix before each run | |
| memset(C, 0, total_bytes); | |
| clock_t start = clock(); | |
| mm_function(A, B, C, N, K, M, tile_size, B_T); | |
| clock_t end = clock(); | |
| double elapsed_sec = (double)(end - start) / CLOCKS_PER_SEC; | |
| if (i == 0 || elapsed_sec < best_time) { | |
| best_time = elapsed_sec; | |
| } | |
| } | |
| free(C); | |
| return best_time; | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| public class VectorizationTest { | |
| public static void main(String[] args) { | |
| System.out.println("Testing Java Auto-Vectorization on M1\n"); | |
| int size = 2048; | |
| float[][] A = randomMatrix(size, size); | |
| float[][] B = randomMatrix(size, size); | |
| // Warmup - let JIT do its magic | |
| System.out.println("Warming up JIT compiler..."); | |
| for (int i = 0; i < 5; i++) { | |
| scalarVersion(A, B); | |
| vectorFriendly(A, B); | |
| } | |
| System.out.println("Warmup complete!\n"); | |
| // Test 1: Scalar-friendly (hard to vectorize) | |
| long start = System.nanoTime(); | |
| float[][] C1 = scalarVersion(A, B); | |
| double scalarTime = (System.nanoTime() - start) / 1e9; | |
| System.out.printf("1. Scalar-friendly version: %.3f seconds\n", scalarTime); | |
| // Test 2: Vector-friendly (easy to vectorize) | |
| start = System.nanoTime(); | |
| float[][] C2 = vectorFriendly(A, B); | |
| double vectorTime = (System.nanoTime() - start) / 1e9; | |
| System.out.printf("2. Vector-friendly version: %.3f seconds\n", vectorTime); | |
| System.out.printf("\nSpeedup: %.2fx\n", scalarTime / vectorTime); | |
| if (vectorTime < scalarTime * 0.9) { | |
| System.out.println("✅ Likely vectorized! JIT is working!"); | |
| } else { | |
| System.out.println("❌ No significant vectorization detected"); | |
| } | |
| // Test 3: Manual unrolling | |
| start = System.nanoTime(); | |
| float[][] C3 = manualUnroll4(A, B); | |
| double unrollTime = (System.nanoTime() - start) / 1e9; | |
| System.out.printf("\n3. Manual unroll (4x): %.3f seconds (%.2fx)\n", unrollTime, scalarTime / unrollTime); | |
| // Test 4: Aggressive unrolling | |
| start = System.nanoTime(); | |
| float[][] C4 = manualUnroll8(A, B); | |
| double unroll8Time = (System.nanoTime() - start) / 1e9; | |
| System.out.printf("4. Manual unroll (8x): %.3f seconds (%.2fx)\n", unroll8Time, scalarTime / unroll8Time); | |
| } | |
| // Version 1: Hard to vectorize (dependencies) | |
| static float[][] scalarVersion(float[][] A, float[][] B) { | |
| int n = A.length; | |
| float[][] C = new float[n][n]; | |
| for (int i = 0; i < n; i++) { | |
| for (int j = 0; j < n; j++) { | |
| float sum = 0; | |
| for (int k = 0; k < n; k++) { | |
| sum += A[i][k] * B[k][j]; // Reduces to single value | |
| } | |
| C[i][j] = sum; | |
| } | |
| } | |
| return C; | |
| } | |
| // Version 2: Easy to vectorize (independent operations) | |
| static float[][] vectorFriendly(float[][] A, float[][] B) { | |
| int n = A.length; | |
| float[][] C = new float[n][n]; | |
| for (int i = 0; i < n; i++) { | |
| for (int k = 0; k < n; k++) { | |
| float aVal = A[i][k]; | |
| // This inner loop is vectorizable! | |
| // Each C[i][j] update is independent | |
| for (int j = 0; j < n; j++) { | |
| C[i][j] += aVal * B[k][j]; | |
| } | |
| } | |
| } | |
| return C; | |
| } | |
| // Version 3: Manual unroll 4x | |
| static float[][] manualUnroll4(float[][] A, float[][] B) { | |
| int n = A.length; | |
| float[][] C = new float[n][n]; | |
| for (int i = 0; i < n; i++) { | |
| for (int k = 0; k < n; k++) { | |
| float aVal = A[i][k]; | |
| int j = 0; | |
| // Process 4 at a time | |
| for (; j <= n - 4; j += 4) { | |
| C[i][j] += aVal * B[k][j]; | |
| C[i][j + 1] += aVal * B[k][j + 1]; | |
| C[i][j + 2] += aVal * B[k][j + 2]; | |
| C[i][j + 3] += aVal * B[k][j + 3]; | |
| } | |
| // Remainder | |
| for (; j < n; j++) { | |
| C[i][j] += aVal * B[k][j]; | |
| } | |
| } | |
| } | |
| return C; | |
| } | |
| // Version 4: Manual unroll 8x | |
| static float[][] manualUnroll8(float[][] A, float[][] B) { | |
| int n = A.length; | |
| float[][] C = new float[n][n]; | |
| for (int i = 0; i < n; i++) { | |
| for (int k = 0; k < n; k++) { | |
| float aVal = A[i][k]; | |
| int j = 0; | |
| // Process 8 at a time | |
| for (; j <= n - 8; j += 8) { | |
| C[i][j] += aVal * B[k][j]; | |
| C[i][j + 1] += aVal * B[k][j + 1]; | |
| C[i][j + 2] += aVal * B[k][j + 2]; | |
| C[i][j + 3] += aVal * B[k][j + 3]; | |
| C[i][j + 4] += aVal * B[k][j + 4]; | |
| C[i][j + 5] += aVal * B[k][j + 5]; | |
| C[i][j + 6] += aVal * B[k][j + 6]; | |
| C[i][j + 7] += aVal * B[k][j + 7]; | |
| } | |
| // Remainder | |
| for (; j < n; j++) { | |
| C[i][j] += aVal * B[k][j]; | |
| } | |
| } | |
| } | |
| return C; | |
| } | |
| static float[][] randomMatrix(int rows, int cols) { | |
| float[][] matrix = new float[rows][cols]; | |
| for (int i = 0; i < rows; i++) { | |
| for (int j = 0; j < cols; j++) { | |
| matrix[i][j] = (float) Math.random(); | |
| } | |
| } | |
| return matrix; | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment