matmul macos
const std = @import("std");
const builtin = @import("builtin");
const time = std.time;
const atomic = std.atomic;
const CACHE_LINE_SIZE: usize = atomic.cache_line;
const CHUNK_SIZE: usize = 1;
const NEON_ALIGNMENT = 16;
// For ARM NEON, we use 4-wide vectors as that's the native SIMD width
const Vec4f = @Vector(4, f32);
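// Work distribution: threads claim tile indices from a shared atomic counter.
// The counter is padded out to a full cache line so concurrent fetchAdd calls
// from different cores do not false-share with neighboring data.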
const ThreadLocalData = struct {
current_index: atomic.Value(usize) align(CACHE_LINE_SIZE),
_padding: [CACHE_LINE_SIZE - @sizeOf(atomic.Value(usize))]u8 = undefined,
};
// Main matrix multiplication logic remains similar, but with NEON-specific microkernel
pub fn tiledMatMul(
allocator: std.mem.Allocator,
A: []const f32,
B: []const f32,
C: []f32,
M: usize,
N: usize,
K: usize,
T: usize,
V: usize,
) !void {
const tiles_M = (M + T - 1) / T;
const tiles_N = (N + T - 1) / T;
const total_tiles = tiles_M * tiles_N;
var shared_data = ThreadLocalData{ .current_index = atomic.Value(usize).init(0) };
const num_threads = try std.Thread.getCpuCount();
// Allocate thread pool
var thread_pool = try std.ArrayList(std.Thread).initCapacity(allocator, num_threads);
defer thread_pool.deinit();
// Allocate local C buffers for each thread
const buffer_size = T * T;
var local_C_buffers = try allocator.alloc(f32, buffer_size * num_threads);
defer allocator.free(local_C_buffers);
// For NEON alignment
const aligned_T = (T + 3) & ~@as(usize, 3); // Align to 4-float boundary
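// e.g. T = 48 stays 48 and T = 50 rounds up to 52. With the search grid used in
// main() (tile sizes that are multiples of 4 by construction), aligned_T always
// equals T, so the T*T local_C buffer stays large enough for the aligned_T-strided
// indexing in the microkernels below.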
// Create and spawn threads
for (0..num_threads) |thread_id| {
const thread_buffer = local_C_buffers[thread_id * buffer_size .. (thread_id + 1) * buffer_size];
const context = ThreadContext{
.A = A,
.B = B,
.C = C,
.M = M,
.N = N,
.K = K,
.T = T,
.V = V,
.aligned_T = aligned_T,
.tiles_M = tiles_M,
.tiles_N = tiles_N,
.total_tiles = total_tiles,
.shared_counter = &shared_data,
.local_C = thread_buffer,
};
try thread_pool.append(try std.Thread.spawn(.{}, workerThread, .{context}));
}
// Wait for all threads to complete
for (thread_pool.items) |thread| thread.join();
}
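// A minimal usage sketch (added for illustration, not part of the original gist):
// tiledMatMul is called with an explicit tile size T and vector width V. Note that
// V is carried through ThreadContext but the NEON path always uses 4-wide vectors,
// so V = 4 is the only meaningful choice here. The 64x64x64 shape and T = 32 below
// are illustrative assumptions, not tuned values.
test "tiledMatMul usage sketch" {
    const allocator = std.testing.allocator;
    const M: usize = 64;
    const N: usize = 64;
    const K: usize = 64;
    const A = try allocator.alloc(f32, M * K);
    defer allocator.free(A);
    const B = try allocator.alloc(f32, K * N);
    defer allocator.free(B);
    const C = try allocator.alloc(f32, M * N);
    defer allocator.free(C);
    for (A) |*v| v.* = 1.0; // A is all ones
    for (B) |*v| v.* = 2.0; // B is all twos
    @memset(C, 0);
    try tiledMatMul(allocator, A, B, C, M, N, K, 32, 4);
    // Each output element is a K-term dot product of ones and twos: K * 2 = 128.
    for (C) |c| try std.testing.expectApproxEqAbs(@as(f32, 128.0), c, 1e-3);
}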
const ThreadContext = struct {
A: []const f32,
B: []const f32,
C: []f32,
M: usize,
N: usize,
K: usize,
T: usize,
V: usize,
aligned_T: usize,
tiles_M: usize,
tiles_N: usize,
total_tiles: usize,
shared_counter: *ThreadLocalData,
local_C: []f32,
};
fn workerThread(ctx: ThreadContext) void {
const max_tile_size = 256; // Maximum supported tile size for ARM
var A_local: [max_tile_size * max_tile_size]f32 align(16) = undefined;
var B_local: [max_tile_size * max_tile_size]f32 align(16) = undefined;
if (ctx.T > max_tile_size) {
@panic("Tile size exceeds maximum supported size");
}
while (true) {
const start_idx = ctx.shared_counter.current_index.fetchAdd(CHUNK_SIZE, .seq_cst);
if (start_idx >= ctx.total_tiles) break;
const end_idx = @min(start_idx + CHUNK_SIZE, ctx.total_tiles);
var idx = start_idx;
while (idx < end_idx) : (idx += 1) {
const i = idx / ctx.tiles_N;
const j = idx % ctx.tiles_N;
const i_start = i * ctx.T;
const j_start = j * ctx.T;
const i_end = @min(i_start + ctx.T, ctx.M);
const j_end = @min(j_start + ctx.T, ctx.N);
// Reset local_C
@memset(ctx.local_C, 0);
var k: usize = 0;
while (k < ctx.K) : (k += ctx.T) {
const k_end = @min(k + ctx.T, ctx.K);
if (builtin.cpu.arch == .aarch64) {
microKernelNEON(ctx, &A_local, &B_local, i_start, j_start, k, i_end, j_end, k_end);
} else {
microKernelScalar(ctx, &A_local, &B_local, i_start, j_start, k, i_end, j_end, k_end);
}
}
// Update global C using NEON vectorized accumulation when possible
for (i_start..i_end) |ii| {
const row_offset = ii * ctx.N;
const local_row = (ii - i_start) * ctx.aligned_T;
var j_idx = j_start;
while (j_idx + 4 <= j_end) : (j_idx += 4) {
if (builtin.cpu.arch == .aarch64) {
// Load 4 elements at once using NEON
const vec_idx = j_idx - j_start;
const c_vec = Vec4f{
ctx.local_C[local_row + vec_idx],
ctx.local_C[local_row + vec_idx + 1],
ctx.local_C[local_row + vec_idx + 2],
ctx.local_C[local_row + vec_idx + 3],
};
const dest_vec = Vec4f{
ctx.C[row_offset + j_idx],
ctx.C[row_offset + j_idx + 1],
ctx.C[row_offset + j_idx + 2],
ctx.C[row_offset + j_idx + 3],
};
const result = dest_vec + c_vec;
// Store result back
for (0..4) |offset| {
ctx.C[row_offset + j_idx + offset] = result[offset];
}
} else {
// Fallback scalar path
for (0..4) |offset| {
ctx.C[row_offset + j_idx + offset] +=
ctx.local_C[local_row + (j_idx - j_start) + offset];
}
}
}
// Handle remaining elements
while (j_idx < j_end) : (j_idx += 1) {
ctx.C[row_offset + j_idx] += ctx.local_C[local_row + (j_idx - j_start)];
}
}
}
}
}
fn microKernelScalar(
ctx: ThreadContext,
A_local: []f32,
B_local: []f32,
i_start: usize,
j_start: usize,
k_start: usize,
i_end: usize,
j_end: usize,
k_end: usize,
) void {
const k_size = k_end - k_start;
const i_size = i_end - i_start;
const j_size = j_end - j_start;
// Load tiles
for (0..i_size) |i| {
const src_idx = (i_start + i) * ctx.K + k_start;
const local_idx = i * ctx.aligned_T;
for (0..k_size) |k| {
A_local[local_idx + k] = ctx.A[src_idx + k];
}
}
for (0..k_size) |k| {
const src_idx = (k_start + k) * ctx.N + j_start;
const local_idx = k * ctx.aligned_T;
for (0..j_size) |j| {
B_local[local_idx + j] = ctx.B[src_idx + j];
}
}
// Compute using scalar operations
for (0..i_size) |i| {
for (0..j_size) |j| {
var sum: f32 = 0;
for (0..k_size) |k| {
sum = @mulAdd(f32, A_local[i * ctx.aligned_T + k], B_local[k * ctx.aligned_T + j], sum);
}
ctx.local_C[i * ctx.aligned_T + j] += sum;
}
}
}
fn microKernelNEON(
ctx: ThreadContext,
A_local: []f32,
B_local: []f32,
i_start: usize,
j_start: usize,
k_start: usize,
i_end: usize,
j_end: usize,
k_end: usize,
) void {
const k_size = k_end - k_start;
const i_size = i_end - i_start;
const j_size = j_end - j_start;
// Load tiles with NEON alignment
for (0..i_size) |i| {
const src_idx = (i_start + i) * ctx.K + k_start;
const local_idx = i * ctx.aligned_T;
var k: usize = 0;
// Vector load where possible
while (k + 4 <= k_size) : (k += 4) {
const vec = Vec4f{
ctx.A[src_idx + k],
ctx.A[src_idx + k + 1],
ctx.A[src_idx + k + 2],
ctx.A[src_idx + k + 3],
};
for (0..4) |offset| {
A_local[local_idx + k + offset] = vec[offset];
}
}
// Handle remaining elements
while (k < k_size) : (k += 1) {
A_local[local_idx + k] = ctx.A[src_idx + k];
}
}
// Load B tile with NEON alignment
for (0..k_size) |k| {
const src_idx = (k_start + k) * ctx.N + j_start;
const local_idx = k * ctx.aligned_T;
var j: usize = 0;
// Vector load where possible
while (j + 4 <= j_size) : (j += 4) {
const vec = Vec4f{
ctx.B[src_idx + j],
ctx.B[src_idx + j + 1],
ctx.B[src_idx + j + 2],
ctx.B[src_idx + j + 3],
};
for (0..4) |offset| {
B_local[local_idx + j + offset] = vec[offset];
}
}
// Handle remaining elements
while (j < j_size) : (j += 1) {
B_local[local_idx + j] = ctx.B[src_idx + j];
}
}
// Process 4x4 blocks using NEON
var i: usize = 0;
while (i + 4 <= i_size) : (i += 4) {
var j: usize = 0;
while (j + 4 <= j_size) : (j += 4) {
var acc: [4][4]f32 align(16) = [_][4]f32{[_]f32{0} ** 4} ** 4;
for (0..k_size) |k| {
const a_vec = Vec4f{
A_local[(i + 0) * ctx.aligned_T + k],
A_local[(i + 1) * ctx.aligned_T + k],
A_local[(i + 2) * ctx.aligned_T + k],
A_local[(i + 3) * ctx.aligned_T + k],
};
const b_vec = Vec4f{
B_local[k * ctx.aligned_T + j],
B_local[k * ctx.aligned_T + j + 1],
B_local[k * ctx.aligned_T + j + 2],
B_local[k * ctx.aligned_T + j + 3],
};
// NEON FMA operations
inline for (0..4) |bi| {
const a_broadcast: Vec4f = @splat(a_vec[bi]);
const c_vec = Vec4f{
acc[bi][0], acc[bi][1], acc[bi][2], acc[bi][3],
};
const prod = @mulAdd(Vec4f, a_broadcast, b_vec, c_vec);
inline for (0..4) |bj| {
acc[bi][bj] = prod[bj];
}
}
}
// Store accumulated results
for (0..4) |bi| {
for (0..4) |bj| {
ctx.local_C[(i + bi) * ctx.aligned_T + (j + bj)] += acc[bi][bj];
}
}
}
// Handle remaining columns
while (j < j_size) : (j += 1) {
for (0..4) |bi| {
var sum: f32 = 0;
for (0..k_size) |k| {
sum = @mulAdd(f32, A_local[(i + bi) * ctx.aligned_T + k], B_local[k * ctx.aligned_T + j], sum);
}
ctx.local_C[(i + bi) * ctx.aligned_T + j] += sum;
}
}
}
// Handle remaining rows
while (i < i_size) : (i += 1) {
for (0..j_size) |j| {
var sum: f32 = 0;
for (0..k_size) |k| {
sum = @mulAdd(f32, A_local[i * ctx.aligned_T + k], B_local[k * ctx.aligned_T + j], sum);
}
ctx.local_C[i * ctx.aligned_T + j] += sum;
}
}
}
const CpuInfo = struct {
total_cores: u32,
simd_width: u32,
supports_peak_calc: bool = false,
max_freq: ?u64 = null,
is_arm: bool,
};
fn getSIMDWidth() u32 {
if (builtin.cpu.arch == .aarch64) {
// ARM NEON typically works with 128-bit vectors (4 x float32)
return 4;
} else if (std.Target.x86.featureSetHas(builtin.cpu.features, .avx512f)) {
return 16;
} else if (std.Target.x86.featureSetHas(builtin.cpu.features, .avx2)) {
return 8;
} else if (std.Target.x86.featureSetHas(builtin.cpu.features, .sse2)) {
return 4;
} else {
return 1;
}
}
const SearchRange = struct {
t_min: usize,
t_max: usize,
t_step: usize,
v_sizes: []const usize,
};
pub const SearchResult = struct {
tile_size: usize,
vector_size: usize,
gflops: f64,
};
pub fn adaptiveGridSearch(
allocator: std.mem.Allocator,
M: usize,
N: usize,
K: usize,
initial_range: SearchRange,
refinement_levels: usize,
) !SearchResult {
var best_result = SearchResult{
.tile_size = 0,
.vector_size = 0,
.gflops = 0,
};
var current_range = initial_range;
for (0..refinement_levels) |level| {
std.debug.print("\nLevel {d} Search (T: {d}-{d}, step: {d})\n", .{ level, current_range.t_min, current_range.t_max, current_range.t_step });
std.debug.print("----------------------------------------\n", .{});
// Search current range
var results = std.ArrayList(SearchResult).init(allocator);
defer results.deinit();
var t = current_range.t_min;
while (t <= current_range.t_max) : (t += current_range.t_step) {
// Skip invalid tile sizes
if (t > M or t > N or t > K) continue;
for (current_range.v_sizes) |v| {
if (t % v != 0) continue; // Skip if tile size not divisible by vector size
// Run multiple trials and take best
var best_trial = try benchmarkMatMul(allocator, t, v, M, N, K);
for (0..2) |_| {
const trial = try benchmarkMatMul(allocator, t, v, M, N, K);
if (trial.gflops > best_trial.gflops) {
best_trial.gflops = trial.gflops;
}
}
try results.append(.{
.tile_size = t,
.vector_size = v,
.gflops = best_trial.gflops,
});
std.debug.print("T={d:<4} V={d:<3}: {d:.2} GFLOPS\n", .{ t, v, best_trial.gflops });
}
}
// Skip this level if no valid (T, V) combination was benchmarked
if (results.items.len == 0) continue;
// Find top performers
const top_k = 3; // Number of top results to consider
const top_results = try allocator.alloc(SearchResult, @min(top_k, results.items.len));
defer allocator.free(top_results);
// Sort results by GFLOPS (descending)
std.sort.pdq(SearchResult, results.items, {}, struct {
fn lessThan(_: void, a: SearchResult, b: SearchResult) bool {
return a.gflops > b.gflops;
}
}.lessThan);
// Copy top results
@memcpy(top_results, results.items[0..top_results.len]);
// Update best overall result
if (results.items[0].gflops > best_result.gflops) {
best_result = results.items[0];
}
// Print top performers for this level
std.debug.print("\nTop performers at level {d}:\n", .{level});
for (top_results, 0..) |result, i| {
std.debug.print("{d}. T={d:<4} V={d:<3}: {d:.2} GFLOPS\n", .{ i + 1, result.tile_size, result.vector_size, result.gflops });
}
// Refine search range around best result
if (level + 1 < refinement_levels) {
const best_T = top_results[0].tile_size;
const step = @max(current_range.t_step / 2, 1); // Never let the step collapse to zero
current_range = SearchRange{
.t_min = if (best_T > step * 2) best_T - step * 2 else current_range.t_min,
.t_max = @min(best_T + step * 2, current_range.t_max),
.t_step = step,
.v_sizes = current_range.v_sizes,
};
}
}
return best_result;
}
pub fn benchmarkMatMul(
allocator: std.mem.Allocator,
tile_size: usize,
vector_size: usize,
M: usize,
N: usize,
K: usize,
) !SearchResult {
const A = try allocator.alloc(f32, M * K);
defer allocator.free(A);
const B = try allocator.alloc(f32, K * N);
defer allocator.free(B);
const C = try allocator.alloc(f32, M * N);
defer allocator.free(C);
// Initialize matrices with random data
var rng = std.rand.DefaultPrng.init(0);
const random = rng.random();
for (A) |*val| val.* = random.float(f32);
for (B) |*val| val.* = random.float(f32);
@memset(C, 0);
// Warm up run
try tiledMatMul(allocator, A, B, C, M, N, K, tile_size, vector_size);
// Actual benchmark
const start_time = time.nanoTimestamp();
try tiledMatMul(allocator, A, B, C, M, N, K, tile_size, vector_size);
const end_time = time.nanoTimestamp();
const elapsed_ns: u64 = @intCast(end_time - start_time);
// Calculate GFLOPS
const operations = @as(f64, @floatFromInt(2 * M * N * K)); // 2 operations per multiply-add
const seconds = @as(f64, @floatFromInt(elapsed_ns)) / 1e9;
const gflops = operations / (seconds * 1e9);
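// Worked example: for M = N = K = 1024 the operation count is 2 * 1024^3 ≈ 2.15e9
// FLOPs, so a 25 ms run corresponds to roughly 2.15e9 / 0.025 / 1e9 ≈ 86 GFLOPS.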
return SearchResult{
.tile_size = tile_size,
.vector_size = vector_size,
.gflops = gflops,
};
}
fn getCpuFrequencies() !CpuInfo {
var info = CpuInfo{
.simd_width = getSIMDWidth(),
.total_cores = @intCast(try std.Thread.getCpuCount()),
.is_arm = (builtin.cpu.arch == .aarch64),
};
if (builtin.os.tag == .macos) {
// On macOS we can't easily query the CPU frequency, so use conservative
// per-architecture defaults.
if (info.is_arm) {
info.max_freq = 3_200_000_000; // 3.2 GHz default for M1/M2
} else {
info.max_freq = 3_000_000_000; // 3.0 GHz default for Intel
}
info.supports_peak_calc = true;
}
return info;
}
pub fn main() !void {
var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
defer arena.deinit();
const allocator = arena.allocator();
const sizes = [_][3]usize{
.{ 1024, 1024, 1024 },
// .{ 2048, 2048, 2048 },
// .{ 4096, 4096, 4096 },
};
const iterations = 5;
// Print system configuration
const cpu_info = try getCpuFrequencies();
std.debug.print("\nSystem Configuration:\n", .{});
std.debug.print("----------------------------------------\n", .{});
std.debug.print("Architecture: {s}\n", .{if (cpu_info.is_arm) "ARM" else "x86"});
std.debug.print("CPU Cores: {}\n", .{cpu_info.total_cores});
if (cpu_info.supports_peak_calc and cpu_info.max_freq != null) {
std.debug.print("CPU Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.max_freq.?)) / 1e9});
}
std.debug.print("SIMD Width: {}\n", .{cpu_info.simd_width});
std.debug.print("----------------------------------------\n\n", .{});
// Modified search range for ARM
const initial_range = SearchRange{
.t_min = 16, // Smaller minimum tile size for ARM
.t_max = 256, // Smaller maximum tile size due to different cache hierarchy
.t_step = 32,
.v_sizes = &[_]usize{4}, // Only use vector size 4 for NEON
};
for (sizes) |size| {
const M = size[0];
const N = size[1];
const K = size[2];
std.debug.print("\n========================================\n", .{});
std.debug.print("Matrix Size: {d}x{d}x{d}\n", .{ M, N, K });
std.debug.print("========================================\n", .{});
const best_config = try adaptiveGridSearch(allocator, M, N, K, initial_range, 4);
std.debug.print("\nBest configuration found:\n", .{});
std.debug.print("Tile Size (T): {d}\n", .{best_config.tile_size});
std.debug.print("Vector Size (V): {d}\n", .{best_config.vector_size});
std.debug.print("Performance: {d:.2} GFLOPS\n\n", .{best_config.gflops});
}
// Print summary
std.debug.print("\n========================================\n", .{});
std.debug.print("Benchmark Summary:\n", .{});
std.debug.print("----------------------------------------\n", .{});
std.debug.print("Total matrix sizes tested: {d}\n", .{sizes.len});
std.debug.print("Architecture: {s}\n", .{if (cpu_info.is_arm) "ARM (NEON)" else "x86"});
std.debug.print("Tree search levels: 4\n", .{});
std.debug.print("Iterations: {d}\n", .{iterations});
std.debug.print("========================================\n\n", .{});
}
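// ---------------------------------------------------------------------------
// Second file of the gist (apparently a standalone, fixed-parameter variant of
// the NEON matmul with its own benchmark harness and tests). The duplicated
// top-level declarations below are expected: this code is not meant to be
// compiled in the same file as the tuner above.
// ---------------------------------------------------------------------------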
const std = @import("std");
const builtin = @import("builtin");
const atomic = std.atomic;
///////////////////////////////////////////////////////////////////////////////
// Configuration Constants
///////////////////////////////////////////////////////////////////////////////
/// Tile size optimized for Apple Silicon cache hierarchy:
/// - L1D cache: 128KB (M1/M2/M3)
/// - L2 cache: 4MB-12MB (varies by model)
/// Larger tile size than Intel due to bigger L1 cache
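/// Note: a T x T f32 tile is T*T*4 bytes (256 KB at T = 256), so the three
/// per-thread scratch buffers (A_local, B_local, local_C) together occupy
/// roughly 768 KB and effectively target L2 rather than L1.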
const T: usize = 256;
/// Vector width for NEON SIMD (128 bits = 4 floats)
const V: usize = 4;
/// Size of CPU cache line in bytes
const CACHE_LINE_SIZE: usize = atomic.cache_line;
/// Number of tiles per atomic operation
const CHUNK_SIZE: usize = 1;
/// Alignment for NEON operations (128-bit = 16 bytes)
const NEON_ALIGNMENT = 16;
/// Vector type for 4 32-bit floats (NEON register size)
const Vec4f = @Vector(4, f32);
///////////////////////////////////////////////////////////////////////////////
// Thread Management Structures
///////////////////////////////////////////////////////////////////////////////
const ThreadLocalData = struct {
current_index: atomic.Value(usize) align(CACHE_LINE_SIZE),
_padding: [CACHE_LINE_SIZE - @sizeOf(atomic.Value(usize))]u8 = undefined,
};
const ThreadContext = struct {
A: []const f32,
B: []const f32,
C: []f32,
M: usize,
N: usize,
K: usize,
tiles_M: usize,
tiles_N: usize,
total_tiles: usize,
shared_counter: *ThreadLocalData,
};
///////////////////////////////////////////////////////////////////////////////
// Main Matrix Multiplication Function
///////////////////////////////////////////////////////////////////////////////
pub fn tiledMatMul(allocator: std.mem.Allocator, A: []const f32, B: []const f32, C: []f32, M: usize, N: usize, K: usize) !void {
const tiles_M = (M + T - 1) / T;
const tiles_N = (N + T - 1) / T;
const total_tiles = tiles_M * tiles_N;
var shared_data = ThreadLocalData{ .current_index = atomic.Value(usize).init(0) };
// Apple Silicon chips have high-performance and efficiency cores
// Use performance cores count for better results
const num_threads = @max(try std.Thread.getCpuCount() / 2, 1);
var thread_pool = try std.ArrayList(std.Thread).initCapacity(allocator, num_threads);
defer thread_pool.deinit();
for (0..num_threads) |_| {
const context = ThreadContext{
.A = A,
.B = B,
.C = C,
.M = M,
.N = N,
.K = K,
.tiles_M = tiles_M,
.tiles_N = tiles_N,
.total_tiles = total_tiles,
.shared_counter = &shared_data,
};
try thread_pool.append(try std.Thread.spawn(.{}, workerThread, .{context}));
}
for (thread_pool.items) |thread| thread.join();
}
///////////////////////////////////////////////////////////////////////////////
// Worker Thread Implementation
///////////////////////////////////////////////////////////////////////////////
fn workerThread(ctx: ThreadContext) void {
var local_C: [T][T]f32 align(NEON_ALIGNMENT) = undefined;
while (true) {
const start_idx = ctx.shared_counter.current_index.fetchAdd(CHUNK_SIZE, .seq_cst);
if (start_idx >= ctx.total_tiles) break;
const end_idx = @min(start_idx + CHUNK_SIZE, ctx.total_tiles);
var idx = start_idx;
while (idx < end_idx) : (idx += 1) {
const i = idx / ctx.tiles_N;
const j = idx % ctx.tiles_N;
const i_start = i * T;
const j_start = j * T;
const i_end = @min(i_start + T, ctx.M);
const j_end = @min(j_start + T, ctx.N);
for (0..T) |x| {
@memset(&local_C[x], 0);
}
var k: usize = 0;
while (k < ctx.K) : (k += T) {
const k_end = @min(k + T, ctx.K);
microKernelNEON(ctx, &local_C, i_start, j_start, k, i_end, j_end, k_end);
}
// Write results back using NEON vectorized stores
for (i_start..i_end) |ii| {
const row_offset = ii * ctx.N;
const local_row = ii - i_start;
var j_idx = j_start;
// Process 4 elements at a time using NEON
while (j_idx + 4 <= j_end) : (j_idx += 4) {
const vec_idx = j_idx - j_start;
// Load 4 elements into NEON vector
const c_vec = Vec4f{
local_C[local_row][vec_idx],
local_C[local_row][vec_idx + 1],
local_C[local_row][vec_idx + 2],
local_C[local_row][vec_idx + 3],
};
// Load destination elements
const dest_vec = Vec4f{
ctx.C[row_offset + j_idx],
ctx.C[row_offset + j_idx + 1],
ctx.C[row_offset + j_idx + 2],
ctx.C[row_offset + j_idx + 3],
};
// Add vectors and store results
const result = dest_vec + c_vec;
for (0..4) |offset| {
ctx.C[row_offset + j_idx + offset] = result[offset];
}
}
// Handle remaining elements
while (j_idx < j_end) : (j_idx += 1) {
ctx.C[row_offset + j_idx] += local_C[local_row][j_idx - j_start];
}
}
}
}
}
///////////////////////////////////////////////////////////////////////////////
// Micro-Kernel Implementation
///////////////////////////////////////////////////////////////////////////////
fn microKernelNEON(
ctx: ThreadContext,
local_C: *[T][T]f32,
i_start: usize,
j_start: usize,
k_start: usize,
i_end: usize,
j_end: usize,
k_end: usize,
) void {
var A_local: [T][T]f32 align(NEON_ALIGNMENT) = undefined;
var B_local: [T][T]f32 align(NEON_ALIGNMENT) = undefined;
const k_size = k_end - k_start;
const i_size = i_end - i_start;
const j_size = j_end - j_start;
// Load tiles with prefetching
for (0..i_size) |i| {
const src_idx = (i_start + i) * ctx.K + k_start;
for (0..k_size) |k| {
A_local[i][k] = ctx.A[src_idx + k];
}
}
for (0..k_size) |k| {
const src_idx = (k_start + k) * ctx.N + j_start;
for (0..j_size) |j| {
B_local[k][j] = ctx.B[src_idx + j];
}
}
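// Register-blocking scheme: for each k, the 4x1 column a_vec and the 1x4 row
// b_vec form a rank-1 (outer-product) update, acc[bi][bj] += a_vec[bi] * b_vec[bj],
// accumulated with fused multiply-adds over the whole k range before the 4x4
// block is flushed to local_C.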
// Process 4x4 blocks using NEON SIMD
var i: usize = 0;
while (i + 4 <= i_size) : (i += 4) {
var j: usize = 0;
while (j + 4 <= j_size) : (j += 4) {
var acc: [4][4]f32 align(NEON_ALIGNMENT) = [_][4]f32{[_]f32{0} ** 4} ** 4;
// Inner loop with NEON operations
var k: usize = 0;
while (k < k_size) : (k += 1) {
// Load 4 elements from A matrix
const a_vec = Vec4f{
A_local[i][k],
A_local[i + 1][k],
A_local[i + 2][k],
A_local[i + 3][k],
};
// Load 4 elements from B matrix
const b_vec = Vec4f{
B_local[k][j],
B_local[k][j + 1],
B_local[k][j + 2],
B_local[k][j + 3],
};
// Vectorized outer product
inline for (0..4) |bi| {
const a_broadcast: Vec4f = @splat(a_vec[bi]);
const c_vec = Vec4f{
acc[bi][0],
acc[bi][1],
acc[bi][2],
acc[bi][3],
};
const prod = @mulAdd(Vec4f, a_broadcast, b_vec, c_vec);
inline for (0..4) |bj| {
acc[bi][bj] = prod[bj];
}
}
}
// Store results
for (0..4) |bi| {
for (0..4) |bj| {
local_C[i + bi][j + bj] += acc[bi][bj];
}
}
}
// Handle edge case columns
while (j < j_size) : (j += 1) {
for (0..4) |bi| {
var sum: f32 = 0;
for (0..k_size) |k| {
sum = @mulAdd(f32, A_local[i + bi][k], B_local[k][j], sum);
}
local_C[i + bi][j] += sum;
}
}
}
// Handle edge case rows
while (i < i_size) : (i += 1) {
for (0..j_size) |j| {
var sum: f32 = 0;
for (0..k_size) |k| {
sum = @mulAdd(f32, A_local[i][k], B_local[k][j], sum);
}
local_C[i][j] += sum;
}
}
}
/// ------ BENCHMARKING AND PERFORMANCE ANALYSIS ------ ///
const CpuInfo = struct {
p_freq: ?u64 = null,
e_freq: ?u64 = null,
p_cores: ?u32 = null,
e_cores: ?u32 = null,
simd_width: u32,
total_cores: u32,
supports_peak_calc: bool,
};
fn getSIMDWidth() u32 {
if (builtin.cpu.arch == .aarch64) {
// ARM NEON works on 128-bit vectors (4 x f32); x86 feature queries below
// are meaningless on aarch64
return 4;
} else if (std.Target.x86.featureSetHas(builtin.cpu.features, .avx512f)) {
return 16;
} else if (std.Target.x86.featureSetHas(builtin.cpu.features, .avx2)) {
return 8;
} else if (std.Target.x86.featureSetHas(builtin.cpu.features, .sse2)) {
return 4;
} else {
return 1;
}
}
fn getCpuFrequencies() !CpuInfo {
var info = CpuInfo{
.simd_width = getSIMDWidth(),
.total_cores = @intCast(try std.Thread.getCpuCount()),
.supports_peak_calc = false,
};
switch (builtin.os.tag) {
.linux => {
// Try reading hybrid architecture core mapping
if (std.fs.openFileAbsolute("/sys/devices/cpu_core/cpus", .{})) |p_core_file| {
defer p_core_file.close();
var buf: [32]u8 = undefined;
if (p_core_file.reader().readAll(&buf)) |bytes_read| {
const p_cores_str = buf[0..bytes_read];
// Count cores from range (e.g. "0-7" means 8 threads)
if (std.mem.indexOf(u8, p_cores_str, "-")) |dash_idx| {
const end_thread = try std.fmt.parseInt(u32, std.mem.trim(u8, p_cores_str[dash_idx + 1 ..], " \n\r"), 10);
const start_thread = try std.fmt.parseInt(u32, p_cores_str[0..dash_idx], 10);
info.p_cores = @divFloor(end_thread - start_thread + 1, 2); // Divide by 2 for hyperthreading
}
} else |_| {}
} else |_| {}
if (std.fs.openFileAbsolute("/sys/devices/cpu_atom/cpus", .{})) |e_core_file| {
defer e_core_file.close();
var buf: [32]u8 = undefined;
if (e_core_file.reader().readAll(&buf)) |bytes_read| {
const e_cores_str = buf[0..bytes_read];
// Count cores from range (e.g. "8-15" means 8 threads)
if (std.mem.indexOf(u8, e_cores_str, "-")) |dash_idx| {
const end_thread = try std.fmt.parseInt(u32, std.mem.trim(u8, e_cores_str[dash_idx + 1 ..], " \n\r"), 10);
const start_thread = try std.fmt.parseInt(u32, e_cores_str[0..dash_idx], 10);
info.e_cores = end_thread - start_thread + 1; // E-cores don't have hyperthreading
}
} else |_| {}
} else |_| {}
// Try reading frequencies for P-cores
if (info.p_cores != null) {
if (std.fs.openFileAbsolute("/sys/devices/cpu_core/frequency", .{})) |freq_file| {
defer freq_file.close();
var buf: [32]u8 = undefined;
if (freq_file.reader().readAll(&buf)) |bytes_read| {
if (std.fmt.parseInt(u64, std.mem.trim(u8, buf[0..bytes_read], " \n\r"), 10)) |freq| {
info.p_freq = freq * 1000; // Convert kHz to Hz
} else |_| {
info.p_freq = 4_500_000_000; // Default 4.5 GHz for P-cores
}
} else |_| {}
} else |_| {
info.p_freq = 4_500_000_000; // Default 4.5 GHz for P-cores
}
}
// Try reading frequencies for E-cores
if (info.e_cores != null) {
if (std.fs.openFileAbsolute("/sys/devices/cpu_atom/frequency", .{})) |freq_file| {
defer freq_file.close();
var buf: [32]u8 = undefined;
if (freq_file.reader().readAll(&buf)) |bytes_read| {
if (std.fmt.parseInt(u64, std.mem.trim(u8, buf[0..bytes_read], " \n\r"), 10)) |freq| {
info.e_freq = freq * 1000; // Convert kHz to Hz
} else |_| {
info.e_freq = 3_500_000_000; // Default 3.5 GHz for E-cores
}
} else |_| {}
} else |_| {
info.e_freq = 3_500_000_000; // Default 3.5 GHz for E-cores
}
}
// If we detected any cores, we can do peak calculation
info.supports_peak_calc = (info.p_cores != null and info.p_freq != null) or
(info.e_cores != null and info.e_freq != null);
// If we couldn't detect anything, fall back to treating all cores as P-cores
if (!info.supports_peak_calc) {
info.p_cores = info.total_cores;
info.e_cores = 0;
info.p_freq = 3_500_000_000; // Conservative 3.5 GHz default
info.supports_peak_calc = true;
std.debug.print("Warning: Could not detect hybrid architecture. Assuming all P-cores at 3.5 GHz\n", .{});
}
},
.windows, .macos => {
info.p_cores = info.total_cores;
info.e_cores = 0;
info.supports_peak_calc = false;
},
else => {
@compileError("Unsupported operating system");
},
}
return info;
}
fn calculateTheoreticalPeak(info: CpuInfo) ?f64 {
if (!info.supports_peak_calc) {
return null;
}
const p_core_gflops = if (info.p_freq != null and info.p_cores != null)
(@as(f64, @floatFromInt(info.p_freq.?)) / 1e9) *
@as(f64, @floatFromInt(info.p_cores.? * info.simd_width * 2))
else
0;
const e_core_gflops = if (info.e_freq != null and info.e_cores != null)
(@as(f64, @floatFromInt(info.e_freq.?)) / 1e9) *
@as(f64, @floatFromInt(info.e_cores.? * info.simd_width * 2))
else
0;
return p_core_gflops + e_core_gflops;
}
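// Worked example with illustrative numbers: 8 P-cores at 4.5 GHz with an 8-wide
// FMA unit give 4.5 * 8 * 8 * 2 = 576 GFLOPS; detected E-cores contribute an
// additional e_freq(GHz) * e_cores * simd_width * 2 on top of that.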
pub fn calculatePreciseGflops(allocator: std.mem.Allocator, M: usize, N: usize, K: usize, iterations: usize) !void {
const A = try allocator.alloc(f32, M * K);
defer allocator.free(A);
const B = try allocator.alloc(f32, K * N);
defer allocator.free(B);
const C = try allocator.alloc(f32, M * N);
defer allocator.free(C);
// Initialize matrices
var rng = std.rand.DefaultPrng.init(@intCast(std.time.timestamp()));
const random = rng.random();
for (A) |*val| {
val.* = random.float(f32) * 2.0 - 1.0;
}
for (B) |*val| {
val.* = random.float(f32) * 2.0 - 1.0;
}
@memset(C, 0);
// Warmup run
try tiledMatMul(allocator, A, B, C, M, N, K);
// Track performance metrics
const ops_per_element = 2 * K - 1;
const total_elements = M * N;
const flops_per_iteration = total_elements * ops_per_element;
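// e.g. for K = 1024 each output element costs 1024 multiplies and 1023 adds,
// i.e. 2K - 1 = 2047 FLOPs per element (slightly below the common 2*K estimate).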
var metrics = try allocator.alloc(struct {
gflops: f64,
time_ms: f64,
}, iterations);
defer allocator.free(metrics);
// Main measurement loop
var timer = try std.time.Timer.start();
const total_start = timer.read();
for (0..iterations) |i| {
@memset(C, 0);
const iter_start = timer.read();
try tiledMatMul(allocator, A, B, C, M, N, K);
const iter_end = timer.read();
const iter_time_ns = iter_end - iter_start;
const iter_time_ms = @as(f64, @floatFromInt(iter_time_ns)) / 1e6;
const iter_gflops = @as(f64, @floatFromInt(flops_per_iteration)) / (@as(f64, @floatFromInt(iter_time_ns)) / 1e9) / 1e9;
metrics[i] = .{
.gflops = iter_gflops,
.time_ms = iter_time_ms,
};
}
// Calculate statistics
const total_time_ns = timer.read() - total_start;
const total_time_sec = @as(f64, @floatFromInt(total_time_ns)) / 1e9;
var min_gflops: f64 = metrics[0].gflops;
var max_gflops: f64 = metrics[0].gflops;
var total_gflops: f64 = 0;
var first_quarter_avg: f64 = 0;
var last_quarter_avg: f64 = 0;
const quarter_size = iterations / 4;
for (metrics, 0..) |m, i| {
min_gflops = @min(min_gflops, m.gflops);
max_gflops = @max(max_gflops, m.gflops);
total_gflops += m.gflops;
if (i < quarter_size) {
first_quarter_avg += m.gflops;
} else if (i >= iterations - quarter_size) {
last_quarter_avg += m.gflops;
}
}
first_quarter_avg /= @as(f64, @floatFromInt(quarter_size));
last_quarter_avg /= @as(f64, @floatFromInt(quarter_size));
const avg_gflops = total_gflops / @as(f64, @floatFromInt(iterations));
const perf_change_pct = ((last_quarter_avg - first_quarter_avg) / first_quarter_avg) * 100.0;
// Get CPU info and theoretical peak
const cpu_info = try getCpuFrequencies();
// Print detailed performance report
std.debug.print("\nPerformance Analysis for {d}x{d}x{d} Matrix:\n", .{ M, N, K });
std.debug.print("----------------------------------------\n", .{});
std.debug.print("Peak Performance: {d:.2} GFLOPS\n", .{max_gflops});
std.debug.print("Lowest Performance: {d:.2} GFLOPS\n", .{min_gflops});
std.debug.print("Average Performance: {d:.2} GFLOPS\n", .{avg_gflops});
std.debug.print("First Quarter Avg: {d:.2} GFLOPS\n", .{first_quarter_avg});
std.debug.print("Last Quarter Avg: {d:.2} GFLOPS\n", .{last_quarter_avg});
if (perf_change_pct >= 0) {
std.debug.print("Performance Change: {d:.1}% (improvement)\n", .{perf_change_pct});
} else {
std.debug.print("Performance Change: {d:.1}% (degradation)\n", .{perf_change_pct});
}
std.debug.print("Total Runtime: {d:.3} seconds\n", .{total_time_sec});
// CPU information and theoretical peak
std.debug.print("\nCPU Configuration:\n", .{});
std.debug.print("Total Cores: {d}\n", .{cpu_info.total_cores});
if (cpu_info.p_cores != null) {
std.debug.print("P-Cores: {d}\n", .{cpu_info.p_cores.?});
}
if (cpu_info.e_cores != null) {
std.debug.print("E-Cores: {d}\n", .{cpu_info.e_cores.?});
}
std.debug.print("SIMD Width: {d} floats\n", .{cpu_info.simd_width});
if (cpu_info.supports_peak_calc) {
if (cpu_info.p_freq != null) {
std.debug.print("P-Core Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.p_freq.?)) / 1e9});
}
if (cpu_info.e_freq != null) {
std.debug.print("E-Core Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.e_freq.?)) / 1e9});
}
if (calculateTheoreticalPeak(cpu_info)) |peak| {
std.debug.print("\nTheoretical Peak: {d:.2} GFLOPS\n", .{peak});
if (peak > 0) {
std.debug.print("Peak Efficiency: {d:.1}% of theoretical\n", .{(max_gflops / peak) * 100.0});
}
}
} else {
std.debug.print("\nNote: Theoretical peak calculation not supported on this OS\n", .{});
}
// Print performance distribution
std.debug.print("\nPerformance Distribution:\n", .{});
const distribution_width = 60;
for (metrics) |m| {
const normalized = (m.gflops - min_gflops) / (max_gflops - min_gflops);
const stars = @as(usize, @intFromFloat(normalized * @as(f64, @floatFromInt(distribution_width))));
std.debug.print("{d:.2} GFLOPS |", .{m.gflops});
for (0..stars) |_| {
std.debug.print("*", .{});
}
std.debug.print("\n", .{});
}
}
pub fn main() !void {
var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
defer arena.deinit();
const allocator = arena.allocator();
const sizes = [_][3]usize{
// Comment out sizes to shorten the run
.{ 256, 256, 256 },
.{ 512, 512, 512 },
.{ 1024, 1024, 1024 },
.{ 1024, 2048, 1024 },
.{ 1500, 1500, 1500 },
.{ 2048, 2048, 2048 },
};
const iterations = 10;
// Print system configuration
const cpu_info = try getCpuFrequencies();
std.debug.print("\nSystem Configuration:\n", .{});
std.debug.print("----------------------------------------\n", .{});
std.debug.print("Tile size (T): {}\n", .{T});
std.debug.print("Vector size (V): {}\n", .{V});
std.debug.print("CPU Cores: {}\n", .{cpu_info.total_cores});
if (cpu_info.supports_peak_calc) {
if (cpu_info.p_freq != null) {
std.debug.print("P-Core Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.p_freq.?)) / 1e9});
}
if (cpu_info.e_freq != null) {
std.debug.print("E-Core Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.e_freq.?)) / 1e9});
}
}
std.debug.print("AVX2 Vector Width: {}\n", .{@sizeOf(Vec4f) / @sizeOf(f32)});
std.debug.print("Iterations per test: {}\n", .{iterations});
std.debug.print("----------------------------------------\n\n", .{});
// Run benchmarks with increasing matrix sizes
for (sizes) |size| {
const M = size[0];
const N = size[1];
const K = size[2];
std.debug.print("\n========================================\n", .{});
std.debug.print("Running benchmark for {d}x{d}x{d} matrices\n", .{ M, N, K });
std.debug.print("========================================\n", .{});
try calculatePreciseGflops(allocator, M, N, K, iterations);
// Add a small delay between tests to let CPU cool down
std.time.sleep(1 * std.time.ns_per_s);
}
// Print summary
std.debug.print("\n========================================\n", .{});
std.debug.print("Benchmark Summary:\n", .{});
std.debug.print("----------------------------------------\n", .{});
std.debug.print("Total matrix sizes tested: {d}\n", .{sizes.len});
std.debug.print("Total iterations per size: {d}\n", .{iterations});
std.debug.print("AVX2 SIMD utilized: Yes\n", .{});
std.debug.print("Tiled algorithm: Yes ({}x{})\n", .{ T, T });
std.debug.print("========================================\n\n", .{});
}
// Additional test for edge cases
test "tiledMatMul edge cases" {
const allocator = std.testing.allocator;
// Test with sizes that are not multiples of tile size
const edge_sizes = [_][3]usize{
.{ T + 1, T - 1, T + 2 }, // Around tile size
.{ T * 2 - 1, T * 2 + 1, T * 2 }, // Around double tile size
.{ 1, T, T }, // Single row
.{ T, 1, T }, // Single column
.{ T, T, 1 }, // Single depth
};
for (edge_sizes) |size| {
const M = size[0];
const N = size[1];
const K = size[2];
const A = try allocator.alloc(f32, M * K);
defer allocator.free(A);
const B = try allocator.alloc(f32, K * N);
defer allocator.free(B);
const C = try allocator.alloc(f32, M * N);
defer allocator.free(C);
const C_ref = try allocator.alloc(f32, M * N);
defer allocator.free(C_ref);
// Initialize matrices
for (A, 0..) |*val, i| {
val.* = @floatFromInt(i % 7);
}
for (B, 0..) |*val, i| {
val.* = @floatFromInt((i + 1) % 5);
}
@memset(C, 0);
@memset(C_ref, 0);
// Perform multiplications
try tiledMatMul(allocator, A, B, C, M, N, K);
// Reference calculation
for (0..M) |i| {
for (0..N) |j| {
var sum: f32 = 0;
for (0..K) |k| {
sum += A[i * K + k] * B[k * N + j];
}
C_ref[i * N + j] = sum;
}
}
// Compare results with higher tolerance for edge cases
for (C, C_ref) |c, c_ref| {
try std.testing.expectApproxEqAbs(c_ref, c, 1e-4); // (expected, actual, tolerance)
}
std.debug.print("Edge case test passed for size: M={}, N={}, K={}\n", .{ M, N, K });
}
}
// Example usage
test "matrix multiplication" {
const allocator = std.testing.allocator;
const M: usize = 1024;
const N: usize = 1024;
const K: usize = 1024;
const A = try allocator.alloc(f32, M * K);
defer allocator.free(A);
const B = try allocator.alloc(f32, K * N);
defer allocator.free(B);
const C = try allocator.alloc(f32, M * N);
defer allocator.free(C);
// Initialize matrices
for (0..M * K) |i| {
A[i] = @floatFromInt(i % 7);
}
for (0..K * N) |i| {
B[i] = @floatFromInt(i % 5);
}
@memset(C, 0);
try tiledMatMul(allocator, A, B, C, M, N, K);
}
// Test function
test "tiledMatMul correctness" {
const allocator = std.testing.allocator;
const test_sizes = [_][3]usize{
.{ 128, 128, 128 },
.{ 100, 100, 100 },
.{ 200, 150, 175 },
.{ 32, 64, 48 },
.{ 47, 34, 45 },
};
for (test_sizes) |size| {
const M = size[0];
const N = size[1];
const K = size[2];
const A = try allocator.alloc(f32, M * K);
defer allocator.free(A);
const B = try allocator.alloc(f32, K * N);
defer allocator.free(B);
const C = try allocator.alloc(f32, M * N);
defer allocator.free(C);
const C_ref = try allocator.alloc(f32, M * N);
defer allocator.free(C_ref);
// Initialize matrices
for (A, 0..) |*val, i| {
val.* = @floatFromInt(i % 10);
}
for (B, 0..) |*val, i| {
val.* = @floatFromInt((i + 1) % 10);
}
@memset(C, 0);
@memset(C_ref, 0);
// Perform tiled matrix multiplication
try tiledMatMul(allocator, A, B, C, M, N, K);
// Perform reference matrix multiplication
for (0..M) |i| {
for (0..N) |j| {
var sum: f32 = 0;
for (0..K) |k| {
sum += A[i * K + k] * B[k * N + j];
}
C_ref[i * N + j] = sum;
}
}
// Compare results
for (C, C_ref) |c, c_ref| {
try std.testing.expectApproxEqAbs(c_ref, c, 1e-5); // (expected, actual, tolerance)
}
std.debug.print("Test passed for size: M={}, N={}, K={}\n", .{ M, N, K });
}
}