matmul macos
const std = @import("std");
const builtin = @import("builtin");
const time = std.time;
const atomic = std.atomic;

const CACHE_LINE_SIZE: usize = atomic.cache_line;
const CHUNK_SIZE: usize = 1;
const NEON_ALIGNMENT = 16;

// For ARM NEON, we use 4-wide vectors as that's the native SIMD width
const Vec4f = @Vector(4, f32);

const ThreadLocalData = struct {
    current_index: atomic.Value(usize) align(CACHE_LINE_SIZE),
    _padding: [CACHE_LINE_SIZE - @sizeOf(atomic.Value(usize))]u8 = undefined,
};
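
// Sanity-check sketch (not part of the original gist): the padding above is
// meant to keep the shared counter on its own cache line so worker threads
// hammering fetchAdd don't false-share with neighboring data.
comptime {
    std.debug.assert(@sizeOf(ThreadLocalData) == CACHE_LINE_SIZE);
}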
// Main matrix multiplication logic remains similar, but with a NEON-specific microkernel
pub fn tiledMatMul(
    allocator: std.mem.Allocator,
    A: []const f32,
    B: []const f32,
    C: []f32,
    M: usize,
    N: usize,
    K: usize,
    T: usize,
    V: usize,
) !void {
    const tiles_M = (M + T - 1) / T;
    const tiles_N = (N + T - 1) / T;
    const total_tiles = tiles_M * tiles_N;
    var shared_data = ThreadLocalData{ .current_index = atomic.Value(usize).init(0) };
    const num_threads = try std.Thread.getCpuCount();
    // Allocate thread pool
    var thread_pool = try std.ArrayList(std.Thread).initCapacity(allocator, num_threads);
    defer thread_pool.deinit();
    // For NEON alignment: round the tile width up to a 4-float boundary
    // (e.g. T = 50 gives aligned_T = 52; a multiple of 4 is unchanged)
    const aligned_T = (T + 3) & ~@as(usize, 3);
    // Allocate local C buffers for each thread. Rows within a buffer are
    // aligned_T floats apart, so size by T * aligned_T; T * T would be too
    // small whenever T is not a multiple of 4.
    const buffer_size = T * aligned_T;
    const local_C_buffers = try allocator.alloc(f32, buffer_size * num_threads);
    defer allocator.free(local_C_buffers);
    // Create and spawn threads
    for (0..num_threads) |thread_id| {
        const thread_buffer = local_C_buffers[thread_id * buffer_size .. (thread_id + 1) * buffer_size];
        const context = ThreadContext{
            .A = A,
            .B = B,
            .C = C,
            .M = M,
            .N = N,
            .K = K,
            .T = T,
            .V = V,
            .aligned_T = aligned_T,
            .tiles_M = tiles_M,
            .tiles_N = tiles_N,
            .total_tiles = total_tiles,
            .shared_counter = &shared_data,
            .local_C = thread_buffer,
        };
        try thread_pool.append(try std.Thread.spawn(.{}, workerThread, .{context}));
    }
    // Wait for all threads to complete
    for (thread_pool.items) |thread| thread.join();
}
const ThreadContext = struct {
    A: []const f32,
    B: []const f32,
    C: []f32,
    M: usize,
    N: usize,
    K: usize,
    T: usize,
    V: usize, // requested vector width; the kernels below are hard-coded 4-wide NEON
    aligned_T: usize,
    tiles_M: usize,
    tiles_N: usize,
    total_tiles: usize,
    shared_counter: *ThreadLocalData,
    local_C: []f32,
};
fn workerThread(ctx: ThreadContext) void {
    const max_tile_size = 256; // Maximum supported tile size for ARM
    var A_local: [max_tile_size * max_tile_size]f32 align(16) = undefined;
    var B_local: [max_tile_size * max_tile_size]f32 align(16) = undefined;
    if (ctx.T > max_tile_size) {
        @panic("Tile size exceeds maximum supported size");
    }
    while (true) {
        const start_idx = ctx.shared_counter.current_index.fetchAdd(CHUNK_SIZE, .seq_cst);
        if (start_idx >= ctx.total_tiles) break;
        const end_idx = @min(start_idx + CHUNK_SIZE, ctx.total_tiles);
        var idx = start_idx;
        while (idx < end_idx) : (idx += 1) {
            const i = idx / ctx.tiles_N;
            const j = idx % ctx.tiles_N;
            const i_start = i * ctx.T;
            const j_start = j * ctx.T;
            const i_end = @min(i_start + ctx.T, ctx.M);
            const j_end = @min(j_start + ctx.T, ctx.N);
            // Reset local_C
            @memset(ctx.local_C, 0);
            var k: usize = 0;
            while (k < ctx.K) : (k += ctx.T) {
                const k_end = @min(k + ctx.T, ctx.K);
                if (builtin.cpu.arch == .aarch64) {
                    microKernelNEON(ctx, &A_local, &B_local, i_start, j_start, k, i_end, j_end, k_end);
                } else {
                    microKernelScalar(ctx, &A_local, &B_local, i_start, j_start, k, i_end, j_end, k_end);
                }
            }
            // Update global C using NEON vectorized accumulation when possible
            for (i_start..i_end) |ii| {
                const row_offset = ii * ctx.N;
                const local_row = (ii - i_start) * ctx.aligned_T;
                var j_idx = j_start;
                while (j_idx + 4 <= j_end) : (j_idx += 4) {
                    if (builtin.cpu.arch == .aarch64) {
                        // Load 4 elements at once using NEON
                        const vec_idx = j_idx - j_start;
                        const c_vec = Vec4f{
                            ctx.local_C[local_row + vec_idx],
                            ctx.local_C[local_row + vec_idx + 1],
                            ctx.local_C[local_row + vec_idx + 2],
                            ctx.local_C[local_row + vec_idx + 3],
                        };
                        const dest_vec = Vec4f{
                            ctx.C[row_offset + j_idx],
                            ctx.C[row_offset + j_idx + 1],
                            ctx.C[row_offset + j_idx + 2],
                            ctx.C[row_offset + j_idx + 3],
                        };
                        const result = dest_vec + c_vec;
                        // Store the result back
                        for (0..4) |offset| {
                            ctx.C[row_offset + j_idx + offset] = result[offset];
                        }
                    } else {
                        // Fallback scalar path
                        for (0..4) |offset| {
                            ctx.C[row_offset + j_idx + offset] +=
                                ctx.local_C[local_row + (j_idx - j_start) + offset];
                        }
                    }
                }
                // Handle remaining elements
                while (j_idx < j_end) : (j_idx += 1) {
                    ctx.C[row_offset + j_idx] += ctx.local_C[local_row + (j_idx - j_start)];
                }
            }
        }
    }
}
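
// A minimal sketch (not part of the original gist) of the work-claiming
// pattern above: each fetchAdd hands the caller a unique, disjoint chunk of
// flat tile indices, which is what lets all threads share one tile queue
// without locks.
test "atomic work counter hands out disjoint chunks" {
    var counter = atomic.Value(usize).init(0);
    const first = counter.fetchAdd(CHUNK_SIZE, .seq_cst);
    const second = counter.fetchAdd(CHUNK_SIZE, .seq_cst);
    try std.testing.expectEqual(@as(usize, 0), first);
    try std.testing.expectEqual(CHUNK_SIZE, second);
}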
fn microKernelScalar(
    ctx: ThreadContext,
    A_local: []f32,
    B_local: []f32,
    i_start: usize,
    j_start: usize,
    k_start: usize,
    i_end: usize,
    j_end: usize,
    k_end: usize,
) void {
    const k_size = k_end - k_start;
    const i_size = i_end - i_start;
    const j_size = j_end - j_start;
    // Load tiles
    for (0..i_size) |i| {
        const src_idx = (i_start + i) * ctx.K + k_start;
        const local_idx = i * ctx.aligned_T;
        for (0..k_size) |k| {
            A_local[local_idx + k] = ctx.A[src_idx + k];
        }
    }
    for (0..k_size) |k| {
        const src_idx = (k_start + k) * ctx.N + j_start;
        const local_idx = k * ctx.aligned_T;
        for (0..j_size) |j| {
            B_local[local_idx + j] = ctx.B[src_idx + j];
        }
    }
    // Compute using scalar operations
    for (0..i_size) |i| {
        for (0..j_size) |j| {
            var sum: f32 = 0;
            for (0..k_size) |k| {
                sum = @mulAdd(f32, A_local[i * ctx.aligned_T + k], B_local[k * ctx.aligned_T + j], sum);
            }
            ctx.local_C[i * ctx.aligned_T + j] += sum;
        }
    }
}
fn microKernelNEON(
    ctx: ThreadContext,
    A_local: []f32,
    B_local: []f32,
    i_start: usize,
    j_start: usize,
    k_start: usize,
    i_end: usize,
    j_end: usize,
    k_end: usize,
) void {
    const k_size = k_end - k_start;
    const i_size = i_end - i_start;
    const j_size = j_end - j_start;
    // Load the A tile with NEON alignment
    for (0..i_size) |i| {
        const src_idx = (i_start + i) * ctx.K + k_start;
        const local_idx = i * ctx.aligned_T;
        var k: usize = 0;
        // Vector load where possible
        while (k + 4 <= k_size) : (k += 4) {
            const vec = Vec4f{
                ctx.A[src_idx + k],
                ctx.A[src_idx + k + 1],
                ctx.A[src_idx + k + 2],
                ctx.A[src_idx + k + 3],
            };
            for (0..4) |offset| {
                A_local[local_idx + k + offset] = vec[offset];
            }
        }
        // Handle remaining elements
        while (k < k_size) : (k += 1) {
            A_local[local_idx + k] = ctx.A[src_idx + k];
        }
    }
    // Load the B tile with NEON alignment
    for (0..k_size) |k| {
        const src_idx = (k_start + k) * ctx.N + j_start;
        const local_idx = k * ctx.aligned_T;
        var j: usize = 0;
        // Vector load where possible
        while (j + 4 <= j_size) : (j += 4) {
            const vec = Vec4f{
                ctx.B[src_idx + j],
                ctx.B[src_idx + j + 1],
                ctx.B[src_idx + j + 2],
                ctx.B[src_idx + j + 3],
            };
            for (0..4) |offset| {
                B_local[local_idx + j + offset] = vec[offset];
            }
        }
        // Handle remaining elements
        while (j < j_size) : (j += 1) {
            B_local[local_idx + j] = ctx.B[src_idx + j];
        }
    }
    // Process 4x4 blocks using NEON
    var i: usize = 0;
    while (i + 4 <= i_size) : (i += 4) {
        var j: usize = 0;
        while (j + 4 <= j_size) : (j += 4) {
            var acc: [4][4]f32 align(16) = [_][4]f32{[_]f32{0} ** 4} ** 4;
            for (0..k_size) |k| {
                const a_vec = Vec4f{
                    A_local[(i + 0) * ctx.aligned_T + k],
                    A_local[(i + 1) * ctx.aligned_T + k],
                    A_local[(i + 2) * ctx.aligned_T + k],
                    A_local[(i + 3) * ctx.aligned_T + k],
                };
                const b_vec = Vec4f{
                    B_local[k * ctx.aligned_T + j],
                    B_local[k * ctx.aligned_T + j + 1],
                    B_local[k * ctx.aligned_T + j + 2],
                    B_local[k * ctx.aligned_T + j + 3],
                };
                // NEON FMA operations
                inline for (0..4) |bi| {
                    const a_broadcast: Vec4f = @splat(a_vec[bi]);
                    const c_vec = Vec4f{
                        acc[bi][0], acc[bi][1], acc[bi][2], acc[bi][3],
                    };
                    const prod = @mulAdd(Vec4f, a_broadcast, b_vec, c_vec);
                    inline for (0..4) |bj| {
                        acc[bi][bj] = prod[bj];
                    }
                }
            }
            // Store accumulated results
            for (0..4) |bi| {
                for (0..4) |bj| {
                    ctx.local_C[(i + bi) * ctx.aligned_T + (j + bj)] += acc[bi][bj];
                }
            }
        }
        // Handle remaining columns
        while (j < j_size) : (j += 1) {
            for (0..4) |bi| {
                var sum: f32 = 0;
                for (0..k_size) |k| {
                    sum = @mulAdd(f32, A_local[(i + bi) * ctx.aligned_T + k], B_local[k * ctx.aligned_T + j], sum);
                }
                ctx.local_C[(i + bi) * ctx.aligned_T + j] += sum;
            }
        }
    }
    // Handle remaining rows
    while (i < i_size) : (i += 1) {
        for (0..j_size) |j| {
            var sum: f32 = 0;
            for (0..k_size) |k| {
                sum = @mulAdd(f32, A_local[i * ctx.aligned_T + k], B_local[k * ctx.aligned_T + j], sum);
            }
            ctx.local_C[i * ctx.aligned_T + j] += sum;
        }
    }
}
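
// A small sketch (not part of the original gist) isolating the rank-1 update
// at the heart of the 4x4 microkernel above: each lane of `a` is broadcast and
// fused-multiplied against `b`, accumulating one row of the 4x4 block, so that
// after the loop acc[i][j] == a[i] * b[j].
test "4x4 outer-product accumulation sketch" {
    const a = Vec4f{ 1, 2, 3, 4 };
    const b = Vec4f{ 10, 20, 30, 40 };
    var acc: [4][4]f32 = [_][4]f32{[_]f32{0} ** 4} ** 4;
    inline for (0..4) |bi| {
        const a_broadcast: Vec4f = @splat(a[bi]);
        const row = Vec4f{ acc[bi][0], acc[bi][1], acc[bi][2], acc[bi][3] };
        const prod = @mulAdd(Vec4f, a_broadcast, b, row);
        inline for (0..4) |bj| {
            acc[bi][bj] = prod[bj];
        }
    }
    try std.testing.expectEqual(@as(f32, 60.0), acc[1][2]); // 2 * 30
}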
const CpuInfo = struct {
    total_cores: u32,
    simd_width: u32,
    supports_peak_calc: bool = false,
    max_freq: ?u64 = null,
    is_arm: bool,
};

fn getSIMDWidth() u32 {
    if (builtin.cpu.arch == .aarch64) {
        // ARM NEON works with 128-bit vectors (4 x f32)
        return 4;
    } else if (std.Target.x86.featureSetHas(builtin.cpu.features, .avx512f)) {
        return 16;
    } else if (std.Target.x86.featureSetHas(builtin.cpu.features, .avx2)) {
        return 8;
    } else if (std.Target.x86.featureSetHas(builtin.cpu.features, .sse2)) {
        return 4;
    } else {
        return 1;
    }
}
const SearchRange = struct {
    t_min: usize,
    t_max: usize,
    t_step: usize,
    v_sizes: []const usize,
};

pub const SearchResult = struct {
    tile_size: usize,
    vector_size: usize,
    gflops: f64,
};
pub fn adaptiveGridSearch(
    allocator: std.mem.Allocator,
    M: usize,
    N: usize,
    K: usize,
    initial_range: SearchRange,
    refinement_levels: usize,
) !SearchResult {
    var best_result = SearchResult{
        .tile_size = 0,
        .vector_size = 0,
        .gflops = 0,
    };
    var current_range = initial_range;
    for (0..refinement_levels) |level| {
        std.debug.print("\nLevel {d} Search (T: {d}-{d}, step: {d})\n", .{ level, current_range.t_min, current_range.t_max, current_range.t_step });
        std.debug.print("----------------------------------------\n", .{});
        // Search the current range
        var results = std.ArrayList(SearchResult).init(allocator);
        defer results.deinit();
        var t = current_range.t_min;
        while (t <= current_range.t_max) : (t += current_range.t_step) {
            // Skip invalid tile sizes
            if (t > M or t > N or t > K) continue;
            for (current_range.v_sizes) |v| {
                if (t % v != 0) continue; // Skip if tile size is not divisible by vector size
                // Run multiple trials and keep the best
                var best_trial = try benchmarkMatMul(allocator, t, v, M, N, K);
                for (0..2) |_| {
                    const trial = try benchmarkMatMul(allocator, t, v, M, N, K);
                    if (trial.gflops > best_trial.gflops) {
                        best_trial.gflops = trial.gflops;
                    }
                }
                try results.append(.{
                    .tile_size = t,
                    .vector_size = v,
                    .gflops = best_trial.gflops,
                });
                std.debug.print("T={d:<4} V={d:<3}: {d:.2} GFLOPS\n", .{ t, v, best_trial.gflops });
            }
        }
        // Nothing valid at this level (e.g. every tile size was skipped)
        if (results.items.len == 0) break;
        // Find the top performers
        const top_k = 3; // Number of top results to consider
        const top_results = try allocator.alloc(SearchResult, @min(top_k, results.items.len));
        defer allocator.free(top_results);
        // Sort results by GFLOPS (descending)
        std.sort.pdq(SearchResult, results.items, {}, struct {
            fn lessThan(_: void, a: SearchResult, b: SearchResult) bool {
                return a.gflops > b.gflops;
            }
        }.lessThan);
        // Copy the top results
        @memcpy(top_results, results.items[0..top_results.len]);
        // Update the best overall result
        if (results.items[0].gflops > best_result.gflops) {
            best_result = results.items[0];
        }
        // Print the top performers for this level
        std.debug.print("\nTop performers at level {d}:\n", .{level});
        for (top_results, 0..) |result, i| {
            std.debug.print("{d}. T={d:<4} V={d:<3}: {d:.2} GFLOPS\n", .{ i + 1, result.tile_size, result.vector_size, result.gflops });
        }
        // Refine the search range around the best result
        if (level + 1 < refinement_levels) {
            const best_T = top_results[0].tile_size;
            // Halve the step each level; clamp at 1 so the loop always advances
            const step = @max(1, current_range.t_step / 2);
            current_range = SearchRange{
                .t_min = if (best_T > step * 2) best_T - step * 2 else current_range.t_min,
                .t_max = @min(best_T + step * 2, current_range.t_max),
                .t_step = step,
                .v_sizes = current_range.v_sizes,
            };
        }
    }
    return best_result;
}
pub fn benchmarkMatMul(
    allocator: std.mem.Allocator,
    tile_size: usize,
    vector_size: usize,
    M: usize,
    N: usize,
    K: usize,
) !SearchResult {
    const A = try allocator.alloc(f32, M * K);
    defer allocator.free(A);
    const B = try allocator.alloc(f32, K * N);
    defer allocator.free(B);
    const C = try allocator.alloc(f32, M * N);
    defer allocator.free(C);
    // Initialize matrices with random data
    var rng = std.Random.DefaultPrng.init(0);
    const random = rng.random();
    for (A) |*val| val.* = random.float(f32);
    for (B) |*val| val.* = random.float(f32);
    @memset(C, 0);
    // Warm-up run
    try tiledMatMul(allocator, A, B, C, M, N, K, tile_size, vector_size);
    // Actual benchmark
    const start_time = time.nanoTimestamp();
    try tiledMatMul(allocator, A, B, C, M, N, K, tile_size, vector_size);
    const end_time = time.nanoTimestamp();
    const elapsed_ns: u64 = @intCast(end_time - start_time);
    // Calculate GFLOPS: 2 operations (multiply + add) per inner-loop step
    const operations = @as(f64, @floatFromInt(2 * M * N * K));
    const seconds = @as(f64, @floatFromInt(elapsed_ns)) / 1e9;
    const gflops = operations / (seconds * 1e9);
    return SearchResult{
        .tile_size = tile_size,
        .vector_size = vector_size,
        .gflops = gflops,
    };
}
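
// A quick sanity check (not in the original gist) of the GFLOPS arithmetic
// above: 2*M*N*K FLOPs done in 1 second is (2*M*N*K)/1e9 GFLOPS, so a
// 1024x1024x1024 multiply in one second is ~2.147 GFLOPS.
test "gflops arithmetic sketch" {
    const ops: f64 = @floatFromInt(2 * 1024 * 1024 * 1024);
    const seconds: f64 = 1.0;
    const gflops = ops / (seconds * 1e9);
    try std.testing.expectApproxEqAbs(@as(f64, 2.147483648), gflops, 1e-9);
}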
fn getCpuFrequencies() !CpuInfo {
    var info = CpuInfo{
        .simd_width = getSIMDWidth(),
        .total_cores = @intCast(try std.Thread.getCpuCount()),
        .is_arm = (builtin.cpu.arch == .aarch64),
    };
    if (builtin.os.tag == .macos) {
        // On macOS we can't easily read the CPU frequency, so fall back to
        // conservative defaults.
        if (info.is_arm) {
            info.max_freq = 3_200_000_000; // 3.2 GHz default for M1/M2
        } else {
            info.max_freq = 3_000_000_000; // 3.0 GHz default for Intel
        }
        info.supports_peak_calc = true;
    }
    return info;
}
pub fn main() !void {
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();
    const allocator = arena.allocator();
    const sizes = [_][3]usize{
        .{ 1024, 1024, 1024 },
        // .{ 2048, 2048, 2048 },
        // .{ 4096, 4096, 4096 },
    };
    const iterations = 5;
    // Print the system configuration
    const cpu_info = try getCpuFrequencies();
    std.debug.print("\nSystem Configuration:\n", .{});
    std.debug.print("----------------------------------------\n", .{});
    std.debug.print("Architecture: {s}\n", .{if (cpu_info.is_arm) "ARM" else "x86"});
    std.debug.print("CPU Cores: {}\n", .{cpu_info.total_cores});
    if (cpu_info.supports_peak_calc and cpu_info.max_freq != null) {
        std.debug.print("CPU Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.max_freq.?)) / 1e9});
    }
    std.debug.print("SIMD Width: {}\n", .{cpu_info.simd_width});
    std.debug.print("----------------------------------------\n\n", .{});
    // Modified search range for ARM
    const initial_range = SearchRange{
        .t_min = 16, // Smaller minimum tile size for ARM
        .t_max = 256, // Smaller maximum tile size due to the different cache hierarchy
        .t_step = 32,
        .v_sizes = &[_]usize{4}, // Only use vector size 4 for NEON
    };
    for (sizes) |size| {
        const M = size[0];
        const N = size[1];
        const K = size[2];
        std.debug.print("\n========================================\n", .{});
        std.debug.print("Matrix Size: {d}x{d}x{d}\n", .{ M, N, K });
        std.debug.print("========================================\n", .{});
        const best_config = try adaptiveGridSearch(allocator, M, N, K, initial_range, 4);
        std.debug.print("\nBest configuration found:\n", .{});
        std.debug.print("Tile Size (T): {d}\n", .{best_config.tile_size});
        std.debug.print("Vector Size (V): {d}\n", .{best_config.vector_size});
        std.debug.print("Performance: {d:.2} GFLOPS\n\n", .{best_config.gflops});
    }
    // Print summary
    std.debug.print("\n========================================\n", .{});
    std.debug.print("Benchmark Summary:\n", .{});
    std.debug.print("----------------------------------------\n", .{});
    std.debug.print("Total matrix sizes tested: {d}\n", .{sizes.len});
    std.debug.print("Architecture: {s}\n", .{if (cpu_info.is_arm) "ARM (NEON)" else "x86"});
    std.debug.print("Grid search refinement levels: 4\n", .{});
    std.debug.print("Iterations: {d}\n", .{iterations});
    std.debug.print("========================================\n\n", .{});
}
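
// A minimal correctness sketch, not part of the original gist: run the tuned
// entry point on a small case with ragged dimensions (T = 32 and V = 4 are
// assumed reasonable NEON defaults) and compare against a naive triple loop.
test "tiledMatMul matches a naive reference" {
    const allocator = std.testing.allocator;
    const M: usize = 33;
    const N: usize = 17;
    const K: usize = 20;
    const A = try allocator.alloc(f32, M * K);
    defer allocator.free(A);
    const B = try allocator.alloc(f32, K * N);
    defer allocator.free(B);
    const C = try allocator.alloc(f32, M * N);
    defer allocator.free(C);
    for (A, 0..) |*val, i| val.* = @floatFromInt(i % 7);
    for (B, 0..) |*val, i| val.* = @floatFromInt((i + 2) % 5);
    @memset(C, 0);
    try tiledMatMul(allocator, A, B, C, M, N, K, 32, 4);
    for (0..M) |i| {
        for (0..N) |j| {
            var sum: f32 = 0;
            for (0..K) |k| sum += A[i * K + k] * B[k * N + j];
            try std.testing.expectApproxEqAbs(sum, C[i * N + j], 1e-4);
        }
    }
}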
///////////////////////////////////////////////////////////////////////////////
// Second gist file: fixed-tile (T = 256) NEON implementation with benchmarking
///////////////////////////////////////////////////////////////////////////////
const std = @import("std");
const builtin = @import("builtin");
const atomic = std.atomic;

///////////////////////////////////////////////////////////////////////////////
// Configuration Constants
///////////////////////////////////////////////////////////////////////////////

/// Tile size optimized for the Apple Silicon cache hierarchy:
/// - L1D cache: 128KB per performance core (M1/M2/M3)
/// - L2 cache: 4MB-12MB (varies by model)
/// Larger tile size than on Intel due to the bigger L1 cache
const T: usize = 256;

/// Vector width for NEON SIMD (128 bits = 4 floats)
const V: usize = 4;

/// Size of a CPU cache line in bytes
const CACHE_LINE_SIZE: usize = atomic.cache_line;

/// Number of tiles claimed per atomic operation
const CHUNK_SIZE: usize = 1;

/// Alignment for NEON operations (128-bit = 16 bytes)
const NEON_ALIGNMENT = 16;

/// Vector type for 4 32-bit floats (one NEON register)
const Vec4f = @Vector(4, f32);
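
// A small sketch (not in the original gist) of the @mulAdd semantics the
// microkernel below relies on: one fused multiply-add per lane,
// r[i] = a[i] * b[i] + c[i].
test "Vec4f fused multiply-add sketch" {
    const a: Vec4f = @splat(2.0);
    const b = Vec4f{ 1, 2, 3, 4 };
    const c: Vec4f = @splat(10.0);
    const r = @mulAdd(Vec4f, a, b, c);
    try std.testing.expectEqual(@as(f32, 16.0), r[2]); // 2 * 3 + 10
}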
///////////////////////////////////////////////////////////////////////////////
// Thread Management Structures
///////////////////////////////////////////////////////////////////////////////

const ThreadLocalData = struct {
    current_index: atomic.Value(usize) align(CACHE_LINE_SIZE),
    _padding: [CACHE_LINE_SIZE - @sizeOf(atomic.Value(usize))]u8 = undefined,
};

const ThreadContext = struct {
    A: []const f32,
    B: []const f32,
    C: []f32,
    M: usize,
    N: usize,
    K: usize,
    tiles_M: usize,
    tiles_N: usize,
    total_tiles: usize,
    shared_counter: *ThreadLocalData,
};
///////////////////////////////////////////////////////////////////////////////
// Main Matrix Multiplication Function
///////////////////////////////////////////////////////////////////////////////

pub fn tiledMatMul(allocator: std.mem.Allocator, A: []const f32, B: []const f32, C: []f32, M: usize, N: usize, K: usize) !void {
    const tiles_M = (M + T - 1) / T;
    const tiles_N = (N + T - 1) / T;
    const total_tiles = tiles_M * tiles_N;
    var shared_data = ThreadLocalData{ .current_index = atomic.Value(usize).init(0) };
    // Apple Silicon chips mix high-performance and efficiency cores.
    // Use half the logical core count to approximate the performance cores,
    // clamped to at least one thread.
    const num_threads = @max(1, try std.Thread.getCpuCount() / 2);
    var thread_pool = try std.ArrayList(std.Thread).initCapacity(allocator, num_threads);
    defer thread_pool.deinit();
    for (0..num_threads) |_| {
        const context = ThreadContext{
            .A = A,
            .B = B,
            .C = C,
            .M = M,
            .N = N,
            .K = K,
            .tiles_M = tiles_M,
            .tiles_N = tiles_N,
            .total_tiles = total_tiles,
            .shared_counter = &shared_data,
        };
        try thread_pool.append(try std.Thread.spawn(.{}, workerThread, .{context}));
    }
    for (thread_pool.items) |thread| thread.join();
}
///////////////////////////////////////////////////////////////////////////////
// Worker Thread Implementation
///////////////////////////////////////////////////////////////////////////////

fn workerThread(ctx: ThreadContext) void {
    var local_C: [T][T]f32 align(NEON_ALIGNMENT) = undefined;
    while (true) {
        const start_idx = ctx.shared_counter.current_index.fetchAdd(CHUNK_SIZE, .seq_cst);
        if (start_idx >= ctx.total_tiles) break;
        const end_idx = @min(start_idx + CHUNK_SIZE, ctx.total_tiles);
        var idx = start_idx;
        while (idx < end_idx) : (idx += 1) {
            const i = idx / ctx.tiles_N;
            const j = idx % ctx.tiles_N;
            const i_start = i * T;
            const j_start = j * T;
            const i_end = @min(i_start + T, ctx.M);
            const j_end = @min(j_start + T, ctx.N);
            for (0..T) |x| {
                @memset(&local_C[x], 0);
            }
            var k: usize = 0;
            while (k < ctx.K) : (k += T) {
                const k_end = @min(k + T, ctx.K);
                microKernelNEON(ctx, &local_C, i_start, j_start, k, i_end, j_end, k_end);
            }
            // Write results back using NEON vectorized stores
            for (i_start..i_end) |ii| {
                const row_offset = ii * ctx.N;
                const local_row = ii - i_start;
                var j_idx = j_start;
                // Process 4 elements at a time using NEON
                while (j_idx + 4 <= j_end) : (j_idx += 4) {
                    const vec_idx = j_idx - j_start;
                    // Load 4 accumulated elements into a NEON vector
                    const c_vec = Vec4f{
                        local_C[local_row][vec_idx],
                        local_C[local_row][vec_idx + 1],
                        local_C[local_row][vec_idx + 2],
                        local_C[local_row][vec_idx + 3],
                    };
                    // Load the destination elements
                    const dest_vec = Vec4f{
                        ctx.C[row_offset + j_idx],
                        ctx.C[row_offset + j_idx + 1],
                        ctx.C[row_offset + j_idx + 2],
                        ctx.C[row_offset + j_idx + 3],
                    };
                    // Add the vectors and store the results
                    const result = dest_vec + c_vec;
                    for (0..4) |offset| {
                        ctx.C[row_offset + j_idx + offset] = result[offset];
                    }
                }
                // Handle remaining elements
                while (j_idx < j_end) : (j_idx += 1) {
                    ctx.C[row_offset + j_idx] += local_C[local_row][j_idx - j_start];
                }
            }
        }
    }
}
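
// Sketch (not in the original gist) of how a flat tile index from the shared
// counter maps onto the 2-D tile grid above: row-major over tiles_N columns.
test "tile index decomposition sketch" {
    const tiles_N: usize = 3;
    const idx: usize = 7; // the 8th tile overall
    try std.testing.expectEqual(@as(usize, 2), idx / tiles_N); // tile row
    try std.testing.expectEqual(@as(usize, 1), idx % tiles_N); // tile column
}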
///////////////////////////////////////////////////////////////////////////////
// Micro-Kernel Implementation
///////////////////////////////////////////////////////////////////////////////

fn microKernelNEON(
    ctx: ThreadContext,
    local_C: *[T][T]f32,
    i_start: usize,
    j_start: usize,
    k_start: usize,
    i_end: usize,
    j_end: usize,
    k_end: usize,
) void {
    var A_local: [T][T]f32 align(NEON_ALIGNMENT) = undefined;
    var B_local: [T][T]f32 align(NEON_ALIGNMENT) = undefined;
    const k_size = k_end - k_start;
    const i_size = i_end - i_start;
    const j_size = j_end - j_start;
    // Copy the A and B tiles into NEON-aligned scratch buffers
    for (0..i_size) |i| {
        const src_idx = (i_start + i) * ctx.K + k_start;
        for (0..k_size) |k| {
            A_local[i][k] = ctx.A[src_idx + k];
        }
    }
    for (0..k_size) |k| {
        const src_idx = (k_start + k) * ctx.N + j_start;
        for (0..j_size) |j| {
            B_local[k][j] = ctx.B[src_idx + j];
        }
    }
    // Process 4x4 blocks using NEON SIMD
    var i: usize = 0;
    while (i + 4 <= i_size) : (i += 4) {
        var j: usize = 0;
        while (j + 4 <= j_size) : (j += 4) {
            var acc: [4][4]f32 align(NEON_ALIGNMENT) = [_][4]f32{[_]f32{0} ** 4} ** 4;
            // Inner loop with NEON operations
            var k: usize = 0;
            while (k < k_size) : (k += 1) {
                // Load 4 elements from the A tile (one column strip)
                const a_vec = Vec4f{
                    A_local[i][k],
                    A_local[i + 1][k],
                    A_local[i + 2][k],
                    A_local[i + 3][k],
                };
                // Load 4 elements from the B tile (one row strip)
                const b_vec = Vec4f{
                    B_local[k][j],
                    B_local[k][j + 1],
                    B_local[k][j + 2],
                    B_local[k][j + 3],
                };
                // Vectorized outer product
                inline for (0..4) |bi| {
                    const a_broadcast: Vec4f = @splat(a_vec[bi]);
                    const c_vec = Vec4f{
                        acc[bi][0],
                        acc[bi][1],
                        acc[bi][2],
                        acc[bi][3],
                    };
                    const prod = @mulAdd(Vec4f, a_broadcast, b_vec, c_vec);
                    inline for (0..4) |bj| {
                        acc[bi][bj] = prod[bj];
                    }
                }
            }
            // Store results
            for (0..4) |bi| {
                for (0..4) |bj| {
                    local_C[i + bi][j + bj] += acc[bi][bj];
                }
            }
        }
        // Handle edge-case columns
        while (j < j_size) : (j += 1) {
            for (0..4) |bi| {
                var sum: f32 = 0;
                for (0..k_size) |k| {
                    sum = @mulAdd(f32, A_local[i + bi][k], B_local[k][j], sum);
                }
                local_C[i + bi][j] += sum;
            }
        }
    }
    // Handle edge-case rows
    while (i < i_size) : (i += 1) {
        for (0..j_size) |j| {
            var sum: f32 = 0;
            for (0..k_size) |k| {
                sum = @mulAdd(f32, A_local[i][k], B_local[k][j], sum);
            }
            local_C[i][j] += sum;
        }
    }
}
/// ------ BENCHMARKING AND PERFORMANCE ANALYSIS ------ ///

const CpuInfo = struct {
    p_freq: ?u64 = null,
    e_freq: ?u64 = null,
    p_cores: ?u32 = null,
    e_cores: ?u32 = null,
    simd_width: u32,
    total_cores: u32,
    supports_peak_calc: bool,
};

fn getSIMDWidth() u32 {
    // NEON is fixed at 128 bits (4 x f32); the x86 feature checks below are
    // only meaningful when compiling for x86
    if (builtin.cpu.arch == .aarch64) {
        return 4;
    } else if (std.Target.x86.featureSetHas(builtin.cpu.features, .avx512f)) {
        return 16;
    } else if (std.Target.x86.featureSetHas(builtin.cpu.features, .avx2)) {
        return 8;
    } else if (std.Target.x86.featureSetHas(builtin.cpu.features, .sse2)) {
        return 4;
    } else {
        return 1;
    }
}
fn getCpuFrequencies() !CpuInfo {
    var info = CpuInfo{
        .simd_width = getSIMDWidth(),
        .total_cores = @intCast(try std.Thread.getCpuCount()),
        .supports_peak_calc = false,
    };
    switch (builtin.os.tag) {
        .linux => {
            // Try reading the hybrid-architecture core mapping
            if (std.fs.openFileAbsolute("/sys/devices/cpu_core/cpus", .{})) |p_core_file| {
                defer p_core_file.close();
                var buf: [32]u8 = undefined;
                if (p_core_file.reader().readAll(&buf)) |bytes_read| {
                    const p_cores_str = buf[0..bytes_read];
                    // Count cores from the range (e.g. "0-7" means 8 threads)
                    if (std.mem.indexOf(u8, p_cores_str, "-")) |dash_idx| {
                        const end_thread = try std.fmt.parseInt(u32, std.mem.trim(u8, p_cores_str[dash_idx + 1 ..], " \n\r"), 10);
                        const start_thread = try std.fmt.parseInt(u32, p_cores_str[0..dash_idx], 10);
                        info.p_cores = @divFloor(end_thread - start_thread + 1, 2); // Divide by 2 for hyperthreading
                    }
                } else |_| {}
            } else |_| {}
            if (std.fs.openFileAbsolute("/sys/devices/cpu_atom/cpus", .{})) |e_core_file| {
                defer e_core_file.close();
                var buf: [32]u8 = undefined;
                if (e_core_file.reader().readAll(&buf)) |bytes_read| {
                    const e_cores_str = buf[0..bytes_read];
                    // Count cores from the range (e.g. "8-15" means 8 threads)
                    if (std.mem.indexOf(u8, e_cores_str, "-")) |dash_idx| {
                        const end_thread = try std.fmt.parseInt(u32, std.mem.trim(u8, e_cores_str[dash_idx + 1 ..], " \n\r"), 10);
                        const start_thread = try std.fmt.parseInt(u32, e_cores_str[0..dash_idx], 10);
                        info.e_cores = end_thread - start_thread + 1; // E-cores don't have hyperthreading
                    }
                } else |_| {}
            } else |_| {}
            // Try reading frequencies for P-cores
            if (info.p_cores != null) {
                if (std.fs.openFileAbsolute("/sys/devices/cpu_core/frequency", .{})) |freq_file| {
                    defer freq_file.close();
                    var buf: [32]u8 = undefined;
                    if (freq_file.reader().readAll(&buf)) |bytes_read| {
                        if (std.fmt.parseInt(u64, std.mem.trim(u8, buf[0..bytes_read], " \n\r"), 10)) |freq| {
                            info.p_freq = freq * 1000; // Convert kHz to Hz
                        } else |_| {
                            info.p_freq = 4_500_000_000; // Default 4.5 GHz for P-cores
                        }
                    } else |_| {}
                } else |_| {
                    info.p_freq = 4_500_000_000; // Default 4.5 GHz for P-cores
                }
            }
            // Try reading frequencies for E-cores
            if (info.e_cores != null) {
                if (std.fs.openFileAbsolute("/sys/devices/cpu_atom/frequency", .{})) |freq_file| {
                    defer freq_file.close();
                    var buf: [32]u8 = undefined;
                    if (freq_file.reader().readAll(&buf)) |bytes_read| {
                        if (std.fmt.parseInt(u64, std.mem.trim(u8, buf[0..bytes_read], " \n\r"), 10)) |freq| {
                            info.e_freq = freq * 1000; // Convert kHz to Hz
                        } else |_| {
                            info.e_freq = 3_500_000_000; // Default 3.5 GHz for E-cores
                        }
                    } else |_| {}
                } else |_| {
                    info.e_freq = 3_500_000_000; // Default 3.5 GHz for E-cores
                }
            }
            // If we detected any cores, we can do the peak calculation
            info.supports_peak_calc = (info.p_cores != null and info.p_freq != null) or
                (info.e_cores != null and info.e_freq != null);
            // If we couldn't detect anything, fall back to treating all cores as P-cores
            if (!info.supports_peak_calc) {
                info.p_cores = info.total_cores;
                info.e_cores = 0;
                info.p_freq = 3_500_000_000; // Conservative 3.5 GHz default
                info.supports_peak_calc = true;
                std.debug.print("Warning: Could not detect hybrid architecture. Assuming all P-cores at 3.5 GHz\n", .{});
            }
        },
        .windows, .macos => {
            info.p_cores = info.total_cores;
            info.e_cores = 0;
            info.supports_peak_calc = false;
        },
        else => {
            @compileError("Unsupported operating system");
        },
    }
    return info;
}
fn calculateTheoreticalPeak(info: CpuInfo) ?f64 {
    if (!info.supports_peak_calc) {
        return null;
    }
    // Per core class: frequency (GHz) * cores * SIMD lanes * 2 (an FMA counts
    // as a multiply plus an add)
    const p_core_gflops = if (info.p_freq != null and info.p_cores != null)
        (@as(f64, @floatFromInt(info.p_freq.?)) / 1e9) *
            @as(f64, @floatFromInt(info.p_cores.? * info.simd_width * 2))
    else
        0;
    const e_core_gflops = if (info.e_freq != null and info.e_cores != null)
        (@as(f64, @floatFromInt(info.e_freq.?)) / 1e9) *
            @as(f64, @floatFromInt(info.e_cores.? * info.simd_width * 2))
    else
        0;
    return p_core_gflops + e_core_gflops;
}
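
// Worked example (not in the original gist) of the peak formula above, with
// hypothetical numbers: 8 P-cores at 4.5 GHz with 8-wide SIMD and 2 FLOPs per
// FMA give 4.5 * 8 * 8 * 2 = 576 GFLOPS.
test "theoretical peak sketch" {
    const info = CpuInfo{
        .p_freq = 4_500_000_000,
        .p_cores = 8,
        .simd_width = 8,
        .total_cores = 8,
        .supports_peak_calc = true,
    };
    try std.testing.expectApproxEqAbs(@as(f64, 576.0), calculateTheoreticalPeak(info).?, 1e-9);
}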
pub fn calculatePreciseGflops(allocator: std.mem.Allocator, M: usize, N: usize, K: usize, iterations: usize) !void {
    const A = try allocator.alloc(f32, M * K);
    defer allocator.free(A);
    const B = try allocator.alloc(f32, K * N);
    defer allocator.free(B);
    const C = try allocator.alloc(f32, M * N);
    defer allocator.free(C);
    // Initialize matrices
    var rng = std.Random.DefaultPrng.init(@intCast(std.time.timestamp()));
    const random = rng.random();
    for (A) |*val| {
        val.* = random.float(f32) * 2.0 - 1.0;
    }
    for (B) |*val| {
        val.* = random.float(f32) * 2.0 - 1.0;
    }
    @memset(C, 0);
    // Warm-up run
    try tiledMatMul(allocator, A, B, C, M, N, K);
    // Track performance metrics: each output element costs K multiplies and
    // K - 1 adds, i.e. 2K - 1 FLOPs
    const ops_per_element = 2 * K - 1;
    const total_elements = M * N;
    const flops_per_iteration = total_elements * ops_per_element;
    const metrics = try allocator.alloc(struct {
        gflops: f64,
        time_ms: f64,
    }, iterations);
    defer allocator.free(metrics);
    // Main measurement loop
    var timer = try std.time.Timer.start();
    const total_start = timer.read();
    for (0..iterations) |i| {
        @memset(C, 0);
        const iter_start = timer.read();
        try tiledMatMul(allocator, A, B, C, M, N, K);
        const iter_end = timer.read();
        const iter_time_ns = iter_end - iter_start;
        const iter_time_ms = @as(f64, @floatFromInt(iter_time_ns)) / 1e6;
        const iter_gflops = @as(f64, @floatFromInt(flops_per_iteration)) / (@as(f64, @floatFromInt(iter_time_ns)) / 1e9) / 1e9;
        metrics[i] = .{
            .gflops = iter_gflops,
            .time_ms = iter_time_ms,
        };
    }
    // Calculate statistics
    const total_time_ns = timer.read() - total_start;
    const total_time_sec = @as(f64, @floatFromInt(total_time_ns)) / 1e9;
    var min_gflops: f64 = metrics[0].gflops;
    var max_gflops: f64 = metrics[0].gflops;
    var total_gflops: f64 = 0;
    var first_quarter_avg: f64 = 0;
    var last_quarter_avg: f64 = 0;
    const quarter_size = iterations / 4;
    for (metrics, 0..) |m, i| {
        min_gflops = @min(min_gflops, m.gflops);
        max_gflops = @max(max_gflops, m.gflops);
        total_gflops += m.gflops;
        if (i < quarter_size) {
            first_quarter_avg += m.gflops;
        } else if (i >= iterations - quarter_size) {
            last_quarter_avg += m.gflops;
        }
    }
    first_quarter_avg /= @as(f64, @floatFromInt(quarter_size));
    last_quarter_avg /= @as(f64, @floatFromInt(quarter_size));
    const avg_gflops = total_gflops / @as(f64, @floatFromInt(iterations));
    const perf_change_pct = ((last_quarter_avg - first_quarter_avg) / first_quarter_avg) * 100.0;
    // Get CPU info and the theoretical peak
    const cpu_info = try getCpuFrequencies();
    // Print a detailed performance report
    std.debug.print("\nPerformance Analysis for {d}x{d}x{d} Matrix:\n", .{ M, N, K });
    std.debug.print("----------------------------------------\n", .{});
    std.debug.print("Peak Performance: {d:.2} GFLOPS\n", .{max_gflops});
    std.debug.print("Lowest Performance: {d:.2} GFLOPS\n", .{min_gflops});
    std.debug.print("Average Performance: {d:.2} GFLOPS\n", .{avg_gflops});
    std.debug.print("First Quarter Avg: {d:.2} GFLOPS\n", .{first_quarter_avg});
    std.debug.print("Last Quarter Avg: {d:.2} GFLOPS\n", .{last_quarter_avg});
    if (perf_change_pct >= 0) {
        std.debug.print("Performance Change: {d:.1}% (improvement)\n", .{perf_change_pct});
    } else {
        std.debug.print("Performance Change: {d:.1}% (degradation)\n", .{perf_change_pct});
    }
    std.debug.print("Total Runtime: {d:.3} seconds\n", .{total_time_sec});
    // CPU information and theoretical peak
    std.debug.print("\nCPU Configuration:\n", .{});
    std.debug.print("Total Cores: {d}\n", .{cpu_info.total_cores});
    if (cpu_info.p_cores != null) {
        std.debug.print("P-Cores: {d}\n", .{cpu_info.p_cores.?});
    }
    if (cpu_info.e_cores != null) {
        std.debug.print("E-Cores: {d}\n", .{cpu_info.e_cores.?});
    }
    std.debug.print("SIMD Width: {d} floats\n", .{cpu_info.simd_width});
    if (cpu_info.supports_peak_calc) {
        if (cpu_info.p_freq != null) {
            std.debug.print("P-Core Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.p_freq.?)) / 1e9});
        }
        if (cpu_info.e_freq != null) {
            std.debug.print("E-Core Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.e_freq.?)) / 1e9});
        }
        if (calculateTheoreticalPeak(cpu_info)) |peak| {
            std.debug.print("\nTheoretical Peak: {d:.2} GFLOPS\n", .{peak});
            if (peak > 0) {
                std.debug.print("Peak Efficiency: {d:.1}% of theoretical\n", .{(max_gflops / peak) * 100.0});
            }
        }
    } else {
        std.debug.print("\nNote: Theoretical peak calculation is not supported on this OS\n", .{});
    }
    // Print the performance distribution
    std.debug.print("\nPerformance Distribution:\n", .{});
    const distribution_width = 60;
    for (metrics) |m| {
        // Guard against division by zero when every iteration hit the same rate
        const range = max_gflops - min_gflops;
        const normalized = if (range > 0) (m.gflops - min_gflops) / range else 1.0;
        const stars = @as(usize, @intFromFloat(normalized * @as(f64, @floatFromInt(distribution_width))));
        std.debug.print("{d:.2} GFLOPS |", .{m.gflops});
        for (0..stars) |_| {
            std.debug.print("*", .{});
        }
        std.debug.print("\n", .{});
    }
}
pub fn main() !void {
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();
    const allocator = arena.allocator();
    const sizes = [_][3]usize{
        // Comment out sizes as needed
        .{ 256, 256, 256 },
        .{ 512, 512, 512 },
        .{ 1024, 1024, 1024 },
        .{ 1024, 2048, 1024 },
        .{ 1500, 1500, 1500 },
        .{ 2048, 2048, 2048 },
    };
    const iterations = 10;
    // Print the system configuration
    const cpu_info = try getCpuFrequencies();
    std.debug.print("\nSystem Configuration:\n", .{});
    std.debug.print("----------------------------------------\n", .{});
    std.debug.print("Tile size (T): {}\n", .{T});
    std.debug.print("Vector size (V): {}\n", .{V});
    std.debug.print("CPU Cores: {}\n", .{cpu_info.total_cores});
    if (cpu_info.supports_peak_calc) {
        if (cpu_info.p_freq != null) {
            std.debug.print("P-Core Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.p_freq.?)) / 1e9});
        }
        if (cpu_info.e_freq != null) {
            std.debug.print("E-Core Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.e_freq.?)) / 1e9});
        }
    }
    std.debug.print("NEON Vector Width: {}\n", .{@sizeOf(Vec4f) / @sizeOf(f32)});
    std.debug.print("Iterations per test: {}\n", .{iterations});
    std.debug.print("----------------------------------------\n\n", .{});
    // Run benchmarks with increasing matrix sizes
    for (sizes) |size| {
        const M = size[0];
        const N = size[1];
        const K = size[2];
        std.debug.print("\n========================================\n", .{});
        std.debug.print("Running benchmark for {d}x{d}x{d} matrices\n", .{ M, N, K });
        std.debug.print("========================================\n", .{});
        try calculatePreciseGflops(allocator, M, N, K, iterations);
        // Add a small delay between tests to let the CPU cool down
        std.time.sleep(1 * std.time.ns_per_s);
    }
    // Print summary
    std.debug.print("\n========================================\n", .{});
    std.debug.print("Benchmark Summary:\n", .{});
    std.debug.print("----------------------------------------\n", .{});
    std.debug.print("Total matrix sizes tested: {d}\n", .{sizes.len});
    std.debug.print("Total iterations per size: {d}\n", .{iterations});
    std.debug.print("NEON SIMD utilized: Yes\n", .{});
    std.debug.print("Tiled algorithm: Yes ({}x{})\n", .{ T, T });
    std.debug.print("========================================\n\n", .{});
}
// Additional test for edge cases
test "tiledMatMul edge cases" {
    const allocator = std.testing.allocator;
    // Test with sizes that are not multiples of the tile size
    const edge_sizes = [_][3]usize{
        .{ T + 1, T - 1, T + 2 }, // Around tile size
        .{ T * 2 - 1, T * 2 + 1, T * 2 }, // Around double tile size
        .{ 1, T, T }, // Single row
        .{ T, 1, T }, // Single column
        .{ T, T, 1 }, // Single depth
    };
    for (edge_sizes) |size| {
        const M = size[0];
        const N = size[1];
        const K = size[2];
        const A = try allocator.alloc(f32, M * K);
        defer allocator.free(A);
        const B = try allocator.alloc(f32, K * N);
        defer allocator.free(B);
        const C = try allocator.alloc(f32, M * N);
        defer allocator.free(C);
        const C_ref = try allocator.alloc(f32, M * N);
        defer allocator.free(C_ref);
        // Initialize matrices
        for (A, 0..) |*val, i| {
            val.* = @floatFromInt(i % 7);
        }
        for (B, 0..) |*val, i| {
            val.* = @floatFromInt((i + 1) % 5);
        }
        @memset(C, 0);
        @memset(C_ref, 0);
        // Perform the tiled multiplication
        try tiledMatMul(allocator, A, B, C, M, N, K);
        // Reference calculation
        for (0..M) |i| {
            for (0..N) |j| {
                var sum: f32 = 0;
                for (0..K) |k| {
                    sum += A[i * K + k] * B[k * N + j];
                }
                C_ref[i * N + j] = sum;
            }
        }
        // Compare results with a higher tolerance for edge cases
        for (C, C_ref) |c, c_ref| {
            try std.testing.expectApproxEqAbs(c, c_ref, 1e-4);
        }
        std.debug.print("Edge case test passed for size: M={}, N={}, K={}\n", .{ M, N, K });
    }
}
// Example usage
test "matrix multiplication" {
    const allocator = std.testing.allocator;
    const M: usize = 1024;
    const N: usize = 1024;
    const K: usize = 1024;
    const A = try allocator.alloc(f32, M * K);
    defer allocator.free(A);
    const B = try allocator.alloc(f32, K * N);
    defer allocator.free(B);
    const C = try allocator.alloc(f32, M * N);
    defer allocator.free(C);
    // Initialize matrices
    for (0..M * K) |i| {
        A[i] = @floatFromInt(i % 7);
    }
    for (0..K * N) |i| {
        B[i] = @floatFromInt(i % 5);
    }
    @memset(C, 0);
    try tiledMatMul(allocator, A, B, C, M, N, K);
}
// Test function
test "tiledMatMul correctness" {
    const allocator = std.testing.allocator;
    const test_sizes = [_][3]usize{
        .{ 128, 128, 128 },
        .{ 100, 100, 100 },
        .{ 200, 150, 175 },
        .{ 32, 64, 48 },
        .{ 47, 34, 45 },
    };
    for (test_sizes) |size| {
        const M = size[0];
        const N = size[1];
        const K = size[2];
        const A = try allocator.alloc(f32, M * K);
        defer allocator.free(A);
        const B = try allocator.alloc(f32, K * N);
        defer allocator.free(B);
        const C = try allocator.alloc(f32, M * N);
        defer allocator.free(C);
        const C_ref = try allocator.alloc(f32, M * N);
        defer allocator.free(C_ref);
        // Initialize matrices
        for (A, 0..) |*val, i| {
            val.* = @floatFromInt(i % 10);
        }
        for (B, 0..) |*val, i| {
            val.* = @floatFromInt((i + 1) % 10);
        }
        @memset(C, 0);
        @memset(C_ref, 0);
        // Perform tiled matrix multiplication
        try tiledMatMul(allocator, A, B, C, M, N, K);
        // Perform reference matrix multiplication
        for (0..M) |i| {
            for (0..N) |j| {
                var sum: f32 = 0;
                for (0..K) |k| {
                    sum += A[i * K + k] * B[k * N + j];
                }
                C_ref[i * N + j] = sum;
            }
        }
        // Compare results
        for (C, C_ref) |c, c_ref| {
            try std.testing.expectApproxEqAbs(c, c_ref, 1e-5);
        }
        std.debug.print("Test passed for size: M={}, N={}, K={}\n", .{ M, N, K });
    }
}