Highly Performant Matrix Multiplication in Zig (with a Grid Search to Find Optimal Parameters)
// To run tests, do:
// zig test -OReleaseSafe -mcpu=native matmul.zig
// To run main, do:
// zig build-exe -OReleaseFast -mcpu=native matmul.zig
// ./matmul
//
// ███╗   ███╗ █████╗ ████████╗███╗   ███╗██╗   ██╗██╗     ░░░░░░███████╗██╗ ██████╗
// ████╗ ████║██╔══██╗╚══██╔══╝████╗ ████║██║   ██║██║     ░░░░░░╚══███╔╝██║██╔════╝
// ██╔████╔██║███████║   ██║   ██╔████╔██║██║   ██║██║     ──────  ███╔╝ ██║██║  ███╗
// ██║╚██╔╝██║██╔══██║   ██║   ██║╚██╔╝██║██║   ██║██║     ░░░░░░ ███╔╝  ██║██║   ██║
// ██║ ╚═╝ ██║██║  ██║   ██║   ██║ ╚═╝ ██║╚██████╔╝███████╗░░░░░░███████╗██║╚██████╔╝
// ╚═╝     ╚═╝╚═╝  ╚═╝   ╚═╝   ╚═╝     ╚═╝ ╚═════╝ ╚══════╝░░░░░░╚══════╝╚═╝ ╚═════╝
//
// High Performance Matrix Multiplication Implementation
// Optimized for Modern Intel Processors
//
////////////////////////////////////////////////////////////////////////////////
const std = @import("std");
const builtin = @import("builtin");
const atomic = std.atomic;
///////////////////////////////////////////////////////////////////////////////
// Configuration Constants
///////////////////////////////////////////////////////////////////////////////
/// Tile size for matrix blocking, chosen to balance L1/L2 cache utilization.
/// A value of 160 is tuned for a 13th Gen Intel Core i7's cache hierarchy:
/// - L1D cache: 48KB (holds the active rows of a tile)
/// - L2 cache: 2MB (holds the full tile working set, allowing good tile reuse)
const T: usize = 160;
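// Illustrative sanity check (not in the original gist): the per-thread
// working set is three T×T f32 tiles (A_local, B_local, local_C), i.e.
// 3 * 160 * 160 * 4 ≈ 300KB, which fits the 2MB L2 but not the 48KB L1D.
comptime {
    const tile_bytes = T * T * @sizeOf(f32);
    if (3 * tile_bytes > 2 * 1024 * 1024) @compileError("tile working set exceeds assumed 2MB L2");
}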
/// Vector width for SIMD operations. Matches AVX2 register size (256 bits = 8 floats)
const V: usize = 8;
/// Size of CPU cache line in bytes. Used for alignment and padding.
const CACHE_LINE_SIZE: usize = atomic.cache_line;
/// Number of tiles processed per thread per atomic operation.
/// Small value (1) chosen for better load balancing between threads.
const CHUNK_SIZE: usize = 1;
/// Alignment requirement for AVX2 operations: 32 bytes matches the 256-bit
/// AVX2 register width used for @Vector(8, f32) loads and stores
const AVX2_ALIGNMENT = 32;
/// Size of micro-kernel processing block, automatically determined based on
/// available SIMD capabilities, falling back to 8 if no SIMD available
const MICRO_KERNEL_SIZE: usize = std.simd.suggestVectorLength(f32) orelse 8;
/// Vector type for 8 32-bit floats, matching AVX2 register size
const Vec8f = @Vector(8, f32);
///////////////////////////////////////////////////////////////////////////////
// Thread Management Structures
///////////////////////////////////////////////////////////////////////////////
/// Thread-local data structure for work distribution
/// Aligned and padded to prevent false sharing between threads
const ThreadLocalData = struct {
    /// Atomic counter for distributing work among threads
    current_index: atomic.Value(usize) align(CACHE_LINE_SIZE),
    /// Padding to ensure structure fills entire cache line
    _padding: [CACHE_LINE_SIZE - @sizeOf(atomic.Value(usize))]u8 = undefined,
};
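// Illustrative test (added): the padding math should make the struct exactly
// one cache line, so adjacent counters can never share a line.
test "ThreadLocalData fills one cache line" {
    try std.testing.expectEqual(CACHE_LINE_SIZE, @sizeOf(ThreadLocalData));
}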
/// Context structure passed to each worker thread
const ThreadContext = struct {
    /// Input matrix A (M x K)
    A: []const f32,
    /// Input matrix B (K x N)
    B: []const f32,
    /// Output matrix C (M x N)
    C: []f32,
    /// Number of rows in matrix A and C
    M: usize,
    /// Number of columns in matrix B and C
    N: usize,
    /// Number of columns in A / rows in B
    K: usize,
    /// Number of tiles in M dimension
    tiles_M: usize,
    /// Number of tiles in N dimension
    tiles_N: usize,
    /// Total number of tiles to process
    total_tiles: usize,
    /// Shared atomic counter for work distribution
    shared_counter: *ThreadLocalData,
};
///////////////////////////////////////////////////////////////////////////////
// Main Matrix Multiplication Function
///////////////////////////////////////////////////////////////////////////////
/// Performs parallel tiled matrix multiplication: C = A * B
///
/// Algorithm overview:
/// 1. Divides matrices into tiles of size T x T
/// 2. Creates thread pool based on available CPU cores
/// 3. Distributes tiles to threads using atomic counter
/// 4. Each thread:
///    - Processes assigned tiles using micro-kernels
///    - Uses local accumulation buffers
///    - Writes results back to main memory
///
/// Parameters:
/// - allocator: Memory allocator for thread management
/// - A: Input matrix A (M x K)
/// - B: Input matrix B (K x N)
/// - C: Output matrix C (M x N)
/// - M: Number of rows in A and C
/// - N: Number of columns in B and C
/// - K: Number of columns in A / rows in B
///
/// Returns: Error if thread creation fails
pub fn tiledMatMul(allocator: std.mem.Allocator, A: []const f32, B: []const f32, C: []f32, M: usize, N: usize, K: usize) !void {
    // Calculate tile grid dimensions
    const tiles_M = (M + T - 1) / T; // Ceiling division for M dimension
    const tiles_N = (N + T - 1) / T; // Ceiling division for N dimension
    const total_tiles = tiles_M * tiles_N;
    // Initialize shared atomic counter for work distribution
    var shared_data = ThreadLocalData{ .current_index = atomic.Value(usize).init(0) };
    // Get number of available CPU cores
    const num_threads = try std.Thread.getCpuCount();
    // Create thread pool
    var thread_pool = try std.ArrayList(std.Thread).initCapacity(allocator, num_threads);
    defer thread_pool.deinit();
    // Spawn worker threads
    for (0..num_threads) |_| {
        const context = ThreadContext{
            .A = A,
            .B = B,
            .C = C,
            .M = M,
            .N = N,
            .K = K,
            .tiles_M = tiles_M,
            .tiles_N = tiles_N,
            .total_tiles = total_tiles,
            .shared_counter = &shared_data,
        };
        try thread_pool.append(try std.Thread.spawn(.{}, workerThread, .{context}));
    }
    // Wait for all threads to complete
    for (thread_pool.items) |thread| thread.join();
}
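// Usage sketch (mirrors the tests at the bottom of this file): allocate
// row-major M*K, K*N, and M*N slices, zero C, then call:
//   try tiledMatMul(allocator, A, B, C, M, N, K);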
///////////////////////////////////////////////////////////////////////////////
// Worker Thread Implementation
///////////////////////////////////////////////////////////////////////////////
/// Worker thread function that processes matrix tiles in parallel
///
/// Algorithm overview:
/// 1. Initializes local accumulator buffer aligned for AVX2
/// 2. Fetches work chunks using atomic counter
/// 3. For each tile:
///    - Calculates tile boundaries
///    - Processes tile using micro-kernel
///    - Accumulates results locally
///    - Writes results back to global memory
///
/// Work distribution:
/// - Uses atomic fetch-add for load balancing
/// - Processes CHUNK_SIZE tiles per fetch
/// - Continues until all tiles are processed
///
/// Memory access pattern:
/// - Uses local accumulator to minimize global memory traffic
/// - Performs vectorized read-modify-write for result writeback
/// - Aligns data for efficient SIMD operations
///
/// Parameters:
/// - ctx: ThreadContext containing matrices and dimensions
fn workerThread(ctx: ThreadContext) void {
    // Initialize local accumulator buffer, aligned for AVX2 operations
    var local_C: [T][T]f32 align(AVX2_ALIGNMENT) = undefined;
    while (true) {
        // Atomically fetch next chunk of work
        const start_idx = ctx.shared_counter.current_index.fetchAdd(CHUNK_SIZE, .seq_cst);
        if (start_idx >= ctx.total_tiles) break;
        // Calculate end of current chunk
        const end_idx = @min(start_idx + CHUNK_SIZE, ctx.total_tiles);
        var idx = start_idx;
        // Process all tiles in current chunk
        while (idx < end_idx) : (idx += 1) {
            // Convert linear index to 2D tile coordinates
            const i = idx / ctx.tiles_N; // Tile row index
            const j = idx % ctx.tiles_N; // Tile column index
            // Calculate tile boundaries in original matrix
            const i_start = i * T;
            const j_start = j * T;
            const i_end = @min(i_start + T, ctx.M); // Handle edge tiles
            const j_end = @min(j_start + T, ctx.N); // Handle edge tiles
            // Clear local accumulator buffer
            for (0..T) |x| {
                @memset(&local_C[x], 0);
            }
            // Process tile using T×T blocks along K dimension
            var k: usize = 0;
            while (k < ctx.K) : (k += T) {
                const k_end = @min(k + T, ctx.K);
                microKernelAVX2(ctx, &local_C, i_start, j_start, k, i_end, j_end, k_end);
            }
            // Add accumulated results back into global memory
            for (i_start..i_end) |ii| {
                const row_offset = ii * ctx.N;
                const local_row = ii - i_start;
                var j_idx = j_start;
                // Vector write-back for aligned portions using AVX2
                while (j_idx + 8 <= j_end) : (j_idx += 8) {
                    const vec_idx = j_idx - j_start;
                    // Load 8 elements into vector
                    const c_vec = Vec8f{
                        local_C[local_row][vec_idx],
                        local_C[local_row][vec_idx + 1],
                        local_C[local_row][vec_idx + 2],
                        local_C[local_row][vec_idx + 3],
                        local_C[local_row][vec_idx + 4],
                        local_C[local_row][vec_idx + 5],
                        local_C[local_row][vec_idx + 6],
                        local_C[local_row][vec_idx + 7],
                    };
                    // Load destination elements
                    const dest_vec = Vec8f{
                        ctx.C[row_offset + j_idx],
                        ctx.C[row_offset + j_idx + 1],
                        ctx.C[row_offset + j_idx + 2],
                        ctx.C[row_offset + j_idx + 3],
                        ctx.C[row_offset + j_idx + 4],
                        ctx.C[row_offset + j_idx + 5],
                        ctx.C[row_offset + j_idx + 6],
                        ctx.C[row_offset + j_idx + 7],
                    };
                    // Add vectors and write back results
                    const result = dest_vec + c_vec;
                    for (0..8) |offset| {
                        ctx.C[row_offset + j_idx + offset] = result[offset];
                    }
                }
                // Handle remaining unaligned elements with scalar code
                while (j_idx < j_end) : (j_idx += 1) {
                    ctx.C[row_offset + j_idx] += local_C[local_row][j_idx - j_start];
                }
            }
        }
    }
}
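// Illustrative test (added): workerThread maps a linear tile index to 2D
// tile coordinates with division and modulo by tiles_N.
test "linear tile index maps to (row, col)" {
    const tiles_N_example: usize = 4; // hypothetical 4-tile-wide grid
    const idx: usize = 6;
    try std.testing.expectEqual(@as(usize, 1), idx / tiles_N_example); // tile row
    try std.testing.expectEqual(@as(usize, 2), idx % tiles_N_example); // tile column
}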
///////////////////////////////////////////////////////////////////////////////
// Micro-Kernel Implementation
///////////////////////////////////////////////////////////////////////////////
/// High-performance micro-kernel for matrix multiplication using AVX2 SIMD instructions
///
/// Architecture-specific optimizations:
/// - Uses AVX2 SIMD instructions for 8-wide float operations
/// - Leverages FMA (Fused Multiply-Add) instructions
/// - Implements register blocking for maximum reuse
/// - Aligns local buffers for 32-byte AVX2 access
///
/// Algorithm breakdown:
/// 1. Loads tile data into aligned local buffers
/// 2. Processes 8x8 blocks using SIMD operations
/// 3. Handles edge cases with scalar operations
/// 4. Accumulates results with FMA, avoiding intermediate rounding
///
/// Memory optimization:
/// - Uses aligned local buffers for A and B matrices
/// - Copies tiles into contiguous storage before the inner loops
/// - Maintains cache-friendly access patterns
/// - Minimizes cache pollution through blocking
///
/// Parameters:
/// - ctx: Thread context with matrix data and dimensions
/// - local_C: Local accumulator buffer for results
/// - i_start: Starting row index in global matrix
/// - j_start: Starting column index in global matrix
/// - k_start: Starting index in reduction dimension
/// - i_end: Ending row index (exclusive)
/// - j_end: Ending column index (exclusive)
/// - k_end: Ending reduction index (exclusive)
fn microKernelAVX2(
    ctx: ThreadContext,
    local_C: *[T][T]f32,
    i_start: usize,
    j_start: usize,
    k_start: usize,
    i_end: usize,
    j_end: usize,
    k_end: usize,
) void {
    // Local tile buffers aligned for AVX2 operations
    var A_local: [T][T]f32 align(32) = undefined;
    var B_local: [T][T]f32 align(32) = undefined;
    // Calculate tile dimensions
    const k_size = k_end - k_start;
    const i_size = i_end - i_start;
    const j_size = j_end - j_start;
    // Copy the A tile into a contiguous local buffer
    // Stride-1 access pattern for optimal cache line utilization
    for (0..i_size) |i| {
        const src_idx = (i_start + i) * ctx.K + k_start;
        for (0..k_size) |k| {
            A_local[i][k] = ctx.A[src_idx + k];
        }
    }
    // Copy the B tile into a contiguous local buffer
    // Keeps the inner j loop stride-1 for better cache access
    for (0..k_size) |k| {
        const src_idx = (k_start + k) * ctx.N + j_start;
        for (0..j_size) |j| {
            B_local[k][j] = ctx.B[src_idx + j];
        }
    }
    // Process full 8x8 blocks using AVX2 SIMD
    var i: usize = 0;
    while (i + MICRO_KERNEL_SIZE <= i_size) : (i += MICRO_KERNEL_SIZE) {
        var j: usize = 0;
        while (j + MICRO_KERNEL_SIZE <= j_size) : (j += MICRO_KERNEL_SIZE) {
            // Initialize accumulator for 8x8 block
            var acc: [8][8]f32 align(32) = [_][8]f32{[_]f32{0} ** 8} ** 8;
            // Inner loop - dot product with SIMD
            var k: usize = 0;
            while (k < k_size) : (k += 1) {
                // Load 8 elements from A matrix into vector
                const a_vec = Vec8f{
                    A_local[i][k],     A_local[i + 1][k],
                    A_local[i + 2][k], A_local[i + 3][k],
                    A_local[i + 4][k], A_local[i + 5][k],
                    A_local[i + 6][k], A_local[i + 7][k],
                };
                // Load 8 elements from B matrix into vector
                const b_vec = Vec8f{
                    B_local[k][j],     B_local[k][j + 1],
                    B_local[k][j + 2], B_local[k][j + 3],
                    B_local[k][j + 4], B_local[k][j + 5],
                    B_local[k][j + 6], B_local[k][j + 7],
                };
                // Vectorized outer product using FMA
                inline for (0..8) |bi| {
                    // Broadcast single A element to all lanes
                    const a_broadcast: Vec8f = @splat(a_vec[bi]);
                    // Load current accumulator values
                    const c_vec = Vec8f{
                        acc[bi][0], acc[bi][1], acc[bi][2], acc[bi][3],
                        acc[bi][4], acc[bi][5], acc[bi][6], acc[bi][7],
                    };
                    // Fused multiply-add: acc += a * b
                    const prod = @mulAdd(Vec8f, a_broadcast, b_vec, c_vec);
                    // Store results back to accumulator
                    inline for (0..8) |bj| {
                        acc[bi][bj] = prod[bj];
                    }
                }
            }
            // Accumulate 8x8 block results to main buffer
            for (0..8) |bi| {
                for (0..8) |bj| {
                    local_C[i + bi][j + bj] += acc[bi][bj];
                }
            }
        }
        // Handle edge case columns (< 8 columns remaining)
        while (j < j_size) : (j += 1) {
            for (0..8) |bi| {
                var sum: f32 = 0;
                // Scalar operations for edge cases
                for (0..k_size) |k| {
                    sum = @mulAdd(f32, A_local[i + bi][k], B_local[k][j], sum);
                }
                local_C[i + bi][j] += sum;
            }
        }
    }
    // Handle edge case rows (< 8 rows remaining)
    while (i < i_size) : (i += 1) {
        for (0..j_size) |j| {
            var sum: f32 = 0;
            // Scalar operations for edge cases
            for (0..k_size) |k| {
                sum = @mulAdd(f32, A_local[i][k], B_local[k][j], sum);
            }
            local_C[i][j] += sum;
        }
    }
}
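// Illustrative test (added): @mulAdd on a Vec8f is a per-lane fused
// multiply-add, the primitive the 8x8 micro-kernel is built from.
test "vector fused multiply-add" {
    const a: Vec8f = @splat(2.0);
    const b = Vec8f{ 1, 2, 3, 4, 5, 6, 7, 8 };
    const c: Vec8f = @splat(1.0);
    const r = @mulAdd(Vec8f, a, b, c);
    try std.testing.expectEqual(@as(f32, 5.0), r[1]); // 2 * 2 + 1
}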
/// ------ BENCHMARKING AND PERFORMANCE ANALYSIS ------ ///
const CpuInfo = struct {
    p_freq: ?u64 = null,
    e_freq: ?u64 = null,
    p_cores: ?u32 = null,
    e_cores: ?u32 = null,
    simd_width: u32,
    total_cores: u32,
    supports_peak_calc: bool,
};
fn getSIMDWidth() u32 {
    if (std.Target.x86.featureSetHas(builtin.cpu.features, .avx512f)) {
        return 16;
    } else if (std.Target.x86.featureSetHas(builtin.cpu.features, .avx2)) {
        return 8;
    } else if (std.Target.x86.featureSetHas(builtin.cpu.features, .sse2)) {
        return 4;
    } else {
        return 1;
    }
}
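// Note (added): featureSetHas inspects the compile-time target's features,
// so build with -mcpu=native for this to reflect the host CPU.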
fn getCpuFrequencies() !CpuInfo {
    var info = CpuInfo{
        .simd_width = getSIMDWidth(),
        .total_cores = @intCast(try std.Thread.getCpuCount()),
        .supports_peak_calc = false,
    };
    switch (builtin.os.tag) {
        .linux => {
            // Try reading hybrid architecture core mapping
            if (std.fs.openFileAbsolute("/sys/devices/cpu_core/cpus", .{})) |p_core_file| {
                defer p_core_file.close();
                var buf: [32]u8 = undefined;
                if (p_core_file.reader().readAll(&buf)) |bytes_read| {
                    const p_cores_str = buf[0..bytes_read];
                    // Count cores from range (e.g. "0-7" means 8 threads)
                    if (std.mem.indexOf(u8, p_cores_str, "-")) |dash_idx| {
                        const end_thread = try std.fmt.parseInt(u32, std.mem.trim(u8, p_cores_str[dash_idx + 1 ..], " \n\r"), 10);
                        const start_thread = try std.fmt.parseInt(u32, p_cores_str[0..dash_idx], 10);
                        info.p_cores = @divFloor(end_thread - start_thread + 1, 2); // Divide by 2 for hyperthreading
                    }
                } else |_| {}
            } else |_| {}
            if (std.fs.openFileAbsolute("/sys/devices/cpu_atom/cpus", .{})) |e_core_file| {
                defer e_core_file.close();
                var buf: [32]u8 = undefined;
                if (e_core_file.reader().readAll(&buf)) |bytes_read| {
                    const e_cores_str = buf[0..bytes_read];
                    // Count cores from range (e.g. "8-15" means 8 threads)
                    if (std.mem.indexOf(u8, e_cores_str, "-")) |dash_idx| {
                        const end_thread = try std.fmt.parseInt(u32, std.mem.trim(u8, e_cores_str[dash_idx + 1 ..], " \n\r"), 10);
                        const start_thread = try std.fmt.parseInt(u32, e_cores_str[0..dash_idx], 10);
                        info.e_cores = end_thread - start_thread + 1; // E-cores don't have hyperthreading
                    }
                } else |_| {}
            } else |_| {}
            // Try reading frequencies for P-cores
            if (info.p_cores != null) {
                if (std.fs.openFileAbsolute("/sys/devices/cpu_core/frequency", .{})) |freq_file| {
                    defer freq_file.close();
                    var buf: [32]u8 = undefined;
                    if (freq_file.reader().readAll(&buf)) |bytes_read| {
                        if (std.fmt.parseInt(u64, std.mem.trim(u8, buf[0..bytes_read], " \n\r"), 10)) |freq| {
                            info.p_freq = freq * 1000; // Convert kHz to Hz
                        } else |_| {
                            info.p_freq = 4_500_000_000; // Default 4.5 GHz for P-cores
                        }
                    } else |_| {}
                } else |_| {
                    info.p_freq = 4_500_000_000; // Default 4.5 GHz for P-cores
                }
            }
            // Try reading frequencies for E-cores
            if (info.e_cores != null) {
                if (std.fs.openFileAbsolute("/sys/devices/cpu_atom/frequency", .{})) |freq_file| {
                    defer freq_file.close();
                    var buf: [32]u8 = undefined;
                    if (freq_file.reader().readAll(&buf)) |bytes_read| {
                        if (std.fmt.parseInt(u64, std.mem.trim(u8, buf[0..bytes_read], " \n\r"), 10)) |freq| {
                            info.e_freq = freq * 1000; // Convert kHz to Hz
                        } else |_| {
                            info.e_freq = 3_500_000_000; // Default 3.5 GHz for E-cores
                        }
                    } else |_| {}
                } else |_| {
                    info.e_freq = 3_500_000_000; // Default 3.5 GHz for E-cores
                }
            }
            // If we detected any cores, we can do peak calculation
            info.supports_peak_calc = (info.p_cores != null and info.p_freq != null) or
                (info.e_cores != null and info.e_freq != null);
            // If we couldn't detect anything, fall back to treating all cores as P-cores
            if (!info.supports_peak_calc) {
                info.p_cores = info.total_cores;
                info.e_cores = 0;
                info.p_freq = 3_500_000_000; // Conservative 3.5 GHz default
                info.supports_peak_calc = true;
                std.debug.print("Warning: Could not detect hybrid architecture. Assuming all P-cores at 3.5 GHz\n", .{});
            }
        },
        .windows, .macos => {
            info.p_cores = info.total_cores;
            info.e_cores = 0;
            info.supports_peak_calc = false;
        },
        else => {
            @compileError("Unsupported operating system");
        },
    }
    return info;
}
fn calculateTheoreticalPeak(info: CpuInfo) ?f64 {
    if (!info.supports_peak_calc) {
        return null;
    }
    const p_core_gflops = if (info.p_freq != null and info.p_cores != null)
        (@as(f64, @floatFromInt(info.p_freq.?)) / 1e9) *
            @as(f64, @floatFromInt(info.p_cores.? * info.simd_width * 2))
    else
        0;
    const e_core_gflops = if (info.e_freq != null and info.e_cores != null)
        (@as(f64, @floatFromInt(info.e_freq.?)) / 1e9) *
            @as(f64, @floatFromInt(info.e_cores.? * info.simd_width * 2))
    else
        0;
    return p_core_gflops + e_core_gflops;
}
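// Worked example (illustrative numbers, not a measurement): 8 P-cores at
// 4.5 GHz with 8-wide FMA issue 8 * 8 * 2 = 128 FLOPs per cycle in total,
// giving 4.5 * 128 = 576 GFLOPS of theoretical peak.
test "theoretical peak formula" {
    const info = CpuInfo{
        .p_freq = 4_500_000_000,
        .p_cores = 8,
        .simd_width = 8,
        .total_cores = 8,
        .supports_peak_calc = true,
    };
    try std.testing.expectApproxEqAbs(@as(f64, 576.0), calculateTheoreticalPeak(info).?, 1e-9);
}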
pub fn calculatePreciseGflops(allocator: std.mem.Allocator, M: usize, N: usize, K: usize, iterations: usize) !void {
    const A = try allocator.alloc(f32, M * K);
    defer allocator.free(A);
    const B = try allocator.alloc(f32, K * N);
    defer allocator.free(B);
    const C = try allocator.alloc(f32, M * N);
    defer allocator.free(C);
    // Initialize matrices
    var rng = std.rand.DefaultPrng.init(@intCast(std.time.timestamp()));
    const random = rng.random();
    for (A) |*val| {
        val.* = random.float(f32) * 2.0 - 1.0;
    }
    for (B) |*val| {
        val.* = random.float(f32) * 2.0 - 1.0;
    }
    @memset(C, 0);
    // Warmup run
    try tiledMatMul(allocator, A, B, C, M, N, K);
    // Track performance metrics
    const ops_per_element = 2 * K - 1;
    const total_elements = M * N;
    const flops_per_iteration = total_elements * ops_per_element;
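    // Example (added note): each output element takes K multiplies and K - 1
    // adds, so for M = N = K = 2048 one iteration performs
    // (2 * 2048 - 1) * 2048^2 ≈ 1.72e10 FLOPs, i.e. roughly 17.2 GFLOP.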
    var metrics = try allocator.alloc(struct {
        gflops: f64,
        time_ms: f64,
    }, iterations);
    defer allocator.free(metrics);
    // Main measurement loop
    var timer = try std.time.Timer.start();
    const total_start = timer.read();
    for (0..iterations) |i| {
        @memset(C, 0);
        const iter_start = timer.read();
        try tiledMatMul(allocator, A, B, C, M, N, K);
        const iter_end = timer.read();
        const iter_time_ns = iter_end - iter_start;
        const iter_time_ms = @as(f64, @floatFromInt(iter_time_ns)) / 1e6;
        const iter_gflops = @as(f64, @floatFromInt(flops_per_iteration)) / (@as(f64, @floatFromInt(iter_time_ns)) / 1e9) / 1e9;
        metrics[i] = .{
            .gflops = iter_gflops,
            .time_ms = iter_time_ms,
        };
    }
    // Calculate statistics
    const total_time_ns = timer.read() - total_start;
    const total_time_sec = @as(f64, @floatFromInt(total_time_ns)) / 1e9;
    var min_gflops: f64 = metrics[0].gflops;
    var max_gflops: f64 = metrics[0].gflops;
    var total_gflops: f64 = 0;
    var first_quarter_avg: f64 = 0;
    var last_quarter_avg: f64 = 0;
    const quarter_size = iterations / 4;
    for (metrics, 0..) |m, i| {
        min_gflops = @min(min_gflops, m.gflops);
        max_gflops = @max(max_gflops, m.gflops);
        total_gflops += m.gflops;
        if (i < quarter_size) {
            first_quarter_avg += m.gflops;
        } else if (i >= iterations - quarter_size) {
            last_quarter_avg += m.gflops;
        }
    }
    first_quarter_avg /= @as(f64, @floatFromInt(quarter_size));
    last_quarter_avg /= @as(f64, @floatFromInt(quarter_size));
    const avg_gflops = total_gflops / @as(f64, @floatFromInt(iterations));
    const perf_change_pct = ((last_quarter_avg - first_quarter_avg) / first_quarter_avg) * 100.0;
    // Get CPU info and theoretical peak
    const cpu_info = try getCpuFrequencies();
    // Print detailed performance report
    std.debug.print("\nPerformance Analysis for {d}x{d}x{d} Matrix:\n", .{ M, N, K });
    std.debug.print("----------------------------------------\n", .{});
    std.debug.print("Peak Performance: {d:.2} GFLOPS\n", .{max_gflops});
    std.debug.print("Lowest Performance: {d:.2} GFLOPS\n", .{min_gflops});
    std.debug.print("Average Performance: {d:.2} GFLOPS\n", .{avg_gflops});
    std.debug.print("First Quarter Avg: {d:.2} GFLOPS\n", .{first_quarter_avg});
    std.debug.print("Last Quarter Avg: {d:.2} GFLOPS\n", .{last_quarter_avg});
    if (perf_change_pct >= 0) {
        std.debug.print("Performance Change: {d:.1}% (improvement)\n", .{perf_change_pct});
    } else {
        std.debug.print("Performance Change: {d:.1}% (degradation)\n", .{perf_change_pct});
    }
    std.debug.print("Total Runtime: {d:.3} seconds\n", .{total_time_sec});
    // CPU information and theoretical peak
    std.debug.print("\nCPU Configuration:\n", .{});
    std.debug.print("Total Cores: {d}\n", .{cpu_info.total_cores});
    if (cpu_info.p_cores != null) {
        std.debug.print("P-Cores: {d}\n", .{cpu_info.p_cores.?});
    }
    if (cpu_info.e_cores != null) {
        std.debug.print("E-Cores: {d}\n", .{cpu_info.e_cores.?});
    }
    std.debug.print("SIMD Width: {d} floats\n", .{cpu_info.simd_width});
    if (cpu_info.supports_peak_calc) {
        if (cpu_info.p_freq != null) {
            std.debug.print("P-Core Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.p_freq.?)) / 1e9});
        }
        if (cpu_info.e_freq != null) {
            std.debug.print("E-Core Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.e_freq.?)) / 1e9});
        }
        if (calculateTheoreticalPeak(cpu_info)) |peak| {
            std.debug.print("\nTheoretical Peak: {d:.2} GFLOPS\n", .{peak});
            if (peak > 0) {
                std.debug.print("Peak Efficiency: {d:.1}% of theoretical\n", .{(max_gflops / peak) * 100.0});
            }
        }
    } else {
        std.debug.print("\nNote: Theoretical peak calculation not supported on this OS\n", .{});
    }
    // Print performance distribution
    std.debug.print("\nPerformance Distribution:\n", .{});
    const distribution_width = 60;
    const gflops_range = max_gflops - min_gflops;
    for (metrics) |m| {
        // Guard against a zero range when every iteration performs identically
        const normalized = if (gflops_range > 0) (m.gflops - min_gflops) / gflops_range else 0.5;
        const stars = @as(usize, @intFromFloat(normalized * @as(f64, @floatFromInt(distribution_width))));
        std.debug.print("{d:.2} GFLOPS |", .{m.gflops});
        for (0..stars) |_| {
            std.debug.print("*", .{});
        }
        std.debug.print("\n", .{});
    }
}
pub fn main() !void {
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();
    const allocator = arena.allocator();
    const sizes = [_][3]usize{
        // Uncomment additional sizes as needed
        // .{ 256, 256, 256 },
        // .{ 512, 512, 512 },
        // .{ 1024, 1024, 1024 },
        // .{ 1024, 2048, 1024 },
        .{ 2048, 2048, 2048 },
    };
    const iterations = 100;
    // Print system configuration
    const cpu_info = try getCpuFrequencies();
    std.debug.print("\nSystem Configuration:\n", .{});
    std.debug.print("----------------------------------------\n", .{});
    std.debug.print("Tile size (T): {}\n", .{T});
    std.debug.print("Vector size (V): {}\n", .{V});
    std.debug.print("CPU Cores: {}\n", .{cpu_info.total_cores});
    if (cpu_info.supports_peak_calc) {
        if (cpu_info.p_freq != null) {
            std.debug.print("P-Core Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.p_freq.?)) / 1e9});
        }
        if (cpu_info.e_freq != null) {
            std.debug.print("E-Core Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.e_freq.?)) / 1e9});
        }
    }
    std.debug.print("AVX2 Vector Width: {}\n", .{@sizeOf(Vec8f) / @sizeOf(f32)});
    std.debug.print("Iterations per test: {}\n", .{iterations});
    std.debug.print("----------------------------------------\n\n", .{});
    // Run benchmarks with increasing matrix sizes
    for (sizes) |size| {
        const M = size[0];
        const N = size[1];
        const K = size[2];
        std.debug.print("\n========================================\n", .{});
        std.debug.print("Running benchmark for {d}x{d}x{d} matrices\n", .{ M, N, K });
        std.debug.print("========================================\n", .{});
        try calculatePreciseGflops(allocator, M, N, K, iterations);
        // Add a small delay between tests to let the CPU cool down
        std.time.sleep(1 * std.time.ns_per_s);
    }
    // Print summary
    std.debug.print("\n========================================\n", .{});
    std.debug.print("Benchmark Summary:\n", .{});
    std.debug.print("----------------------------------------\n", .{});
    std.debug.print("Total matrix sizes tested: {d}\n", .{sizes.len});
    std.debug.print("Total iterations per size: {d}\n", .{iterations});
    std.debug.print("AVX2 SIMD utilized: Yes\n", .{});
    std.debug.print("Tiled algorithm: Yes ({}x{})\n", .{ T, T });
    std.debug.print("========================================\n\n", .{});
}
// Additional test for edge cases
test "tiledMatMul edge cases" {
    const allocator = std.testing.allocator;
    // Test with sizes that are not multiples of tile size
    const edge_sizes = [_][3]usize{
        .{ T + 1, T - 1, T + 2 }, // Around tile size
        .{ T * 2 - 1, T * 2 + 1, T * 2 }, // Around double tile size
        .{ 1, T, T }, // Single row
        .{ T, 1, T }, // Single column
        .{ T, T, 1 }, // Single depth
    };
    for (edge_sizes) |size| {
        const M = size[0];
        const N = size[1];
        const K = size[2];
        const A = try allocator.alloc(f32, M * K);
        defer allocator.free(A);
        const B = try allocator.alloc(f32, K * N);
        defer allocator.free(B);
        const C = try allocator.alloc(f32, M * N);
        defer allocator.free(C);
        const C_ref = try allocator.alloc(f32, M * N);
        defer allocator.free(C_ref);
        // Initialize matrices
        for (A, 0..) |*val, i| {
            val.* = @floatFromInt(i % 7);
        }
        for (B, 0..) |*val, i| {
            val.* = @floatFromInt((i + 1) % 5);
        }
        @memset(C, 0);
        @memset(C_ref, 0);
        // Perform multiplications
        try tiledMatMul(allocator, A, B, C, M, N, K);
        // Reference calculation
        for (0..M) |i| {
            for (0..N) |j| {
                var sum: f32 = 0;
                for (0..K) |k| {
                    sum += A[i * K + k] * B[k * N + j];
                }
                C_ref[i * N + j] = sum;
            }
        }
        // Compare results with higher tolerance for edge cases
        for (C, C_ref) |c, c_ref| {
            try std.testing.expectApproxEqAbs(c, c_ref, 1e-4);
        }
        std.debug.print("Edge case test passed for size: M={}, N={}, K={}\n", .{ M, N, K });
    }
}
// Example usage
test "matrix multiplication" {
    const allocator = std.testing.allocator;
    const M: usize = 1024;
    const N: usize = 1024;
    const K: usize = 1024;
    const A = try allocator.alloc(f32, M * K);
    defer allocator.free(A);
    const B = try allocator.alloc(f32, K * N);
    defer allocator.free(B);
    const C = try allocator.alloc(f32, M * N);
    defer allocator.free(C);
    // Initialize matrices
    for (0..M * K) |i| {
        A[i] = @floatFromInt(i % 7);
    }
    for (0..K * N) |i| {
        B[i] = @floatFromInt(i % 5);
    }
    @memset(C, 0);
    try tiledMatMul(allocator, A, B, C, M, N, K);
}
// Test function
test "tiledMatMul correctness" {
    const allocator = std.testing.allocator;
    const test_sizes = [_][3]usize{
        .{ 128, 128, 128 },
        .{ 100, 100, 100 },
        .{ 200, 150, 175 },
        .{ 32, 64, 48 },
        .{ 47, 34, 45 },
    };
    for (test_sizes) |size| {
        const M = size[0];
        const N = size[1];
        const K = size[2];
        const A = try allocator.alloc(f32, M * K);
        defer allocator.free(A);
        const B = try allocator.alloc(f32, K * N);
        defer allocator.free(B);
        const C = try allocator.alloc(f32, M * N);
        defer allocator.free(C);
        const C_ref = try allocator.alloc(f32, M * N);
        defer allocator.free(C_ref);
        // Initialize matrices
        for (A, 0..) |*val, i| {
            val.* = @floatFromInt(i % 10);
        }
        for (B, 0..) |*val, i| {
            val.* = @floatFromInt((i + 1) % 10);
        }
        @memset(C, 0);
        @memset(C_ref, 0);
        // Perform tiled matrix multiplication
        try tiledMatMul(allocator, A, B, C, M, N, K);
        // Perform reference matrix multiplication
        for (0..M) |i| {
            for (0..N) |j| {
                var sum: f32 = 0;
                for (0..K) |k| {
                    sum += A[i * K + k] * B[k * N + j];
                }
                C_ref[i * N + j] = sum;
            }
        }
        // Compare results
        for (C, C_ref) |c, c_ref| {
            try std.testing.expectApproxEqAbs(c, c_ref, 1e-5);
        }
        std.debug.print("Test passed for size: M={}, N={}, K={}\n", .{ M, N, K });
    }
}
////////////////////////////////////////////////////////////////////////////////
// Second file: grid-search variant of the same kernel. Tile size (T) and
// vector size (V) are runtime parameters here, so the search code below can
// benchmark many (T, V) configurations without recompiling.
////////////////////////////////////////////////////////////////////////////////
const std = @import("std");
const builtin = @import("builtin");
const atomic = std.atomic;
const CACHE_LINE_SIZE: usize = atomic.cache_line;
const CHUNK_SIZE: usize = 1;
const AVX2_ALIGNMENT = 32;
const Vec8f = @Vector(8, f32);
const ThreadLocalData = struct {
    current_index: atomic.Value(usize) align(CACHE_LINE_SIZE),
    _padding: [CACHE_LINE_SIZE - @sizeOf(atomic.Value(usize))]u8 = undefined,
};
pub fn tiledMatMul(
    allocator: std.mem.Allocator,
    A: []const f32,
    B: []const f32,
    C: []f32,
    M: usize,
    N: usize,
    K: usize,
    T: usize,
    V: usize,
) !void {
    const tiles_M = (M + T - 1) / T;
    const tiles_N = (N + T - 1) / T;
    const total_tiles = tiles_M * tiles_N;
    var shared_data = ThreadLocalData{ .current_index = atomic.Value(usize).init(0) };
    const num_threads = try std.Thread.getCpuCount();
    // Allocate thread pool
    var thread_pool = try std.ArrayList(std.Thread).initCapacity(allocator, num_threads);
    defer thread_pool.deinit();
    // Allocate local C buffers for each thread
    const buffer_size = T * T;
    const local_C_buffers = try allocator.alloc(f32, buffer_size * num_threads);
    defer allocator.free(local_C_buffers);
    // Create and spawn threads
    for (0..num_threads) |thread_id| {
        const thread_buffer = local_C_buffers[thread_id * buffer_size .. (thread_id + 1) * buffer_size];
        const context = ThreadContext{
            .A = A,
            .B = B,
            .C = C,
            .M = M,
            .N = N,
            .K = K,
            .T = T,
            .V = V,
            .tiles_M = tiles_M,
            .tiles_N = tiles_N,
            .total_tiles = total_tiles,
            .shared_counter = &shared_data,
            .local_C = thread_buffer,
        };
        try thread_pool.append(try std.Thread.spawn(.{}, workerThread, .{context}));
    }
    // Wait for all threads to complete
    for (thread_pool.items) |thread| thread.join();
}
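// Smoke test (added sketch): exercises the runtime T/V parameters with a
// small, arbitrary configuration (T = 32, V = 8).
test "runtime-parameterized matmul smoke test" {
    const allocator = std.testing.allocator;
    const n: usize = 64;
    const A = try allocator.alloc(f32, n * n);
    defer allocator.free(A);
    const B = try allocator.alloc(f32, n * n);
    defer allocator.free(B);
    const C = try allocator.alloc(f32, n * n);
    defer allocator.free(C);
    for (A, 0..) |*val, i| val.* = @floatFromInt(i % 3);
    for (B, 0..) |*val, i| val.* = @floatFromInt(i % 5);
    @memset(C, 0);
    try tiledMatMul(allocator, A, B, C, n, n, n, 32, 8);
}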
const ThreadContext = struct {
    A: []const f32,
    B: []const f32,
    C: []f32,
    M: usize,
    N: usize,
    K: usize,
    T: usize,
    V: usize,
    tiles_M: usize,
    tiles_N: usize,
    total_tiles: usize,
    shared_counter: *ThreadLocalData,
    local_C: []f32,
};
fn workerThread(ctx: ThreadContext) void {
    const max_tile_size = 512; // Maximum supported tile size
    var A_local: [max_tile_size * max_tile_size]f32 align(32) = undefined;
    var B_local: [max_tile_size * max_tile_size]f32 align(32) = undefined;
    if (ctx.T > max_tile_size) {
        @panic("Tile size exceeds maximum supported size");
    }
    while (true) {
        const start_idx = ctx.shared_counter.current_index.fetchAdd(CHUNK_SIZE, .seq_cst);
        if (start_idx >= ctx.total_tiles) break;
        const end_idx = @min(start_idx + CHUNK_SIZE, ctx.total_tiles);
        var idx = start_idx;
        while (idx < end_idx) : (idx += 1) {
            const i = idx / ctx.tiles_N;
            const j = idx % ctx.tiles_N;
            const i_start = i * ctx.T;
            const j_start = j * ctx.T;
            const i_end = @min(i_start + ctx.T, ctx.M);
            const j_end = @min(j_start + ctx.T, ctx.N);
            // Reset local_C
            @memset(ctx.local_C, 0);
            var k: usize = 0;
            while (k < ctx.K) : (k += ctx.T) {
                const k_end = @min(k + ctx.T, ctx.K);
                microKernelAVX2(ctx, &A_local, &B_local, i_start, j_start, k, i_end, j_end, k_end);
            }
            // Update global C
            for (i_start..i_end) |ii| {
                const row_offset = ii * ctx.N;
                const local_row = (ii - i_start) * ctx.T;
                var j_idx = j_start;
                while (j_idx + ctx.V <= j_end) : (j_idx += ctx.V) {
                    if (ctx.V == 8) {
                        const vec_idx = j_idx - j_start;
                        const c_vec = Vec8f{
                            ctx.local_C[local_row + vec_idx],
                            ctx.local_C[local_row + vec_idx + 1],
                            ctx.local_C[local_row + vec_idx + 2],
                            ctx.local_C[local_row + vec_idx + 3],
                            ctx.local_C[local_row + vec_idx + 4],
                            ctx.local_C[local_row + vec_idx + 5],
                            ctx.local_C[local_row + vec_idx + 6],
                            ctx.local_C[local_row + vec_idx + 7],
                        };
                        const dest_vec = Vec8f{
                            ctx.C[row_offset + j_idx],
                            ctx.C[row_offset + j_idx + 1],
                            ctx.C[row_offset + j_idx + 2],
                            ctx.C[row_offset + j_idx + 3],
                            ctx.C[row_offset + j_idx + 4],
                            ctx.C[row_offset + j_idx + 5],
                            ctx.C[row_offset + j_idx + 6],
                            ctx.C[row_offset + j_idx + 7],
                        };
                        const result = dest_vec + c_vec;
                        for (0..8) |offset| {
                            ctx.C[row_offset + j_idx + offset] = result[offset];
                        }
                    } else {
                        // Handle other vector sizes
                        for (0..ctx.V) |offset| {
                            ctx.C[row_offset + j_idx + offset] +=
                                ctx.local_C[local_row + (j_idx - j_start) + offset];
                        }
                    }
                }
                // Handle remaining elements
                while (j_idx < j_end) : (j_idx += 1) {
                    ctx.C[row_offset + j_idx] += ctx.local_C[local_row + (j_idx - j_start)];
                }
            }
        }
    }
}
fn microKernelAVX2(
    ctx: ThreadContext,
    A_local: []f32,
    B_local: []f32,
    i_start: usize,
    j_start: usize,
    k_start: usize,
    i_end: usize,
    j_end: usize,
    k_end: usize,
) void {
    const k_size = k_end - k_start;
    const i_size = i_end - i_start;
    const j_size = j_end - j_start;
    // Load A and B tiles
    for (0..i_size) |i| {
        const src_idx = (i_start + i) * ctx.K + k_start;
        const local_idx = i * ctx.T;
        for (0..k_size) |k| {
            A_local[local_idx + k] = ctx.A[src_idx + k];
        }
    }
    for (0..k_size) |k| {
        const src_idx = (k_start + k) * ctx.N + j_start;
        const local_idx = k * ctx.T;
        for (0..j_size) |j| {
            B_local[local_idx + j] = ctx.B[src_idx + j];
        }
    }
    var i: usize = 0;
    while (i + ctx.V <= i_size) : (i += ctx.V) {
        var j: usize = 0;
        while (j + ctx.V <= j_size) : (j += ctx.V) {
            if (ctx.V == 8) {
                var acc: [8][8]f32 align(32) = [_][8]f32{[_]f32{0} ** 8} ** 8;
                for (0..k_size) |k| {
                    const a_vec = Vec8f{
                        A_local[i * ctx.T + k],
                        A_local[(i + 1) * ctx.T + k],
                        A_local[(i + 2) * ctx.T + k],
                        A_local[(i + 3) * ctx.T + k],
                        A_local[(i + 4) * ctx.T + k],
                        A_local[(i + 5) * ctx.T + k],
                        A_local[(i + 6) * ctx.T + k],
                        A_local[(i + 7) * ctx.T + k],
                    };
                    const b_vec = Vec8f{
                        B_local[k * ctx.T + j],
                        B_local[k * ctx.T + j + 1],
                        B_local[k * ctx.T + j + 2],
                        B_local[k * ctx.T + j + 3],
                        B_local[k * ctx.T + j + 4],
                        B_local[k * ctx.T + j + 5],
                        B_local[k * ctx.T + j + 6],
                        B_local[k * ctx.T + j + 7],
                    };
                    inline for (0..8) |bi| {
                        const a_broadcast: Vec8f = @splat(a_vec[bi]);
                        const c_vec = Vec8f{
                            acc[bi][0], acc[bi][1], acc[bi][2], acc[bi][3],
                            acc[bi][4], acc[bi][5], acc[bi][6], acc[bi][7],
                        };
                        const prod = @mulAdd(Vec8f, a_broadcast, b_vec, c_vec);
                        inline for (0..8) |bj| {
                            acc[bi][bj] = prod[bj];
                        }
                    }
                }
                // Store results
                for (0..8) |bi| {
                    for (0..8) |bj| {
                        const local_idx = (i + bi) * ctx.T + (j + bj);
                        ctx.local_C[local_idx] += acc[bi][bj];
                    }
                }
            } else {
                // Handle other vector sizes with scalar operations
                for (0..ctx.V) |bi| {
                    for (0..ctx.V) |bj| {
                        var sum: f32 = 0;
                        for (0..k_size) |k| {
                            sum = @mulAdd(f32, A_local[(i + bi) * ctx.T + k], B_local[k * ctx.T + j + bj], sum);
                        }
                        ctx.local_C[(i + bi) * ctx.T + j + bj] += sum;
                    }
                }
            }
        }
        // Handle remaining columns
        while (j < j_size) : (j += 1) {
            for (0..ctx.V) |bi| {
                var sum: f32 = 0;
                for (0..k_size) |k| {
                    sum = @mulAdd(f32, A_local[(i + bi) * ctx.T + k], B_local[k * ctx.T + j], sum);
                }
                ctx.local_C[(i + bi) * ctx.T + j] += sum;
            }
        }
    }
    // Handle remaining rows
    while (i < i_size) : (i += 1) {
        for (0..j_size) |j| {
            var sum: f32 = 0;
            for (0..k_size) |k| {
                sum = @mulAdd(f32, A_local[i * ctx.T + k], B_local[k * ctx.T + j], sum);
            }
            ctx.local_C[i * ctx.T + j] += sum;
        }
    }
}
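// Indexing note (added): with a runtime T the local tiles are flat slices,
// so element (i, k) lives at A_local[i * ctx.T + k] rather than A_local[i][k].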
const time = std.time;
const testing = std.testing;
pub const SearchResult = struct {
    tile_size: usize,
    vector_size: usize,
    gflops: f64,
};
const SearchRange = struct {
    t_min: usize,
    t_max: usize,
    t_step: usize,
    v_sizes: []const usize,
};
pub fn adaptiveGridSearch(
    allocator: std.mem.Allocator,
    M: usize,
    N: usize,
    K: usize,
    initial_range: SearchRange,
    refinement_levels: usize,
) !SearchResult {
    var best_result = SearchResult{
        .tile_size = 0,
        .vector_size = 0,
        .gflops = 0,
    };
    var current_range = initial_range;
    for (0..refinement_levels) |level| {
        std.debug.print("\nLevel {d} Search (T: {d}-{d}, step: {d})\n", .{ level, current_range.t_min, current_range.t_max, current_range.t_step });
        std.debug.print("----------------------------------------\n", .{});
        // Search current range
        var results = std.ArrayList(SearchResult).init(allocator);
        defer results.deinit();
        var t = current_range.t_min;
        while (t <= current_range.t_max) : (t += current_range.t_step) {
            // Skip invalid tile sizes
            if (t > M or t > N or t > K) continue;
            for (current_range.v_sizes) |v| {
                if (t % v != 0) continue; // Skip if tile size is not divisible by vector size
                // Run multiple trials and keep the best
                var best_trial = try benchmarkMatMul(allocator, t, v, M, N, K);
                for (0..2) |_| {
                    const trial = try benchmarkMatMul(allocator, t, v, M, N, K);
                    if (trial.gflops > best_trial.gflops) {
                        best_trial.gflops = trial.gflops;
                    }
                }
                try results.append(.{
                    .tile_size = t,
                    .vector_size = v,
                    .gflops = best_trial.gflops,
                });
                std.debug.print("T={d:<4} V={d:<3}: {d:.2} GFLOPS\n", .{ t, v, best_trial.gflops });
            }
        }
        // Find top performers
        const top_k = 3; // Number of top results to consider
        const top_results = try allocator.alloc(SearchResult, @min(top_k, results.items.len));
        defer allocator.free(top_results);
        // Sort results by GFLOPS (descending)
        std.sort.pdq(SearchResult, results.items, {}, struct {
            fn lessThan(_: void, a: SearchResult, b: SearchResult) bool {
                return a.gflops > b.gflops;
            }
        }.lessThan);
        // Copy top results
        @memcpy(top_results, results.items[0..top_results.len]);
        // Update best overall result
        if (results.items[0].gflops > best_result.gflops) {
            best_result = results.items[0];
        }
        // Print top performers for this level
        std.debug.print("\nTop performers at level {d}:\n", .{level});
        for (top_results, 0..) |result, i| {
            std.debug.print("{d}. T={d:<4} V={d:<3}: {d:.2} GFLOPS\n", .{ i + 1, result.tile_size, result.vector_size, result.gflops });
        }
        // Refine search range around best result
        if (level + 1 < refinement_levels) {
            const best_T = top_results[0].tile_size;
            const step = current_range.t_step / 2;
            current_range = SearchRange{
                .t_min = if (best_T > step * 2) best_T - step * 2 else current_range.t_min,
                .t_max = @min(best_T + step * 2, current_range.t_max),
                .t_step = step,
                .v_sizes = current_range.v_sizes,
            };
        }
    }
    return best_result;
}
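// Refinement sketch (added): with t_min = 32, t_max = 512, t_step = 64,
// level 0 probes T in {32, 96, 160, ..., 480}; each later level halves the
// step and re-centers a +/- 2-step window on the best T found so far.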
pub fn treeSearch(
    allocator: std.mem.Allocator,
    M: usize,
    N: usize,
    K: usize,
) !SearchResult {
    // Initial coarse-grained search
    const initial_range = SearchRange{
        .t_min = 32,
        .t_max = 512,
        .t_step = 64,
        .v_sizes = &[_]usize{ 4, 8, 16 },
    };
    std.debug.print("Starting tree search for {d}x{d}x{d} matrix\n", .{ M, N, K });
    std.debug.print("----------------------------------------\n", .{});
    const result = try adaptiveGridSearch(allocator, M, N, K, initial_range, 4); // 4 refinement levels
    std.debug.print("\nFinal optimal configuration:\n", .{});
    std.debug.print("Tile Size (T): {d}\n", .{result.tile_size});
    std.debug.print("Vector Size (V): {d}\n", .{result.vector_size});
    std.debug.print("Performance: {d:.2} GFLOPS\n", .{result.gflops});
    return result;
}
pub fn benchmarkMatMul(
    allocator: std.mem.Allocator,
    tile_size: usize,
    vector_size: usize,
    M: usize,
    N: usize,
    K: usize,
) !SearchResult {
    const A = try allocator.alloc(f32, M * K);
    defer allocator.free(A);
    const B = try allocator.alloc(f32, K * N);
    defer allocator.free(B);
    const C = try allocator.alloc(f32, M * N);
    defer allocator.free(C);
    // Initialize matrices with random data
    var rng = std.rand.DefaultPrng.init(0);
    const random = rng.random();
    for (A) |*val| val.* = random.float(f32);
    for (B) |*val| val.* = random.float(f32);
    @memset(C, 0);
    // Warm-up run
    try tiledMatMul(allocator, A, B, C, M, N, K, tile_size, vector_size);
    // Actual benchmark
    const start_time = time.nanoTimestamp();
    try tiledMatMul(allocator, A, B, C, M, N, K, tile_size, vector_size);
    const end_time = time.nanoTimestamp();
    const elapsed_ns: u64 = @intCast(end_time - start_time);
    // Calculate GFLOPS
    const operations = @as(f64, @floatFromInt(2 * M * N * K)); // 2 operations per multiply-add
    const seconds = @as(f64, @floatFromInt(elapsed_ns)) / 1e9;
    const gflops = operations / (seconds * 1e9);
    return SearchResult{
        .tile_size = tile_size,
        .vector_size = vector_size,
        .gflops = gflops,
    };
}
const CpuInfo = struct {
    p_freq: ?u64 = null,
    e_freq: ?u64 = null,
    p_cores: ?u32 = null,
    e_cores: ?u32 = null,
    simd_width: u32,
    total_cores: u32,
    supports_peak_calc: bool,
};
fn getSIMDWidth() u32 {
    if (std.Target.x86.featureSetHas(builtin.cpu.features, .avx512f)) {
        return 16;
    } else if (std.Target.x86.featureSetHas(builtin.cpu.features, .avx2)) {
        return 8;
    } else if (std.Target.x86.featureSetHas(builtin.cpu.features, .sse2)) {
        return 4;
    } else {
        return 1;
    }
}
fn getCpuFrequencies() !CpuInfo {
    var info = CpuInfo{
        .simd_width = getSIMDWidth(),
        .total_cores = @intCast(try std.Thread.getCpuCount()),
        .supports_peak_calc = false,
    };
    switch (builtin.os.tag) {
        .linux => {
            // Try reading hybrid architecture core mapping
            if (std.fs.openFileAbsolute("/sys/devices/cpu_core/cpus", .{})) |p_core_file| {
                defer p_core_file.close();
                var buf: [32]u8 = undefined;
                if (p_core_file.reader().readAll(&buf)) |bytes_read| {
                    const p_cores_str = buf[0..bytes_read];
                    // Count cores from range (e.g. "0-7" means 8 threads)
                    if (std.mem.indexOf(u8, p_cores_str, "-")) |dash_idx| {
                        const end_thread = try std.fmt.parseInt(u32, std.mem.trim(u8, p_cores_str[dash_idx + 1 ..], " \n\r"), 10);
                        const start_thread = try std.fmt.parseInt(u32, p_cores_str[0..dash_idx], 10);
                        info.p_cores = @divFloor(end_thread - start_thread + 1, 2); // Divide by 2 for hyperthreading
                    }
                } else |_| {}
            } else |_| {}
            if (std.fs.openFileAbsolute("/sys/devices/cpu_atom/cpus", .{})) |e_core_file| {
                defer e_core_file.close();
                var buf: [32]u8 = undefined;
                if (e_core_file.reader().readAll(&buf)) |bytes_read| {
                    const e_cores_str = buf[0..bytes_read];
                    // Count cores from range (e.g. "8-15" means 8 threads)
                    if (std.mem.indexOf(u8, e_cores_str, "-")) |dash_idx| {
                        const end_thread = try std.fmt.parseInt(u32, std.mem.trim(u8, e_cores_str[dash_idx + 1 ..], " \n\r"), 10);
                        const start_thread = try std.fmt.parseInt(u32, e_cores_str[0..dash_idx], 10);
                        info.e_cores = end_thread - start_thread + 1; // E-cores don't have hyperthreading
                    }
                } else |_| {}
            } else |_| {}
            // Try reading frequencies for P-cores
            if (info.p_cores != null) {
                if (std.fs.openFileAbsolute("/sys/devices/cpu_core/frequency", .{})) |freq_file| {
                    defer freq_file.close();
                    var buf: [32]u8 = undefined;
                    if (freq_file.reader().readAll(&buf)) |bytes_read| {
                        if (std.fmt.parseInt(u64, std.mem.trim(u8, buf[0..bytes_read], " \n\r"), 10)) |freq| {
                            info.p_freq = freq * 1000; // Convert kHz to Hz
                        } else |_| {
                            info.p_freq = 4_500_000_000; // Default 4.5 GHz for P-cores
                        }
                    } else |_| {}
                } else |_| {
                    info.p_freq = 4_500_000_000; // Default 4.5 GHz for P-cores
                }
            }
            // Try reading frequencies for E-cores
            if (info.e_cores != null) {
                if (std.fs.openFileAbsolute("/sys/devices/cpu_atom/frequency", .{})) |freq_file| {
                    defer freq_file.close();
                    var buf: [32]u8 = undefined;
                    if (freq_file.reader().readAll(&buf)) |bytes_read| {
                        if (std.fmt.parseInt(u64, std.mem.trim(u8, buf[0..bytes_read], " \n\r"), 10)) |freq| {
                            info.e_freq = freq * 1000; // Convert kHz to Hz
                        } else |_| {
                            info.e_freq = 3_500_000_000; // Default 3.5 GHz for E-cores
                        }
                    } else |_| {}
                } else |_| {
                    info.e_freq = 3_500_000_000; // Default 3.5 GHz for E-cores
                }
            }
            // If we detected any cores, we can do peak calculation
            info.supports_peak_calc = (info.p_cores != null and info.p_freq != null) or
                (info.e_cores != null and info.e_freq != null);
            // If we couldn't detect anything, fall back to treating all cores as P-cores
            if (!info.supports_peak_calc) {
                info.p_cores = info.total_cores;
                info.e_cores = 0;
                info.p_freq = 3_500_000_000; // Conservative 3.5 GHz default
                info.supports_peak_calc = true;
                std.debug.print("Warning: Could not detect hybrid architecture. Assuming all P-cores at 3.5 GHz\n", .{});
            }
        },
        .windows, .macos => {
            info.p_cores = info.total_cores;
            info.e_cores = 0;
            info.supports_peak_calc = false;
        },
        else => {
            @compileError("Unsupported operating system");
        },
    }
    return info;
}
pub fn main() !void {
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();
    const allocator = arena.allocator();
    const sizes = [_][3]usize{
        // Uncomment additional sizes as needed
        .{ 1024, 1024, 1024 },
        // .{ 2048, 2048, 2048 },
        // .{ 4096, 4096, 4096 },
    };
    const iterations = 5; // Reduced for the tree-search phase; increase for a final benchmark
    // Print system configuration
    const cpu_info = try getCpuFrequencies();
    std.debug.print("\nSystem Configuration:\n", .{});
    std.debug.print("----------------------------------------\n", .{});
    std.debug.print("CPU Cores: {}\n", .{cpu_info.total_cores});
    if (cpu_info.supports_peak_calc) {
        if (cpu_info.p_freq != null) {
            std.debug.print("P-Core Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.p_freq.?)) / 1e9});
        }
        if (cpu_info.e_freq != null) {
            std.debug.print("E-Core Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.e_freq.?)) / 1e9});
        }
    }
    std.debug.print("AVX2 Vector Width: {}\n", .{@sizeOf(Vec8f) / @sizeOf(f32)});
    std.debug.print("----------------------------------------\n\n", .{});
    for (sizes) |size| {
        const M = size[0];
        const N = size[1];
        const K = size[2];
        std.debug.print("\n========================================\n", .{});
        std.debug.print("Matrix Size: {d}x{d}x{d}\n", .{ M, N, K });
        std.debug.print("========================================\n", .{});
        // Phase 1: Tree Search
        std.debug.print("\nAdaptive Parameter Search\n", .{});
        std.debug.print("----------------------------------------\n", .{});
        const best_config = try treeSearch(allocator, M, N, K);
        std.debug.print("\nBest configuration found:\n", .{});
        std.debug.print("Tile Size (T): {d}\n", .{best_config.tile_size});
        std.debug.print("Vector Size (V): {d}\n", .{best_config.vector_size});
        std.debug.print("Initial Performance: {d:.2} GFLOPS\n\n", .{best_config.gflops});
    }
    // Print summary
    std.debug.print("\n========================================\n", .{});
    std.debug.print("Benchmark Summary:\n", .{});
    std.debug.print("----------------------------------------\n", .{});
    std.debug.print("Total matrix sizes tested: {d}\n", .{sizes.len});
    std.debug.print("Tree search levels: 4\n", .{});
    std.debug.print("Detailed analysis iterations: {d}\n", .{iterations});
    std.debug.print("AVX2 SIMD utilized: Yes\n", .{});
    std.debug.print("========================================\n\n", .{});
}