Highly Performant Matrix Multiplication in Zig (and a Grid Search to Find Optimal Parameters!)
// To run tests:
// zig test -O ReleaseSafe -mcpu=native matmul.zig
// To build and run main:
// zig build-exe -O ReleaseFast -mcpu=native matmul.zig
// ./matmul
//
// ███╗ ███╗ █████╗ ████████╗███╗ ███╗██╗ ██╗██╗ ░░░░░░███████╗██╗ ██████╗
// ████╗ ████║██╔══██╗╚══██╔══╝████╗ ████║██║ ██║██║ ░░░░░░╚══███╔╝██║██╔════╝
// ██╔████╔██║███████║ ██║ ██╔████╔██║██║ ██║██║ ────── ███╔╝ ██║██║ ███╗
// ██║╚██╔╝██║██╔══██║ ██║ ██║╚██╔╝██║██║ ██║██║ ░░░░░░ ███╔╝ ██║██║ ██║
// ██║ ╚═╝ ██║██║ ██║ ██║ ██║ ╚═╝ ██║╚██████╔╝███████╗░░░░░░███████╗██║╚██████╔╝
// ╚═╝ ╚═╝╚═╝ ╚═╝ ╚═╝ ╚═╝ ╚═╝ ╚═════╝ ╚══════╝░░░░░░╚══════╝╚═╝ ╚═════╝ •
//
// High Performance Matrix Multiplication Implementation
// Optimized for Modern Intel Processors
//
////////////////////////////////////////////////////////////////////////////////
const std = @import("std");
const builtin = @import("builtin");
const atomic = std.atomic;
///////////////////////////////////////////////////////////////////////////////
// Configuration Constants
///////////////////////////////////////////////////////////////////////////////
/// Tile size for matrix blocking. Chosen to suit a 13th Gen Intel Core i7's cache hierarchy:
/// - A 160x160 f32 tile is ~100 KB, so the per-thread A/B/C tile working set (~300 KB)
///   fits comfortably in the 2 MB L2 cache
/// - The 8x8 micro-kernel blocks cut from each tile stay resident in the 48 KB L1D cache
const T: usize = 160;
/// Vector width for SIMD operations. Matches AVX2 register size (256 bits = 8 floats)
const V: usize = 8;
/// Size of CPU cache line in bytes. Used for alignment and padding.
const CACHE_LINE_SIZE: usize = atomic.cache_line;
/// Number of tiles processed per thread per atomic operation.
/// Small value (1) chosen for better load balancing between threads.
const CHUNK_SIZE: usize = 1;
/// Alignment requirement for AVX2 loads/stores: 32 bytes, matching the 256-bit
/// width of a YMM register
const AVX2_ALIGNMENT = 32;
/// Size of the micro-kernel processing block. The kernel below is hard-coded for
/// 8x8 blocks of Vec8f, so the suggested SIMD width is capped at 8 (and defaults
/// to 8 when no SIMD width is suggested)
const MICRO_KERNEL_SIZE: usize = @min(std.simd.suggestVectorLength(f32) orelse 8, 8);
/// Vector type for 8 32-bit floats, matching AVX2 register size
const Vec8f = @Vector(8, f32);
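// A quick working-set sanity check for the constants above (an estimate, assuming
// f32 tiles of T x T): local_C, A_local, and B_local are each 160 * 160 * 4 bytes
// = 102,400 B (~100 KB), so one thread's ~300 KB tile working set fits in the 2 MB
// L2 cache, while the 8x8 micro-kernel blocks live in L1 and registers.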
///////////////////////////////////////////////////////////////////////////////
// Thread Management Structures
///////////////////////////////////////////////////////////////////////////////
/// Thread-local data structure for work distribution
/// Aligned and padded to prevent false sharing between threads
const ThreadLocalData = struct {
/// Atomic counter for distributing work among threads
current_index: atomic.Value(usize) align(CACHE_LINE_SIZE),
/// Padding to ensure structure fills entire cache line
_padding: [CACHE_LINE_SIZE - @sizeOf(atomic.Value(usize))]u8 = undefined,
};
/// Context structure passed to each worker thread
const ThreadContext = struct {
/// Input matrix A (M x K)
A: []const f32,
/// Input matrix B (K x N)
B: []const f32,
/// Output matrix C (M x N)
C: []f32,
/// Number of rows in matrix A and C
M: usize,
/// Number of columns in matrix B and C
N: usize,
/// Number of columns in A / rows in B
K: usize,
/// Number of tiles in M dimension
tiles_M: usize,
/// Number of tiles in N dimension
tiles_N: usize,
/// Total number of tiles to process
total_tiles: usize,
/// Shared atomic counter for work distribution
shared_counter: *ThreadLocalData,
};
///////////////////////////////////////////////////////////////////////////////
// Main Matrix Multiplication Function
///////////////////////////////////////////////////////////////////////////////
/// Performs parallel tiled matrix multiplication: C = A * B
///
/// Algorithm overview:
/// 1. Divides matrices into tiles of size T x T
/// 2. Creates thread pool based on available CPU cores
/// 3. Distributes tiles to threads using atomic counter
/// 4. Each thread:
/// - Processes assigned tiles using micro-kernels
/// - Uses local accumulation buffers
/// - Writes results back to main memory
///
/// Parameters:
/// - allocator: Memory allocator for thread management
/// - A: Input matrix A (M x K)
/// - B: Input matrix B (K x N)
/// - C: Output matrix C (M x N)
/// - M: Number of rows in A and C
/// - N: Number of columns in B and C
/// - K: Number of columns in A / rows in B
///
/// Returns: Error if thread creation fails
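///
/// Usage sketch (hypothetical sizes; note that C is accumulated into, so zero it first):
///
///   const M = 256; const N = 64; const K = 128;
///   const A = try allocator.alloc(f32, M * K);
///   defer allocator.free(A);
///   const B = try allocator.alloc(f32, K * N);
///   defer allocator.free(B);
///   const C = try allocator.alloc(f32, M * N);
///   defer allocator.free(C);
///   // ... fill A and B ...
///   @memset(C, 0);
///   try tiledMatMul(allocator, A, B, C, M, N, K);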
pub fn tiledMatMul(allocator: std.mem.Allocator, A: []const f32, B: []const f32, C: []f32, M: usize, N: usize, K: usize) !void {
// Calculate tile grid dimensions
const tiles_M = (M + T - 1) / T; // Ceiling division for M dimension
const tiles_N = (N + T - 1) / T; // Ceiling division for N dimension
const total_tiles = tiles_M * tiles_N;
// Initialize shared atomic counter for work distribution
var shared_data = ThreadLocalData{ .current_index = atomic.Value(usize).init(0) };
// Get number of available CPU cores
const num_threads = try std.Thread.getCpuCount();
// Create thread pool
var thread_pool = try std.ArrayList(std.Thread).initCapacity(allocator, num_threads);
defer thread_pool.deinit();
// Spawn worker threads
for (0..num_threads) |_| {
const context = ThreadContext{
.A = A,
.B = B,
.C = C,
.M = M,
.N = N,
.K = K,
.tiles_M = tiles_M,
.tiles_N = tiles_N,
.total_tiles = total_tiles,
.shared_counter = &shared_data,
};
try thread_pool.append(try std.Thread.spawn(.{}, workerThread, .{context}));
}
// Wait for all threads to complete
for (thread_pool.items) |thread| thread.join();
}
///////////////////////////////////////////////////////////////////////////////
// Worker Thread Implementation
///////////////////////////////////////////////////////////////////////////////
/// Worker thread function that processes matrix tiles in parallel
///
/// Algorithm overview:
/// 1. Initializes local accumulator buffer aligned for AVX2
/// 2. Fetches work chunks using atomic counter
/// 3. For each tile:
/// - Calculates tile boundaries
/// - Processes tile using micro-kernel
/// - Accumulates results locally
/// - Writes results back to global memory
///
/// Work distribution:
/// - Uses atomic fetch-add for load balancing
/// - Processes CHUNK_SIZE tiles per fetch
/// - Continues until all tiles are processed
///
/// Memory access pattern:
/// - Uses a local accumulator to minimize global memory traffic
/// - Vectorized (8-wide) read-modify-write for result writeback
/// - Aligns data for efficient SIMD operations
///
/// Parameters:
/// - ctx: ThreadContext containing matrices and dimensions
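///
/// Worked example: for M = N = 2048 with T = 160, tiles_M = tiles_N = 13, giving
/// 169 tiles total. With CHUNK_SIZE = 1, each fetchAdd hands one tile index in
/// [0, 169) to whichever thread asks next, until the counter passes total_tiles.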
fn workerThread(ctx: ThreadContext) void {
// Initialize local accumulator buffer, aligned for AVX2 operations
var local_C: [T][T]f32 align(AVX2_ALIGNMENT) = undefined;
while (true) {
// Atomically fetch next chunk of work
const start_idx = ctx.shared_counter.current_index.fetchAdd(CHUNK_SIZE, .seq_cst);
if (start_idx >= ctx.total_tiles) break;
// Calculate end of current chunk
const end_idx = @min(start_idx + CHUNK_SIZE, ctx.total_tiles);
var idx = start_idx;
// Process all tiles in current chunk
while (idx < end_idx) : (idx += 1) {
// Convert linear index to 2D tile coordinates
const i = idx / ctx.tiles_N; // Tile row index
const j = idx % ctx.tiles_N; // Tile column index
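// Worked example: with tiles_N = 13, idx = 27 maps to tile row 2, tile column 1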
// Calculate tile boundaries in original matrix
const i_start = i * T;
const j_start = j * T;
const i_end = @min(i_start + T, ctx.M); // Handle edge tiles
const j_end = @min(j_start + T, ctx.N); // Handle edge tiles
// Clear local accumulator buffer
for (0..T) |x| {
@memset(&local_C[x], 0);
}
// Process tile using T×T blocks along K dimension
var k: usize = 0;
while (k < ctx.K) : (k += T) {
const k_end = @min(k + T, ctx.K);
microKernelAVX2(ctx, &local_C, i_start, j_start, k, i_end, j_end, k_end);
}
// Write accumulated results back to global memory (8-wide read-modify-write)
for (i_start..i_end) |ii| {
const row_offset = ii * ctx.N;
const local_row = ii - i_start;
var j_idx = j_start;
// 8-wide vector write-back for full chunks using AVX2
while (j_idx + 8 <= j_end) : (j_idx += 8) {
const vec_idx = j_idx - j_start;
// Load 8 elements into vector
const c_vec = Vec8f{
local_C[local_row][vec_idx],
local_C[local_row][vec_idx + 1],
local_C[local_row][vec_idx + 2],
local_C[local_row][vec_idx + 3],
local_C[local_row][vec_idx + 4],
local_C[local_row][vec_idx + 5],
local_C[local_row][vec_idx + 6],
local_C[local_row][vec_idx + 7],
};
// Load destination elements
const dest_vec = Vec8f{
ctx.C[row_offset + j_idx],
ctx.C[row_offset + j_idx + 1],
ctx.C[row_offset + j_idx + 2],
ctx.C[row_offset + j_idx + 3],
ctx.C[row_offset + j_idx + 4],
ctx.C[row_offset + j_idx + 5],
ctx.C[row_offset + j_idx + 6],
ctx.C[row_offset + j_idx + 7],
};
// Add vectors and write back results
const result = dest_vec + c_vec;
for (0..8) |offset| {
ctx.C[row_offset + j_idx + offset] = result[offset];
}
}
// Handle the remaining (< 8) elements with scalar adds
while (j_idx < j_end) : (j_idx += 1) {
ctx.C[row_offset + j_idx] += local_C[local_row][j_idx - j_start];
}
}
}
}
}
///////////////////////////////////////////////////////////////////////////////
// Micro-Kernel Implementation
///////////////////////////////////////////////////////////////////////////////
/// High-performance micro-kernel for matrix multiplication using AVX2 SIMD instructions
///
/// Architecture-specific optimizations:
/// - Uses AVX2 SIMD instructions for 8-wide float operations
/// - Leverages FMA (Fused Multiply-Add) instructions
/// - Implements register blocking for maximum reuse
/// - Aligns data structures to cache lines
///
/// Algorithm breakdown:
/// 1. Loads tile data into aligned local buffers
/// 2. Processes 8x8 blocks using SIMD operations
/// 3. Handles edge cases with scalar operations
/// 4. Accumulates results with maximum precision
///
/// Memory optimization:
/// - Copies A and B tiles into aligned local buffers before the hot loop
/// - Maintains cache-friendly, stride-1 access patterns
/// - Minimizes cache pollution through blocking
///
/// Parameters:
/// - ctx: Thread context with matrix data and dimensions
/// - local_C: Local accumulator buffer for results
/// - i_start: Starting row index in global matrix
/// - j_start: Starting column index in global matrix
/// - k_start: Starting index in reduction dimension
/// - i_end: Ending row index (exclusive)
/// - j_end: Ending column index (exclusive)
/// - k_end: Ending reduction index (exclusive)
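///
/// Inner-loop sketch: for each k, 8 values from a column of A_local and 8 values
/// from a row of B_local form a rank-1 (outer product) update of the 8x8
/// accumulator, i.e. acc[bi][bj] += A_local[i + bi][k] * B_local[k][j + bj].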
fn microKernelAVX2(
ctx: ThreadContext,
local_C: *[T][T]f32,
i_start: usize,
j_start: usize,
k_start: usize,
i_end: usize,
j_end: usize,
k_end: usize,
) void {
// Local tile buffers aligned for AVX2 operations
var A_local: [T][T]f32 align(32) = undefined;
var B_local: [T][T]f32 align(32) = undefined;
// Calculate tile dimensions
const k_size = k_end - k_start;
const i_size = i_end - i_start;
const j_size = j_end - j_start;
// Load the A tile into the aligned local buffer
// Stride-1 access pattern for optimal cache line utilization
for (0..i_size) |i| {
const src_idx = (i_start + i) * ctx.K + k_start;
for (0..k_size) |k| {
A_local[i][k] = ctx.A[src_idx + k];
}
}
// Load the B tile into the aligned local buffer
// Copying rows of B gives the micro-kernel contiguous, cache-friendly access
for (0..k_size) |k| {
const src_idx = (k_start + k) * ctx.N + j_start;
for (0..j_size) |j| {
B_local[k][j] = ctx.B[src_idx + j];
}
}
// Process full 8x8 blocks using AVX2 SIMD
var i: usize = 0;
while (i + MICRO_KERNEL_SIZE <= i_size) : (i += MICRO_KERNEL_SIZE) {
var j: usize = 0;
while (j + MICRO_KERNEL_SIZE <= j_size) : (j += MICRO_KERNEL_SIZE) {
// Initialize accumulator for 8x8 block
var acc: [8][8]f32 align(32) = [_][8]f32{[_]f32{0} ** 8} ** 8;
// Inner loop - dot product with SIMD
var k: usize = 0;
while (k < k_size) : (k += 1) {
// Load 8 elements from A matrix into vector
const a_vec = Vec8f{
A_local[i][k], A_local[i + 1][k],
A_local[i + 2][k], A_local[i + 3][k],
A_local[i + 4][k], A_local[i + 5][k],
A_local[i + 6][k], A_local[i + 7][k],
};
// Load 8 elements from B matrix into vector
const b_vec = Vec8f{
B_local[k][j], B_local[k][j + 1],
B_local[k][j + 2], B_local[k][j + 3],
B_local[k][j + 4], B_local[k][j + 5],
B_local[k][j + 6], B_local[k][j + 7],
};
// Vectorized outer product using FMA
inline for (0..8) |bi| {
// Broadcast single A element to all lanes
const a_broadcast: Vec8f = @splat(a_vec[bi]);
// Load current accumulator values
const c_vec = Vec8f{
acc[bi][0], acc[bi][1], acc[bi][2], acc[bi][3],
acc[bi][4], acc[bi][5], acc[bi][6], acc[bi][7],
};
// Fused multiply-add: acc += a * b
const prod = @mulAdd(Vec8f, a_broadcast, b_vec, c_vec);
// Store results back to accumulator
inline for (0..8) |bj| {
acc[bi][bj] = prod[bj];
}
}
}
// Accumulate 8x8 block results to main buffer
for (0..8) |bi| {
for (0..8) |bj| {
local_C[i + bi][j + bj] += acc[bi][bj];
}
}
}
// Handle edge case columns (< 8 columns remaining)
while (j < j_size) : (j += 1) {
for (0..8) |bi| {
var sum: f32 = 0;
// Scalar operations for edge cases
for (0..k_size) |k| {
sum = @mulAdd(f32, A_local[i + bi][k], B_local[k][j], sum);
}
local_C[i + bi][j] += sum;
}
}
}
// Handle edge case rows (< 8 rows remaining)
while (i < i_size) : (i += 1) {
for (0..j_size) |j| {
var sum: f32 = 0;
// Scalar operations for edge cases
for (0..k_size) |k| {
sum = @mulAdd(f32, A_local[i][k], B_local[k][j], sum);
}
local_C[i][j] += sum;
}
}
}
///////////////////////////////////////////////////////////////////////////////
// Benchmarking and Performance Analysis
///////////////////////////////////////////////////////////////////////////////
const CpuInfo = struct {
p_freq: ?u64 = null,
e_freq: ?u64 = null,
p_cores: ?u32 = null,
e_cores: ?u32 = null,
simd_width: u32,
total_cores: u32,
supports_peak_calc: bool,
};
fn getSIMDWidth() u32 {
if (std.Target.x86.featureSetHas(builtin.cpu.features, .avx512f)) {
return 16;
} else if (std.Target.x86.featureSetHas(builtin.cpu.features, .avx2)) {
return 8;
} else if (std.Target.x86.featureSetHas(builtin.cpu.features, .sse2)) {
return 4;
} else {
return 1;
}
}
fn getCpuFrequencies() !CpuInfo {
var info = CpuInfo{
.simd_width = getSIMDWidth(),
.total_cores = @intCast(try std.Thread.getCpuCount()),
.supports_peak_calc = false,
};
switch (builtin.os.tag) {
.linux => {
// Try reading hybrid architecture core mapping
if (std.fs.openFileAbsolute("/sys/devices/cpu_core/cpus", .{})) |p_core_file| {
defer p_core_file.close();
var buf: [32]u8 = undefined;
if (p_core_file.reader().readAll(&buf)) |bytes_read| {
const p_cores_str = buf[0..bytes_read];
// Count cores from range (e.g. "0-7" means 8 threads)
if (std.mem.indexOf(u8, p_cores_str, "-")) |dash_idx| {
const end_thread = try std.fmt.parseInt(u32, std.mem.trim(u8, p_cores_str[dash_idx + 1 ..], " \n\r"), 10);
const start_thread = try std.fmt.parseInt(u32, p_cores_str[0..dash_idx], 10);
info.p_cores = @divFloor(end_thread - start_thread + 1, 2); // Divide by 2 for hyperthreading
}
} else |_| {}
} else |_| {}
if (std.fs.openFileAbsolute("/sys/devices/cpu_atom/cpus", .{})) |e_core_file| {
defer e_core_file.close();
var buf: [32]u8 = undefined;
if (e_core_file.reader().readAll(&buf)) |bytes_read| {
const e_cores_str = buf[0..bytes_read];
// Count cores from range (e.g. "8-15" means 8 threads)
if (std.mem.indexOf(u8, e_cores_str, "-")) |dash_idx| {
const end_thread = try std.fmt.parseInt(u32, std.mem.trim(u8, e_cores_str[dash_idx + 1 ..], " \n\r"), 10);
const start_thread = try std.fmt.parseInt(u32, e_cores_str[0..dash_idx], 10);
info.e_cores = end_thread - start_thread + 1; // E-cores don't have hyperthreading
}
} else |_| {}
} else |_| {}
// Try reading frequencies for P-cores
if (info.p_cores != null) {
if (std.fs.openFileAbsolute("/sys/devices/cpu_core/frequency", .{})) |freq_file| {
defer freq_file.close();
var buf: [32]u8 = undefined;
if (freq_file.reader().readAll(&buf)) |bytes_read| {
if (std.fmt.parseInt(u64, std.mem.trim(u8, buf[0..bytes_read], " \n\r"), 10)) |freq| {
info.p_freq = freq * 1000; // Convert kHz to Hz
} else |_| {
info.p_freq = 4_500_000_000; // Default 4.5 GHz for P-cores
}
} else |_| {}
} else |_| {
info.p_freq = 4_500_000_000; // Default 4.5 GHz for P-cores
}
}
// Try reading frequencies for E-cores
if (info.e_cores != null) {
if (std.fs.openFileAbsolute("/sys/devices/cpu_atom/frequency", .{})) |freq_file| {
defer freq_file.close();
var buf: [32]u8 = undefined;
if (freq_file.reader().readAll(&buf)) |bytes_read| {
if (std.fmt.parseInt(u64, std.mem.trim(u8, buf[0..bytes_read], " \n\r"), 10)) |freq| {
info.e_freq = freq * 1000; // Convert kHz to Hz
} else |_| {
info.e_freq = 3_500_000_000; // Default 3.5 GHz for E-cores
}
} else |_| {}
} else |_| {
info.e_freq = 3_500_000_000; // Default 3.5 GHz for E-cores
}
}
// If we detected any cores, we can do peak calculation
info.supports_peak_calc = (info.p_cores != null and info.p_freq != null) or
(info.e_cores != null and info.e_freq != null);
// If we couldn't detect anything, fall back to treating all cores as P-cores
if (!info.supports_peak_calc) {
info.p_cores = info.total_cores;
info.e_cores = 0;
info.p_freq = 3_500_000_000; // Conservative 3.5 GHz default
info.supports_peak_calc = true;
std.debug.print("Warning: Could not detect hybrid architecture. Assuming all P-cores at 3.5 GHz\n", .{});
}
},
.windows, .macos => {
info.p_cores = info.total_cores;
info.e_cores = 0;
info.supports_peak_calc = false;
},
else => {
@compileError("Unsupported operating system");
},
}
return info;
}
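/// Estimates the theoretical peak as freq_GHz * cores * simd_width * 2, summed over
/// P- and E-cores (each FMA counts as one multiply plus one add). For example, 8
/// P-cores at 4.5 GHz with 8-wide AVX2 FMA contribute 4.5 * 8 * 8 * 2 = 576 GFLOPS.
/// Returns null when core/frequency information could not be detected.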
fn calculateTheoreticalPeak(info: CpuInfo) ?f64 {
if (!info.supports_peak_calc) {
return null;
}
const p_core_gflops = if (info.p_freq != null and info.p_cores != null)
(@as(f64, @floatFromInt(info.p_freq.?)) / 1e9) *
@as(f64, @floatFromInt(info.p_cores.? * info.simd_width * 2))
else
0;
const e_core_gflops = if (info.e_freq != null and info.e_cores != null)
(@as(f64, @floatFromInt(info.e_freq.?)) / 1e9) *
@as(f64, @floatFromInt(info.e_cores.? * info.simd_width * 2))
else
0;
return p_core_gflops + e_core_gflops;
}
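/// Benchmarks tiledMatMul over `iterations` runs and reports GFLOPS statistics.
/// FLOP count: each of the M*N output elements takes K multiplies and K - 1 adds,
/// so one iteration performs (2K - 1) * M * N floating point operations.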
pub fn calculatePreciseGflops(allocator: std.mem.Allocator, M: usize, N: usize, K: usize, iterations: usize) !void {
const A = try allocator.alloc(f32, M * K);
defer allocator.free(A);
const B = try allocator.alloc(f32, K * N);
defer allocator.free(B);
const C = try allocator.alloc(f32, M * N);
defer allocator.free(C);
// Initialize matrices
var rng = std.rand.DefaultPrng.init(@intCast(std.time.timestamp()));
const random = rng.random();
for (A) |*val| {
val.* = random.float(f32) * 2.0 - 1.0;
}
for (B) |*val| {
val.* = random.float(f32) * 2.0 - 1.0;
}
@memset(C, 0);
// Warmup run
try tiledMatMul(allocator, A, B, C, M, N, K);
// Track performance metrics
const ops_per_element = 2 * K - 1;
const total_elements = M * N;
const flops_per_iteration = total_elements * ops_per_element;
var metrics = try allocator.alloc(struct {
gflops: f64,
time_ms: f64,
}, iterations);
defer allocator.free(metrics);
// Main measurement loop
var timer = try std.time.Timer.start();
const total_start = timer.read();
for (0..iterations) |i| {
@memset(C, 0);
const iter_start = timer.read();
try tiledMatMul(allocator, A, B, C, M, N, K);
const iter_end = timer.read();
const iter_time_ns = iter_end - iter_start;
const iter_time_ms = @as(f64, @floatFromInt(iter_time_ns)) / 1e6;
const iter_gflops = @as(f64, @floatFromInt(flops_per_iteration)) / (@as(f64, @floatFromInt(iter_time_ns)) / 1e9) / 1e9;
metrics[i] = .{
.gflops = iter_gflops,
.time_ms = iter_time_ms,
};
}
// Calculate statistics
const total_time_ns = timer.read() - total_start;
const total_time_sec = @as(f64, @floatFromInt(total_time_ns)) / 1e9;
var min_gflops: f64 = metrics[0].gflops;
var max_gflops: f64 = metrics[0].gflops;
var total_gflops: f64 = 0;
var first_quarter_avg: f64 = 0;
var last_quarter_avg: f64 = 0;
const quarter_size = iterations / 4;
for (metrics, 0..) |m, i| {
min_gflops = @min(min_gflops, m.gflops);
max_gflops = @max(max_gflops, m.gflops);
total_gflops += m.gflops;
if (i < quarter_size) {
first_quarter_avg += m.gflops;
} else if (i >= iterations - quarter_size) {
last_quarter_avg += m.gflops;
}
}
first_quarter_avg /= @as(f64, @floatFromInt(quarter_size));
last_quarter_avg /= @as(f64, @floatFromInt(quarter_size));
const avg_gflops = total_gflops / @as(f64, @floatFromInt(iterations));
const perf_change_pct = ((last_quarter_avg - first_quarter_avg) / first_quarter_avg) * 100.0;
// Get CPU info and theoretical peak
const cpu_info = try getCpuFrequencies();
// Print detailed performance report
std.debug.print("\nPerformance Analysis for {d}x{d}x{d} Matrix:\n", .{ M, N, K });
std.debug.print("----------------------------------------\n", .{});
std.debug.print("Peak Performance: {d:.2} GFLOPS\n", .{max_gflops});
std.debug.print("Lowest Performance: {d:.2} GFLOPS\n", .{min_gflops});
std.debug.print("Average Performance: {d:.2} GFLOPS\n", .{avg_gflops});
std.debug.print("First Quarter Avg: {d:.2} GFLOPS\n", .{first_quarter_avg});
std.debug.print("Last Quarter Avg: {d:.2} GFLOPS\n", .{last_quarter_avg});
if (perf_change_pct >= 0) {
std.debug.print("Performance Change: {d:.1}% (improvement)\n", .{perf_change_pct});
} else {
std.debug.print("Performance Change: {d:.1}% (degradation)\n", .{perf_change_pct});
}
std.debug.print("Total Runtime: {d:.3} seconds\n", .{total_time_sec});
// CPU information and theoretical peak
std.debug.print("\nCPU Configuration:\n", .{});
std.debug.print("Total Cores: {d}\n", .{cpu_info.total_cores});
if (cpu_info.p_cores != null) {
std.debug.print("P-Cores: {d}\n", .{cpu_info.p_cores.?});
}
if (cpu_info.e_cores != null) {
std.debug.print("E-Cores: {d}\n", .{cpu_info.e_cores.?});
}
std.debug.print("SIMD Width: {d} floats\n", .{cpu_info.simd_width});
if (cpu_info.supports_peak_calc) {
if (cpu_info.p_freq != null) {
std.debug.print("P-Core Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.p_freq.?)) / 1e9});
}
if (cpu_info.e_freq != null) {
std.debug.print("E-Core Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.e_freq.?)) / 1e9});
}
if (calculateTheoreticalPeak(cpu_info)) |peak| {
std.debug.print("\nTheoretical Peak: {d:.2} GFLOPS\n", .{peak});
if (peak > 0) {
std.debug.print("Peak Efficiency: {d:.1}% of theoretical\n", .{(max_gflops / peak) * 100.0});
}
}
} else {
std.debug.print("\nNote: Theoretical peak calculation not supported on this OS\n", .{});
}
// Print performance distribution
std.debug.print("\nPerformance Distribution:\n", .{});
const distribution_width = 60;
for (metrics) |m| {
const normalized = (m.gflops - min_gflops) / (max_gflops - min_gflops);
const stars = @as(usize, @intFromFloat(normalized * @as(f64, @floatFromInt(distribution_width))));
std.debug.print("{d:.2} GFLOPS |", .{m.gflops});
for (0..stars) |_| {
std.debug.print("*", .{});
}
std.debug.print("\n", .{});
}
}
pub fn main() !void {
var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
defer arena.deinit();
const allocator = arena.allocator();
const sizes = [_][3]usize{
// Uncomment additional sizes as needed
// .{ 256, 256, 256 },
// .{ 512, 512, 512 },
// .{ 1024, 1024, 1024 },
// .{ 1024, 2048, 1024 },
.{ 2048, 2048, 2048 },
};
const iterations = 100;
// Print system configuration
const cpu_info = try getCpuFrequencies();
std.debug.print("\nSystem Configuration:\n", .{});
std.debug.print("----------------------------------------\n", .{});
std.debug.print("Tile size (T): {}\n", .{T});
std.debug.print("Vector size (V): {}\n", .{V});
std.debug.print("CPU Cores: {}\n", .{cpu_info.total_cores});
if (cpu_info.supports_peak_calc) {
if (cpu_info.p_freq != null) {
std.debug.print("P-Core Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.p_freq.?)) / 1e9});
}
if (cpu_info.e_freq != null) {
std.debug.print("E-Core Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.e_freq.?)) / 1e9});
}
}
std.debug.print("AVX2 Vector Width: {}\n", .{@sizeOf(Vec8f) / @sizeOf(f32)});
std.debug.print("Iterations per test: {}\n", .{iterations});
std.debug.print("----------------------------------------\n\n", .{});
// Run benchmarks with increasing matrix sizes
for (sizes) |size| {
const M = size[0];
const N = size[1];
const K = size[2];
std.debug.print("\n========================================\n", .{});
std.debug.print("Running benchmark for {d}x{d}x{d} matrices\n", .{ M, N, K });
std.debug.print("========================================\n", .{});
try calculatePreciseGflops(allocator, M, N, K, iterations);
// Add a small delay between tests to let CPU cool down
std.time.sleep(1 * std.time.ns_per_s);
}
// Print summary
std.debug.print("\n========================================\n", .{});
std.debug.print("Benchmark Summary:\n", .{});
std.debug.print("----------------------------------------\n", .{});
std.debug.print("Total matrix sizes tested: {d}\n", .{sizes.len});
std.debug.print("Total iterations per size: {d}\n", .{iterations});
std.debug.print("AVX2 SIMD utilized: Yes\n", .{});
std.debug.print("Tiled algorithm: Yes ({}x{})\n", .{ T, T });
std.debug.print("========================================\n\n", .{});
}
// Additional test for edge cases
test "tiledMatMul edge cases" {
const allocator = std.testing.allocator;
// Test with sizes that are not multiples of tile size
const edge_sizes = [_][3]usize{
.{ T + 1, T - 1, T + 2 }, // Around tile size
.{ T * 2 - 1, T * 2 + 1, T * 2 }, // Around double tile size
.{ 1, T, T }, // Single row
.{ T, 1, T }, // Single column
.{ T, T, 1 }, // Single depth
};
for (edge_sizes) |size| {
const M = size[0];
const N = size[1];
const K = size[2];
const A = try allocator.alloc(f32, M * K);
defer allocator.free(A);
const B = try allocator.alloc(f32, K * N);
defer allocator.free(B);
const C = try allocator.alloc(f32, M * N);
defer allocator.free(C);
const C_ref = try allocator.alloc(f32, M * N);
defer allocator.free(C_ref);
// Initialize matrices
for (A, 0..) |*val, i| {
val.* = @floatFromInt(i % 7);
}
for (B, 0..) |*val, i| {
val.* = @floatFromInt((i + 1) % 5);
}
@memset(C, 0);
@memset(C_ref, 0);
// Perform multiplications
try tiledMatMul(allocator, A, B, C, M, N, K);
// Reference calculation
for (0..M) |i| {
for (0..N) |j| {
var sum: f32 = 0;
for (0..K) |k| {
sum += A[i * K + k] * B[k * N + j];
}
C_ref[i * N + j] = sum;
}
}
// Compare results with higher tolerance for edge cases
for (C, C_ref) |c, c_ref| {
try std.testing.expectApproxEqAbs(c, c_ref, 1e-4);
}
std.debug.print("Edge case test passed for size: M={}, N={}, K={}\n", .{ M, N, K });
}
}
// Example usage
test "matrix multiplication" {
const allocator = std.testing.allocator;
const M: usize = 1024;
const N: usize = 1024;
const K: usize = 1024;
const A = try allocator.alloc(f32, M * K);
defer allocator.free(A);
const B = try allocator.alloc(f32, K * N);
defer allocator.free(B);
const C = try allocator.alloc(f32, M * N);
defer allocator.free(C);
// Initialize matrices
for (0..M * K) |i| {
A[i] = @floatFromInt(i % 7);
}
for (0..K * N) |i| {
B[i] = @floatFromInt(i % 5);
}
@memset(C, 0);
try tiledMatMul(allocator, A, B, C, M, N, K);
}
// Test function
test "tiledMatMul correctness" {
const allocator = std.testing.allocator;
const test_sizes = [_][3]usize{
.{ 128, 128, 128 },
.{ 100, 100, 100 },
.{ 200, 150, 175 },
.{ 32, 64, 48 },
.{ 47, 34, 45 },
};
for (test_sizes) |size| {
const M = size[0];
const N = size[1];
const K = size[2];
const A = try allocator.alloc(f32, M * K);
defer allocator.free(A);
const B = try allocator.alloc(f32, K * N);
defer allocator.free(B);
const C = try allocator.alloc(f32, M * N);
defer allocator.free(C);
const C_ref = try allocator.alloc(f32, M * N);
defer allocator.free(C_ref);
// Initialize matrices
for (A, 0..) |*val, i| {
val.* = @floatFromInt(i % 10);
}
for (B, 0..) |*val, i| {
val.* = @floatFromInt((i + 1) % 10);
}
@memset(C, 0);
@memset(C_ref, 0);
// Perform tiled matrix multiplication
try tiledMatMul(allocator, A, B, C, M, N, K);
// Perform reference matrix multiplication
for (0..M) |i| {
for (0..N) |j| {
var sum: f32 = 0;
for (0..K) |k| {
sum += A[i * K + k] * B[k * N + j];
}
C_ref[i * N + j] = sum;
}
}
// Compare results
for (C, C_ref) |c, c_ref| {
try std.testing.expectApproxEqAbs(c, c_ref, 1e-5);
}
std.debug.print("Test passed for size: M={}, N={}, K={}\n", .{ M, N, K });
}
}
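////////////////////////////////////////////////////////////////////////////////
// Second gist file: grid-search variant of the kernel. Tile size (T) and vector
// size (V) become runtime parameters so adaptiveGridSearch/treeSearch below can
// sweep them and report the best-performing configuration.
////////////////////////////////////////////////////////////////////////////////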
const std = @import("std");
const builtin = @import("builtin");
const atomic = std.atomic;
const CACHE_LINE_SIZE: usize = atomic.cache_line;
const CHUNK_SIZE: usize = 1;
const AVX2_ALIGNMENT = 32;
const Vec8f = @Vector(8, f32);
const ThreadLocalData = struct {
current_index: atomic.Value(usize) align(CACHE_LINE_SIZE),
_padding: [CACHE_LINE_SIZE - @sizeOf(atomic.Value(usize))]u8 = undefined,
};
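/// Same tiled multiply as in the main file, but tile size T and vector size V are
/// runtime parameters (and per-thread accumulators are heap-allocated) so the grid
/// search can sweep configurations without recompiling.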
pub fn tiledMatMul(
allocator: std.mem.Allocator,
A: []const f32,
B: []const f32,
C: []f32,
M: usize,
N: usize,
K: usize,
T: usize,
V: usize,
) !void {
const tiles_M = (M + T - 1) / T;
const tiles_N = (N + T - 1) / T;
const total_tiles = tiles_M * tiles_N;
var shared_data = ThreadLocalData{ .current_index = atomic.Value(usize).init(0) };
const num_threads = try std.Thread.getCpuCount();
// Allocate thread pool
var thread_pool = try std.ArrayList(std.Thread).initCapacity(allocator, num_threads);
defer thread_pool.deinit();
// Allocate local C buffers for each thread
const buffer_size = T * T;
const local_C_buffers = try allocator.alloc(f32, buffer_size * num_threads);
defer allocator.free(local_C_buffers);
// Create and spawn threads
for (0..num_threads) |thread_id| {
const thread_buffer = local_C_buffers[thread_id * buffer_size .. (thread_id + 1) * buffer_size];
const context = ThreadContext{
.A = A,
.B = B,
.C = C,
.M = M,
.N = N,
.K = K,
.T = T,
.V = V,
.tiles_M = tiles_M,
.tiles_N = tiles_N,
.total_tiles = total_tiles,
.shared_counter = &shared_data,
.local_C = thread_buffer,
};
try thread_pool.append(try std.Thread.spawn(.{}, workerThread, .{context}));
}
// Wait for all threads to complete
for (thread_pool.items) |thread| thread.join();
}
const ThreadContext = struct {
A: []const f32,
B: []const f32,
C: []f32,
M: usize,
N: usize,
K: usize,
T: usize,
V: usize,
tiles_M: usize,
tiles_N: usize,
total_tiles: usize,
shared_counter: *ThreadLocalData,
local_C: []f32,
};
fn workerThread(ctx: ThreadContext) void {
const max_tile_size = 512; // Maximum supported tile size
var A_local: [max_tile_size * max_tile_size]f32 align(32) = undefined;
var B_local: [max_tile_size * max_tile_size]f32 align(32) = undefined;
if (ctx.T > max_tile_size) {
@panic("Tile size exceeds maximum supported size");
}
while (true) {
const start_idx = ctx.shared_counter.current_index.fetchAdd(CHUNK_SIZE, .seq_cst);
if (start_idx >= ctx.total_tiles) break;
const end_idx = @min(start_idx + CHUNK_SIZE, ctx.total_tiles);
var idx = start_idx;
while (idx < end_idx) : (idx += 1) {
const i = idx / ctx.tiles_N;
const j = idx % ctx.tiles_N;
const i_start = i * ctx.T;
const j_start = j * ctx.T;
const i_end = @min(i_start + ctx.T, ctx.M);
const j_end = @min(j_start + ctx.T, ctx.N);
// Reset local_C
@memset(ctx.local_C, 0);
var k: usize = 0;
while (k < ctx.K) : (k += ctx.T) {
const k_end = @min(k + ctx.T, ctx.K);
microKernelAVX2(ctx, &A_local, &B_local, i_start, j_start, k, i_end, j_end, k_end);
}
// Update global C
for (i_start..i_end) |ii| {
const row_offset = ii * ctx.N;
const local_row = (ii - i_start) * ctx.T;
var j_idx = j_start;
while (j_idx + ctx.V <= j_end) : (j_idx += ctx.V) {
if (ctx.V == 8) {
const vec_idx = j_idx - j_start;
const c_vec = Vec8f{
ctx.local_C[local_row + vec_idx],
ctx.local_C[local_row + vec_idx + 1],
ctx.local_C[local_row + vec_idx + 2],
ctx.local_C[local_row + vec_idx + 3],
ctx.local_C[local_row + vec_idx + 4],
ctx.local_C[local_row + vec_idx + 5],
ctx.local_C[local_row + vec_idx + 6],
ctx.local_C[local_row + vec_idx + 7],
};
const dest_vec = Vec8f{
ctx.C[row_offset + j_idx],
ctx.C[row_offset + j_idx + 1],
ctx.C[row_offset + j_idx + 2],
ctx.C[row_offset + j_idx + 3],
ctx.C[row_offset + j_idx + 4],
ctx.C[row_offset + j_idx + 5],
ctx.C[row_offset + j_idx + 6],
ctx.C[row_offset + j_idx + 7],
};
const result = dest_vec + c_vec;
for (0..8) |offset| {
ctx.C[row_offset + j_idx + offset] = result[offset];
}
} else {
// Handle other vector sizes
for (0..ctx.V) |offset| {
ctx.C[row_offset + j_idx + offset] +=
ctx.local_C[local_row + (j_idx - j_start) + offset];
}
}
}
// Handle remaining elements
while (j_idx < j_end) : (j_idx += 1) {
ctx.C[row_offset + j_idx] += ctx.local_C[local_row + (j_idx - j_start)];
}
}
}
}
}
fn microKernelAVX2(
ctx: ThreadContext,
A_local: []f32,
B_local: []f32,
i_start: usize,
j_start: usize,
k_start: usize,
i_end: usize,
j_end: usize,
k_end: usize,
) void {
const k_size = k_end - k_start;
const i_size = i_end - i_start;
const j_size = j_end - j_start;
// Load A and B tiles
for (0..i_size) |i| {
const src_idx = (i_start + i) * ctx.K + k_start;
const local_idx = i * ctx.T;
for (0..k_size) |k| {
A_local[local_idx + k] = ctx.A[src_idx + k];
}
}
for (0..k_size) |k| {
const src_idx = (k_start + k) * ctx.N + j_start;
const local_idx = k * ctx.T;
for (0..j_size) |j| {
B_local[local_idx + j] = ctx.B[src_idx + j];
}
}
var i: usize = 0;
while (i + ctx.V <= i_size) : (i += ctx.V) {
var j: usize = 0;
while (j + ctx.V <= j_size) : (j += ctx.V) {
if (ctx.V == 8) {
var acc: [8][8]f32 align(32) = [_][8]f32{[_]f32{0} ** 8} ** 8;
for (0..k_size) |k| {
const a_vec = Vec8f{
A_local[i * ctx.T + k],
A_local[(i + 1) * ctx.T + k],
A_local[(i + 2) * ctx.T + k],
A_local[(i + 3) * ctx.T + k],
A_local[(i + 4) * ctx.T + k],
A_local[(i + 5) * ctx.T + k],
A_local[(i + 6) * ctx.T + k],
A_local[(i + 7) * ctx.T + k],
};
const b_vec = Vec8f{
B_local[k * ctx.T + j],
B_local[k * ctx.T + j + 1],
B_local[k * ctx.T + j + 2],
B_local[k * ctx.T + j + 3],
B_local[k * ctx.T + j + 4],
B_local[k * ctx.T + j + 5],
B_local[k * ctx.T + j + 6],
B_local[k * ctx.T + j + 7],
};
inline for (0..8) |bi| {
const a_broadcast: Vec8f = @splat(a_vec[bi]);
const c_vec = Vec8f{
acc[bi][0], acc[bi][1], acc[bi][2], acc[bi][3],
acc[bi][4], acc[bi][5], acc[bi][6], acc[bi][7],
};
const prod = @mulAdd(Vec8f, a_broadcast, b_vec, c_vec);
inline for (0..8) |bj| {
acc[bi][bj] = prod[bj];
}
}
}
// Store results
for (0..8) |bi| {
for (0..8) |bj| {
const local_idx = (i + bi) * ctx.T + (j + bj);
ctx.local_C[local_idx] += acc[bi][bj];
}
}
} else {
// Handle other vector sizes with scalar operations
for (0..ctx.V) |bi| {
for (0..ctx.V) |bj| {
var sum: f32 = 0;
for (0..k_size) |k| {
sum = @mulAdd(f32, A_local[(i + bi) * ctx.T + k], B_local[k * ctx.T + j + bj], sum);
}
ctx.local_C[(i + bi) * ctx.T + j + bj] += sum;
}
}
}
}
// Handle remaining columns
while (j < j_size) : (j += 1) {
for (0..ctx.V) |bi| {
var sum: f32 = 0;
for (0..k_size) |k| {
sum = @mulAdd(f32, A_local[(i + bi) * ctx.T + k], B_local[k * ctx.T + j], sum);
}
ctx.local_C[(i + bi) * ctx.T + j] += sum;
}
}
}
// Handle remaining rows
while (i < i_size) : (i += 1) {
for (0..j_size) |j| {
var sum: f32 = 0;
for (0..k_size) |k| {
sum = @mulAdd(f32, A_local[i * ctx.T + k], B_local[k * ctx.T + j], sum);
}
ctx.local_C[i * ctx.T + j] += sum;
}
}
}
const time = std.time;
const testing = std.testing;
pub const SearchResult = struct {
tile_size: usize,
vector_size: usize,
gflops: f64,
};
const SearchRange = struct {
t_min: usize,
t_max: usize,
t_step: usize,
v_sizes: []const usize,
};
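/// Coarse-to-fine search over tile size T and vector width V. Each level benchmarks
/// every valid (T, V) pair in the current range (skipping T not divisible by V),
/// sorts by GFLOPS, then halves the step and re-centres the range on the best T.
/// Example: a first pass over T in [32, 512] with step 64 is followed by a pass
/// with step 32 restricted to best_T +/- 64.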
pub fn adaptiveGridSearch(
allocator: std.mem.Allocator,
M: usize,
N: usize,
K: usize,
initial_range: SearchRange,
refinement_levels: usize,
) !SearchResult {
var best_result = SearchResult{
.tile_size = 0,
.vector_size = 0,
.gflops = 0,
};
var current_range = initial_range;
for (0..refinement_levels) |level| {
std.debug.print("\nLevel {d} Search (T: {d}-{d}, step: {d})\n", .{ level, current_range.t_min, current_range.t_max, current_range.t_step });
std.debug.print("----------------------------------------\n", .{});
// Search current range
var results = std.ArrayList(SearchResult).init(allocator);
defer results.deinit();
var t = current_range.t_min;
while (t <= current_range.t_max) : (t += current_range.t_step) {
// Skip invalid tile sizes
if (t > M or t > N or t > K) continue;
for (current_range.v_sizes) |v| {
if (t % v != 0) continue; // Skip if tile size not divisible by vector size
// Run multiple trials and take best
var best_trial = try benchmarkMatMul(allocator, t, v, M, N, K);
for (0..2) |_| {
const trial = try benchmarkMatMul(allocator, t, v, M, N, K);
if (trial.gflops > best_trial.gflops) {
best_trial.gflops = trial.gflops;
}
}
try results.append(.{
.tile_size = t,
.vector_size = v,
.gflops = best_trial.gflops,
});
std.debug.print("T={d:<4} V={d:<3}: {d:.2} GFLOPS\n", .{ t, v, best_trial.gflops });
}
}
// Find top performers
const top_k = 3; // Number of top results to consider
const top_results = try allocator.alloc(SearchResult, @min(top_k, results.items.len));
defer allocator.free(top_results);
// Sort results by GFLOPS (descending)
std.sort.pdq(SearchResult, results.items, {}, struct {
fn lessThan(_: void, a: SearchResult, b: SearchResult) bool {
return a.gflops > b.gflops;
}
}.lessThan);
// Copy top results
@memcpy(top_results, results.items[0..top_results.len]);
// Update best overall result
if (results.items[0].gflops > best_result.gflops) {
best_result = results.items[0];
}
// Print top performers for this level
std.debug.print("\nTop performers at level {d}:\n", .{level});
for (top_results, 0..) |result, i| {
std.debug.print("{d}. T={d:<4} V={d:<3}: {d:.2} GFLOPS\n", .{ i + 1, result.tile_size, result.vector_size, result.gflops });
}
// Refine search range around best result
if (level + 1 < refinement_levels) {
const best_T = top_results[0].tile_size;
const step = current_range.t_step / 2;
current_range = SearchRange{
.t_min = if (best_T > step * 2) best_T - step * 2 else current_range.t_min,
.t_max = @min(best_T + step * 2, current_range.t_max),
.t_step = step,
.v_sizes = current_range.v_sizes,
};
}
}
return best_result;
}
pub fn treeSearch(
allocator: std.mem.Allocator,
M: usize,
N: usize,
K: usize,
) !SearchResult {
// Initial coarse-grained search
const initial_range = SearchRange{
.t_min = 32,
.t_max = 512,
.t_step = 64,
.v_sizes = &[_]usize{ 4, 8, 16 },
};
std.debug.print("Starting tree search for {d}x{d}x{d} matrix\n", .{ M, N, K });
std.debug.print("----------------------------------------\n", .{});
const result = try adaptiveGridSearch(allocator, M, N, K, initial_range, 4); // 4 refinement levels
std.debug.print("\nFinal optimal configuration:\n", .{});
std.debug.print("Tile Size (T): {d}\n", .{result.tile_size});
std.debug.print("Vector Size (V): {d}\n", .{result.vector_size});
std.debug.print("Performance: {d:.2} GFLOPS\n", .{result.gflops});
return result;
}
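/// Times a single tiledMatMul call for one (tile_size, vector_size) pair after a
/// warm-up run, and reports GFLOPS as 2*M*N*K / elapsed_seconds / 1e9 (each
/// multiply-add counted as two floating point operations).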
pub fn benchmarkMatMul(
allocator: std.mem.Allocator,
tile_size: usize,
vector_size: usize,
M: usize,
N: usize,
K: usize,
) !SearchResult {
const A = try allocator.alloc(f32, M * K);
defer allocator.free(A);
const B = try allocator.alloc(f32, K * N);
defer allocator.free(B);
const C = try allocator.alloc(f32, M * N);
defer allocator.free(C);
// Initialize matrices with random data
var rng = std.rand.DefaultPrng.init(0);
const random = rng.random();
for (A) |*val| val.* = random.float(f32);
for (B) |*val| val.* = random.float(f32);
@memset(C, 0);
// Warm up run
try tiledMatMul(allocator, A, B, C, M, N, K, tile_size, vector_size);
// Actual benchmark
const start_time = time.nanoTimestamp();
try tiledMatMul(allocator, A, B, C, M, N, K, tile_size, vector_size);
const end_time = time.nanoTimestamp();
const elapsed_ns: u64 = @intCast(end_time - start_time);
// Calculate GFLOPS
const operations = @as(f64, @floatFromInt(2 * M * N * K)); // 2 operations per multiply-add
const seconds = @as(f64, @floatFromInt(elapsed_ns)) / 1e9;
const gflops = operations / (seconds * 1e9);
return SearchResult{
.tile_size = tile_size,
.vector_size = vector_size,
.gflops = gflops,
};
}
const CpuInfo = struct {
p_freq: ?u64 = null,
e_freq: ?u64 = null,
p_cores: ?u32 = null,
e_cores: ?u32 = null,
simd_width: u32,
total_cores: u32,
supports_peak_calc: bool,
};
fn getSIMDWidth() u32 {
if (std.Target.x86.featureSetHas(builtin.cpu.features, .avx512f)) {
return 16;
} else if (std.Target.x86.featureSetHas(builtin.cpu.features, .avx2)) {
return 8;
} else if (std.Target.x86.featureSetHas(builtin.cpu.features, .sse2)) {
return 4;
} else {
return 1;
}
}
fn getCpuFrequencies() !CpuInfo {
var info = CpuInfo{
.simd_width = getSIMDWidth(),
.total_cores = @intCast(try std.Thread.getCpuCount()),
.supports_peak_calc = false,
};
switch (builtin.os.tag) {
.linux => {
// Try reading hybrid architecture core mapping
if (std.fs.openFileAbsolute("/sys/devices/cpu_core/cpus", .{})) |p_core_file| {
defer p_core_file.close();
var buf: [32]u8 = undefined;
if (p_core_file.reader().readAll(&buf)) |bytes_read| {
const p_cores_str = buf[0..bytes_read];
// Count cores from range (e.g. "0-7" means 8 threads)
if (std.mem.indexOf(u8, p_cores_str, "-")) |dash_idx| {
const end_thread = try std.fmt.parseInt(u32, std.mem.trim(u8, p_cores_str[dash_idx + 1 ..], " \n\r"), 10);
const start_thread = try std.fmt.parseInt(u32, p_cores_str[0..dash_idx], 10);
info.p_cores = @divFloor(end_thread - start_thread + 1, 2); // Divide by 2 for hyperthreading
}
} else |_| {}
} else |_| {}
if (std.fs.openFileAbsolute("/sys/devices/cpu_atom/cpus", .{})) |e_core_file| {
defer e_core_file.close();
var buf: [32]u8 = undefined;
if (e_core_file.reader().readAll(&buf)) |bytes_read| {
const e_cores_str = buf[0..bytes_read];
// Count cores from range (e.g. "8-15" means 8 threads)
if (std.mem.indexOf(u8, e_cores_str, "-")) |dash_idx| {
const end_thread = try std.fmt.parseInt(u32, std.mem.trim(u8, e_cores_str[dash_idx + 1 ..], " \n\r"), 10);
const start_thread = try std.fmt.parseInt(u32, e_cores_str[0..dash_idx], 10);
info.e_cores = end_thread - start_thread + 1; // E-cores don't have hyperthreading
}
} else |_| {}
} else |_| {}
// Try reading frequencies for P-cores
if (info.p_cores != null) {
if (std.fs.openFileAbsolute("/sys/devices/cpu_core/frequency", .{})) |freq_file| {
defer freq_file.close();
var buf: [32]u8 = undefined;
if (freq_file.reader().readAll(&buf)) |bytes_read| {
if (std.fmt.parseInt(u64, std.mem.trim(u8, buf[0..bytes_read], " \n\r"), 10)) |freq| {
info.p_freq = freq * 1000; // Convert kHz to Hz
} else |_| {
info.p_freq = 4_500_000_000; // Default 4.5 GHz for P-cores
}
} else |_| {}
} else |_| {
info.p_freq = 4_500_000_000; // Default 4.5 GHz for P-cores
}
}
// Try reading frequencies for E-cores
if (info.e_cores != null) {
if (std.fs.openFileAbsolute("/sys/devices/cpu_atom/frequency", .{})) |freq_file| {
defer freq_file.close();
var buf: [32]u8 = undefined;
if (freq_file.reader().readAll(&buf)) |bytes_read| {
if (std.fmt.parseInt(u64, std.mem.trim(u8, buf[0..bytes_read], " \n\r"), 10)) |freq| {
info.e_freq = freq * 1000; // Convert kHz to Hz
} else |_| {
info.e_freq = 3_500_000_000; // Default 3.5 GHz for E-cores
}
} else |_| {}
} else |_| {
info.e_freq = 3_500_000_000; // Default 3.5 GHz for E-cores
}
}
// If we detected any cores, we can do peak calculation
info.supports_peak_calc = (info.p_cores != null and info.p_freq != null) or
(info.e_cores != null and info.e_freq != null);
// If we couldn't detect anything, fall back to treating all cores as P-cores
if (!info.supports_peak_calc) {
info.p_cores = info.total_cores;
info.e_cores = 0;
info.p_freq = 3_500_000_000; // Conservative 3.5 GHz default
info.supports_peak_calc = true;
std.debug.print("Warning: Could not detect hybrid architecture. Assuming all P-cores at 3.5 GHz\n", .{});
}
},
.windows, .macos => {
info.p_cores = info.total_cores;
info.e_cores = 0;
info.supports_peak_calc = false;
},
else => {
@compileError("Unsupported operating system");
},
}
return info;
}
pub fn main() !void {
var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
defer arena.deinit();
const allocator = arena.allocator();
const sizes = [_][3]usize{
// Uncomment additional sizes as needed
.{ 1024, 1024, 1024 },
// .{ 2048, 2048, 2048 },
// .{ 4096, 4096, 4096 },
};
const iterations = 5; // Reduced for tree search phase, increase for final benchmark
// Print system configuration
const cpu_info = try getCpuFrequencies();
std.debug.print("\nSystem Configuration:\n", .{});
std.debug.print("----------------------------------------\n", .{});
std.debug.print("CPU Cores: {}\n", .{cpu_info.total_cores});
if (cpu_info.supports_peak_calc) {
if (cpu_info.p_freq != null) {
std.debug.print("P-Core Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.p_freq.?)) / 1e9});
}
if (cpu_info.e_freq != null) {
std.debug.print("E-Core Frequency: {d:.2} GHz\n", .{@as(f64, @floatFromInt(cpu_info.e_freq.?)) / 1e9});
}
}
std.debug.print("AVX2 Vector Width: {}\n", .{@sizeOf(Vec8f) / @sizeOf(f32)});
std.debug.print("----------------------------------------\n\n", .{});
for (sizes) |size| {
const M = size[0];
const N = size[1];
const K = size[2];
std.debug.print("\n========================================\n", .{});
std.debug.print("Matrix Size: {d}x{d}x{d}\n", .{ M, N, K });
std.debug.print("========================================\n", .{});
// Phase 1: Tree Search
std.debug.print("\nAdaptive Parameter Search\n", .{});
std.debug.print("----------------------------------------\n", .{});
const best_config = try treeSearch(allocator, M, N, K);
std.debug.print("\nBest configuration found:\n", .{});
std.debug.print("Tile Size (T): {d}\n", .{best_config.tile_size});
std.debug.print("Vector Size (V): {d}\n", .{best_config.vector_size});
std.debug.print("Initial Performance: {d:.2} GFLOPS\n\n", .{best_config.gflops});
}
// Print summary
std.debug.print("\n========================================\n", .{});
std.debug.print("Benchmark Summary:\n", .{});
std.debug.print("----------------------------------------\n", .{});
std.debug.print("Total matrix sizes tested: {d}\n", .{sizes.len});
std.debug.print("Tree search levels: 4\n", .{});
std.debug.print("Detailed analysis iterations: {d}\n", .{iterations});
std.debug.print("AVX2 SIMD utilized: Yes\n", .{});
std.debug.print("========================================\n\n", .{});
}