Last active
June 17, 2024 14:01
-
-
Save Rexicon226/cb433c0a8639345e15224fd16cd66140 to your computer and use it in GitHub Desktop.
optimized memeql benchmark
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// zig build-exe benchmark.zig -OReleaseFast -lc | |
// ./benchmark 4096 | |
const std = @import("std"); | |
const allocator = std.heap.c_allocator; | |
const iterations_per_byte = 1000; | |
const warmup_iterations = 10; | |
comptime { | |
asm (@embedFile("./asm_eql.S")); | |
} | |
extern fn asm_eql(a: [*]const u8, a_len: usize, b: [*]const u8, b_len: usize) bool; | |
pub fn main() !void { | |
// Pin the process to a single core (1) | |
const cpu0001: std.os.linux.cpu_set_t = [1]usize{0b0001} ++ ([_]usize{0} ** (16 - 1)); | |
try std.os.linux.sched_setaffinity(0, &cpu0001); | |
const stdout = std.io.getStdOut(); | |
var progress: std.Progress = .{ .dont_print_on_dumb = true }; | |
const loops = try std.process.argsAlloc(allocator); | |
defer std.process.argsFree(allocator, loops); | |
const max_bytes = try std.fmt.parseInt(usize, loops[1], 10); | |
const main_progress_node = progress.start("Progress", max_bytes); | |
defer main_progress_node.end(); | |
const ty = u8; | |
for (1..max_bytes) |index| { | |
const same_buffer_a = try allocator.alloc(ty, index); | |
@memset(same_buffer_a, 0); | |
const same_buffer_b = try allocator.alloc(ty, index); | |
@memset(same_buffer_b, 0); | |
const middle_buffer_a = try allocator.alloc(ty, index); | |
@memset(middle_buffer_a, 0); | |
middle_buffer_a[index / 2] = 1; | |
const middle_buffer_b = try allocator.alloc(ty, index); | |
@memset(middle_buffer_b, 0); | |
const start_buffer_a = try allocator.alloc(ty, index); | |
@memset(start_buffer_a, 0); | |
start_buffer_a[0] = 1; | |
const start_buffer_b = try allocator.alloc(ty, index); | |
@memset(start_buffer_b, 0); | |
var same_new_cycles: u64 = 0; | |
const new_fn = eql; | |
const old_fn = stdlib_eql; | |
clflush(ty, same_buffer_a); | |
clflush(ty, same_buffer_b); | |
var same_i: u32 = 0; | |
while (same_i < iterations_per_byte + warmup_iterations) : (same_i += 1) { | |
const new_start = rdtsc(); | |
std.mem.doNotOptimizeAway(new_fn(ty, same_buffer_a, same_buffer_b)); | |
const new_end = rdtsc(); | |
if (same_i > warmup_iterations) same_new_cycles += (new_end - new_start); | |
} | |
var same_old_cycles: u64 = 0; | |
clflush(ty, same_buffer_a); | |
clflush(ty, same_buffer_b); | |
var same_j: u32 = 0; | |
while (same_j < iterations_per_byte + warmup_iterations) : (same_j += 1) { | |
const old_start = rdtsc(); | |
std.mem.doNotOptimizeAway(old_fn(ty, same_buffer_a, same_buffer_b)); | |
const old_end = rdtsc(); | |
if (same_j > warmup_iterations) same_old_cycles += (old_end - old_start); | |
} | |
var middle_new_cycles: u64 = 0; | |
clflush(ty, same_buffer_a); | |
clflush(ty, same_buffer_b); | |
var middle_i: u32 = 0; | |
while (middle_i < iterations_per_byte + warmup_iterations) : (middle_i += 1) { | |
const new_start = rdtsc(); | |
std.mem.doNotOptimizeAway(new_fn(ty, middle_buffer_a, middle_buffer_b)); | |
const new_end = rdtsc(); | |
if (middle_i > warmup_iterations) middle_new_cycles += (new_end - new_start); | |
} | |
var middle_old_cycles: u64 = 0; | |
clflush(ty, same_buffer_a); | |
clflush(ty, same_buffer_b); | |
var middle_j: u32 = 0; | |
while (middle_j < iterations_per_byte + warmup_iterations) : (middle_j += 1) { | |
const old_start = rdtsc(); | |
std.mem.doNotOptimizeAway(old_fn(ty, middle_buffer_a, middle_buffer_b)); | |
const old_end = rdtsc(); | |
if (middle_j > warmup_iterations) middle_old_cycles += (old_end - old_start); | |
} | |
var start_new_cycles: u64 = 0; | |
clflush(ty, same_buffer_a); | |
clflush(ty, same_buffer_b); | |
var start_i: u32 = 0; | |
while (start_i < iterations_per_byte + warmup_iterations) : (start_i += 1) { | |
const new_start = rdtsc(); | |
std.mem.doNotOptimizeAway(new_fn(ty, start_buffer_a, start_buffer_b)); | |
const new_end = rdtsc(); | |
if (start_i > warmup_iterations) start_new_cycles += (new_end - new_start); | |
} | |
var start_old_cycles: u64 = 0; | |
clflush(ty, same_buffer_a); | |
clflush(ty, same_buffer_b); | |
var start_j: u32 = 0; | |
while (start_j < iterations_per_byte + warmup_iterations) : (start_j += 1) { | |
const old_start = rdtsc(); | |
std.mem.doNotOptimizeAway(old_fn(ty, start_buffer_a, start_buffer_b)); | |
const old_end = rdtsc(); | |
if (start_j > warmup_iterations) start_old_cycles += (old_end - old_start); | |
} | |
const same_new_cycles_per_byte = same_new_cycles / iterations_per_byte; | |
const same_old_cycles_per_byte = same_old_cycles / iterations_per_byte; | |
const middle_new_cycles_per_byte = middle_new_cycles / iterations_per_byte; | |
const middle_old_cycles_per_byte = middle_old_cycles / iterations_per_byte; | |
const start_new_cycles_per_byte = start_new_cycles / iterations_per_byte; | |
const start_old_cycles_per_byte = start_old_cycles / iterations_per_byte; | |
try stdout.writer().print("{},{d},{d},{d},{d},{d},{d}\n", .{ | |
index, | |
same_new_cycles_per_byte, | |
same_old_cycles_per_byte, | |
middle_new_cycles_per_byte, | |
middle_old_cycles_per_byte, | |
start_new_cycles_per_byte, | |
start_old_cycles_per_byte, | |
}); | |
main_progress_node.completeOne(); | |
allocator.free(same_buffer_a); | |
allocator.free(same_buffer_b); | |
} | |
} | |
/// Compares two slices and returns whether they are equal. | |
noinline fn eql(comptime T: type, a: []const T, b: []const T) bool { | |
if (a.len != b.len) return false; | |
if (a.len == 0 or a.ptr == b.ptr) return true; | |
if (T == u8) return eqlBytes(a, b); | |
if (@typeInfo(T) == .Int and std.math.isPowerOfTwo(@bitSizeOf(T))) return eqlBytes(std.mem.sliceAsBytes(a), std.mem.sliceAsBytes(b)); | |
for (a, b) |a_elem, b_elem| { | |
if (a_elem != b_elem) return false; | |
} | |
return true; | |
} | |
/// std.mem.eql heavily optimized for slices of bytes. | |
fn eqlBytes(a: []const u8, b: []const u8) bool { | |
if (a.len != b.len) return false; | |
if (a.len == 0 or a.ptr == b.ptr) return true; | |
if (a.len <= 16) { | |
if (a.len < 4) { | |
const x = (a[0] ^ b[0]) | (a[a.len - 1] ^ b[a.len - 1]) | (a[a.len / 2] ^ b[a.len / 2]); | |
return x == 0; | |
} | |
var x: u32 = 0; | |
for ([_]usize{ 0, a.len - 4, (a.len / 8) * 4, a.len - 4 - ((a.len / 8) * 4) }) |n| { | |
x |= @as(u32, @bitCast(a[n..][0..4].*)) ^ @as(u32, @bitCast(b[n..][0..4].*)); | |
} | |
return x == 0; | |
} | |
const v = std.simd.suggestVectorSize(u8) orelse @sizeOf(usize); | |
std.debug.print("", .{}); | |
inline for (1..6) |s| { | |
const n = 16 << s; | |
if (n <= v and a.len <= n) { | |
const V = @Vector(n / 2, u8); | |
var x = @as(V, a[0 .. n / 2].*) ^ @as(V, b[0 .. n / 2].*); | |
x |= @as(V, a[a.len - n / 2 ..][0 .. n / 2].*) ^ @as(V, b[a.len - n / 2 ..][0 .. n / 2].*); | |
return @reduce(.Or, x) == 0; | |
} | |
} | |
const V = @Vector(v, u8); | |
var p: usize = 0; | |
var l = a.len; | |
inline for ([_]usize{ 4, 1 }) |blk| { | |
while (l > v * blk) : ({ | |
l -= v * blk; | |
p += v * blk; | |
}) { | |
if (blk > 1) { | |
@prefetch(a.ptr + p + v * blk, .{ .locality = 3 }); | |
@prefetch(b.ptr + p + v * blk, .{ .locality = 3 }); | |
} | |
var x: V = @splat(0); | |
for (0..blk) |i| x |= @as(V, a[p + (i * v) ..][0..v].*) ^ @as(V, b[p + (i * v) ..][0..v].*); | |
if (@reduce(.Or, x) != 0) return false; | |
} | |
} | |
return @reduce(.Or, @as(V, a[a.len - v ..][0..v].*) ^ @as(V, b[a.len - v ..][0..v].*)) == 0; | |
} | |
// My original idea. | |
noinline fn old_eql(comptime T: type, a: []const T, b: []const T) bool { | |
if (a.len != b.len) return false; | |
if (a.len == 0 or a.ptr == b.ptr) return true; | |
var result: u8 = 0; | |
for (0..a.len) |i| { | |
result |= a[i] ^ b[i]; | |
} | |
return result == 0; | |
} | |
// Stdlib implementation. | |
noinline fn stdlib_eql(comptime T: type, a: []const T, b: []const T) bool { | |
if (a.len != b.len) return false; | |
if (a.ptr == b.ptr) return true; | |
for (a, b) |a_elem, b_elem| { | |
if (a_elem != b_elem) return false; | |
} | |
return true; | |
} | |
// Wrapper for the assembly version. | |
noinline fn asm_eql2(comptime T: type, a: []const T, b: []const T) bool { | |
return asm_eql(a.ptr, a.len, b.ptr, b.len); | |
} | |
/// X86 cycle counter | |
inline fn rdtsc() usize { | |
var a: u32 = undefined; | |
var b: u32 = undefined; | |
asm volatile ("rdtscp" | |
: [a] "={edx}" (a), | |
[b] "={eax}" (b), | |
: | |
: "ecx" | |
); | |
return (@as(u64, a) << 32) | b; | |
} | |
inline fn clflush(comptime T: type, slice: []const T) void { | |
for (0..(slice.len) / @bitSizeOf(T) * 8) |chunk| { | |
const offset = slice.ptr + (chunk * @bitSizeOf(T) * 8); | |
asm volatile ("clflush %[ptr]" | |
: | |
: [ptr] "m" (offset), | |
: "memory" | |
); | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt | |
import matplotlib.ticker as tkr | |
import pandas as pd | |
def sizeof_fmt(x, pos): | |
if x<0: | |
return "" | |
for x_unit in ['bytes', 'kB', 'MB', 'GB', 'TB']: | |
if x < 1024: | |
return "%3.0f %s" % (x, x_unit) | |
x /= 1024 | |
# Main function | |
def main(): | |
# Read the file | |
file = open("data.txt", "r") | |
lines = file.readlines() | |
same_data = [] | |
middle_data = [] | |
start_data = [] | |
for i in range(0, len(lines), 1): | |
line = lines[i] | |
line = line.split(",") | |
size = int(line[0]) | |
same_branch_time = int(line[1]) | |
same_acc_time = int(line[2]) | |
same_data.append([size, same_branch_time, same_acc_time]) | |
middle_branch_time = int(line[3]) | |
middle_acc_time = int(line[4]) | |
middle_data.append([size, middle_branch_time, middle_acc_time]) | |
start_branch_time = int(line[5]) | |
start_acc_time = int(line[6]) | |
start_data.append([size, start_branch_time, start_acc_time]) | |
file.close() | |
start_df = pd.DataFrame(start_data, columns=["size", "branch", "acc"]) | |
middle_df = pd.DataFrame(middle_data, columns=["size", "branch", "acc"]) | |
same_df = pd.DataFrame(same_data, columns=["size", "branch", "acc"]) | |
df_to_plot = "plot you want" | |
print(df_to_plot) | |
# Do the same thing, but ignore deviations of more than 20% | |
plt.plot(df_to_plot["size"], df_to_plot["branch"].rolling(100).mean(), label="new eql") | |
plt.plot(df_to_plot["size"], df_to_plot["acc"].rolling(100).mean(), label="stdlib") | |
# Format the y axis to show the units | |
plt.gca().xaxis.set_major_formatter(tkr.FuncFormatter(sizeof_fmt)) | |
# Label the axes | |
plt.xlabel("Size") | |
plt.ylabel("Cycles") | |
plt.title("-title-") | |
plt.legend() | |
# Save the plot | |
plt.savefig("graph.png") | |
plt.show() | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment