Skip to content

Instantly share code, notes, and snippets.

@Rexicon226
Last active June 17, 2024 14:01
Show Gist options
  • Save Rexicon226/cb433c0a8639345e15224fd16cd66140 to your computer and use it in GitHub Desktop.
optimized memeql benchmark
// zig build-exe benchmark.zig -OReleaseFast -lc
// ./benchmark 4096
const std = @import("std");
// Heap allocator backed by libc malloc — requires linking libc (-lc).
const allocator = std.heap.c_allocator;
// Timed iterations per buffer size, preceded by untimed warmup iterations.
const iterations_per_byte = 1000;
const warmup_iterations = 10;

// Assemble asm_eql.S into this compilation unit so the extern symbol
// below resolves at link time.
comptime {
    asm (@embedFile("./asm_eql.S"));
}

// Hand-written assembly comparison routine defined in asm_eql.S.
extern fn asm_eql(a: [*]const u8, a_len: usize, b: [*]const u8, b_len: usize) bool;
/// Benchmarks `eql` against `stdlib_eql` for every buffer size in
/// [1, max_bytes) and prints one CSV row per size to stdout:
/// size,same_new,same_old,middle_new,middle_old,start_new,start_old
/// Usage: ./benchmark <max_bytes>
pub fn main() !void {
    // Pin the process to a single core (core 0) so rdtsc readings are not
    // skewed by migration between cores.
    const cpu0001: std.os.linux.cpu_set_t = [1]usize{0b0001} ++ ([_]usize{0} ** (16 - 1));
    try std.os.linux.sched_setaffinity(0, &cpu0001);

    const stdout = std.io.getStdOut();
    var progress: std.Progress = .{ .dont_print_on_dumb = true };

    const args = try std.process.argsAlloc(allocator);
    defer std.process.argsFree(allocator, args);
    // Original indexed args[1] unchecked; fail cleanly when the size is missing.
    if (args.len < 2) return error.MissingArgument;
    const max_bytes = try std.fmt.parseInt(usize, args[1], 10);

    const main_progress_node = progress.start("Progress", max_bytes);
    defer main_progress_node.end();

    const ty = u8;
    const new_fn = eql;
    const old_fn = stdlib_eql;

    for (1..max_bytes) |index| {
        // Three scenarios per size: buffers fully equal, first mismatch in
        // the middle, and first mismatch at the very first byte.
        // All six buffers are freed each iteration (the original leaked the
        // middle/start buffers).
        const same_a = try allocator.alloc(ty, index);
        defer allocator.free(same_a);
        @memset(same_a, 0);
        const same_b = try allocator.alloc(ty, index);
        defer allocator.free(same_b);
        @memset(same_b, 0);

        const middle_a = try allocator.alloc(ty, index);
        defer allocator.free(middle_a);
        @memset(middle_a, 0);
        middle_a[index / 2] = 1;
        const middle_b = try allocator.alloc(ty, index);
        defer allocator.free(middle_b);
        @memset(middle_b, 0);

        const start_a = try allocator.alloc(ty, index);
        defer allocator.free(start_a);
        @memset(start_a, 0);
        start_a[0] = 1;
        const start_b = try allocator.alloc(ty, index);
        defer allocator.free(start_b);
        @memset(start_b, 0);

        // The original clflush'd same_buffer_a/b before every section,
        // including the middle/start ones; benchCase flushes the buffers it
        // actually measures.
        const same_new_cycles = benchCase(new_fn, ty, same_a, same_b);
        const same_old_cycles = benchCase(old_fn, ty, same_a, same_b);
        const middle_new_cycles = benchCase(new_fn, ty, middle_a, middle_b);
        const middle_old_cycles = benchCase(old_fn, ty, middle_a, middle_b);
        const start_new_cycles = benchCase(new_fn, ty, start_a, start_b);
        const start_old_cycles = benchCase(old_fn, ty, start_a, start_b);

        try stdout.writer().print("{},{d},{d},{d},{d},{d},{d}\n", .{
            index,
            same_new_cycles / iterations_per_byte,
            same_old_cycles / iterations_per_byte,
            middle_new_cycles / iterations_per_byte,
            middle_old_cycles / iterations_per_byte,
            start_new_cycles / iterations_per_byte,
            start_old_cycles / iterations_per_byte,
        });
        main_progress_node.completeOne();
    }
}

/// Flushes both buffers from the cache, then times `iterations_per_byte`
/// calls of `func(T, a, b)` with rdtsc, discarding the first
/// `warmup_iterations` runs. Returns the accumulated cycle count.
fn benchCase(comptime func: anytype, comptime T: type, a: []const T, b: []const T) u64 {
    clflush(T, a);
    clflush(T, b);
    var cycles: u64 = 0;
    var i: u32 = 0;
    while (i < iterations_per_byte + warmup_iterations) : (i += 1) {
        const t0 = rdtsc();
        std.mem.doNotOptimizeAway(func(T, a, b));
        const t1 = rdtsc();
        // `>=` counts exactly iterations_per_byte samples; the original `>`
        // dropped one sample yet still divided by iterations_per_byte.
        if (i >= warmup_iterations) cycles += (t1 - t0);
    }
    return cycles;
}
/// Reports whether two slices contain equal elements.
/// Byte slices and power-of-two-sized integer slices are dispatched to the
/// optimized byte comparison; everything else falls back to an element loop.
noinline fn eql(comptime T: type, a: []const T, b: []const T) bool {
    if (a.len != b.len) return false;
    if (a.len == 0 or a.ptr == b.ptr) return true;
    if (T == u8) return eqlBytes(a, b);
    if (@typeInfo(T) == .Int and std.math.isPowerOfTwo(@bitSizeOf(T))) {
        return eqlBytes(std.mem.sliceAsBytes(a), std.mem.sliceAsBytes(b));
    }
    // Generic fallback: element-wise comparison.
    var idx: usize = 0;
    while (idx < a.len) : (idx += 1) {
        if (a[idx] != b[idx]) return false;
    }
    return true;
}
/// std.mem.eql heavily optimized for slices of bytes.
/// Strategy: 1-3 bytes use three overlapping scalar loads; 4-16 bytes use
/// four overlapping 4-byte words; lengths up to the SIMD width use two
/// overlapping half-width vector loads; longer inputs are scanned in
/// prefetched 4-vector blocks, then single vectors, then one overlapping
/// vector load at the tail. Differences are OR-accumulated so the checks
/// stay branch-light.
fn eqlBytes(a: []const u8, b: []const u8) bool {
    if (a.len != b.len) return false;
    if (a.len == 0 or a.ptr == b.ptr) return true;

    if (a.len <= 16) {
        if (a.len < 4) {
            // First, last and middle byte together cover lengths 1..3.
            const x = (a[0] ^ b[0]) | (a[a.len - 1] ^ b[a.len - 1]) | (a[a.len / 2] ^ b[a.len / 2]);
            return x == 0;
        }
        // Four (possibly overlapping) 4-byte words cover lengths 4..16.
        var x: u32 = 0;
        for ([_]usize{ 0, a.len - 4, (a.len / 8) * 4, a.len - 4 - ((a.len / 8) * 4) }) |n| {
            x |= @as(u32, @bitCast(a[n..][0..4].*)) ^ @as(u32, @bitCast(b[n..][0..4].*));
        }
        return x == 0;
    }

    const v = std.simd.suggestVectorSize(u8) orelse @sizeOf(usize);
    // NOTE: removed a stray `std.debug.print("", .{});` that was left here —
    // debug residue doing I/O on every call in the hot path.

    // Lengths in (16, v]: two overlapping half-width vector loads.
    inline for (1..6) |s| {
        const n = 16 << s;
        if (n <= v and a.len <= n) {
            const V = @Vector(n / 2, u8);
            var x = @as(V, a[0 .. n / 2].*) ^ @as(V, b[0 .. n / 2].*);
            x |= @as(V, a[a.len - n / 2 ..][0 .. n / 2].*) ^ @as(V, b[a.len - n / 2 ..][0 .. n / 2].*);
            return @reduce(.Or, x) == 0;
        }
    }

    // Longer inputs: 4-vector blocks with prefetch, then single vectors.
    const V = @Vector(v, u8);
    var p: usize = 0;
    var l = a.len;
    inline for ([_]usize{ 4, 1 }) |blk| {
        while (l > v * blk) : ({
            l -= v * blk;
            p += v * blk;
        }) {
            if (blk > 1) {
                @prefetch(a.ptr + p + v * blk, .{ .locality = 3 });
                @prefetch(b.ptr + p + v * blk, .{ .locality = 3 });
            }
            var x: V = @splat(0);
            for (0..blk) |i| x |= @as(V, a[p + (i * v) ..][0..v].*) ^ @as(V, b[p + (i * v) ..][0..v].*);
            if (@reduce(.Or, x) != 0) return false;
        }
    }
    // Final (possibly overlapping) vector load covers the tail.
    return @reduce(.Or, @as(V, a[a.len - v ..][0..v].*) ^ @as(V, b[a.len - v ..][0..v].*)) == 0;
}
// My original idea.
/// Branchless comparison: OR-accumulates the XOR of every element pair and
/// checks the accumulator once at the end.
noinline fn old_eql(comptime T: type, a: []const T, b: []const T) bool {
    if (a.len != b.len) return false;
    if (a.len == 0 or a.ptr == b.ptr) return true;
    var diff: u8 = 0;
    for (a, b) |lhs, rhs| diff |= lhs ^ rhs;
    return diff == 0;
}
// Stdlib implementation.
/// Early-exit comparison: returns false at the first mismatching element.
noinline fn stdlib_eql(comptime T: type, a: []const T, b: []const T) bool {
    if (a.len != b.len) return false;
    if (a.ptr == b.ptr) return true;
    for (0..a.len) |i| {
        if (a[i] != b[i]) return false;
    }
    return true;
}
// Wrapper for the assembly version.
/// Adapts the extern `asm_eql` symbol to the same
/// `(comptime T, []const T, []const T)` shape as the other candidates so it
/// can be benchmarked interchangeably.
/// NOTE(review): `a.ptr` is `[*]const T` while `asm_eql` takes `[*]const u8`,
/// so this presumably only instantiates for T == u8 — confirm callers.
noinline fn asm_eql2(comptime T: type, a: []const T, b: []const T) bool {
    return asm_eql(a.ptr, a.len, b.ptr, b.len);
}
/// X86 cycle counter
/// Reads the time-stamp counter with `rdtscp`, which per the Intel SDM waits
/// for all previous instructions to execute before reading — making it
/// suitable for timing the code that precedes it. EDX receives the high
/// 32 bits, EAX the low 32 bits; ECX is clobbered because the CPU writes
/// IA32_TSC_AUX into it.
inline fn rdtsc() usize {
    var a: u32 = undefined; // high 32 bits (EDX)
    var b: u32 = undefined; // low 32 bits (EAX)
    asm volatile ("rdtscp"
        : [a] "={edx}" (a),
          [b] "={eax}" (b),
        :
        : "ecx"
    );
    return (@as(u64, a) << 32) | b;
}
/// Evicts every cache line backing `slice` from the cache hierarchy so the
/// next access measures a cold read.
/// Fixes two bugs in the original: (1) the chunk count
/// `(len / @bitSizeOf(T)) * 8` rounded len down to a multiple of 8 while the
/// offset advanced 64 bytes per chunk, flushing far past the end of the
/// slice; (2) the `"m" (offset)` operand made `clflush` flush the stack slot
/// holding the pointer variable itself rather than the pointed-to line.
inline fn clflush(comptime T: type, slice: []const T) void {
    const cache_line_size = 64; // x86 cache line size in bytes
    const base: [*]const u8 = @ptrCast(slice.ptr);
    const total_bytes = slice.len * @sizeOf(T);
    var offset: usize = 0;
    while (offset < total_bytes) : (offset += cache_line_size) {
        asm volatile ("clflush (%[ptr])"
            :
            : [ptr] "r" (base + offset),
            : "memory"
        );
    }
}
import matplotlib.pyplot as plt
import matplotlib.ticker as tkr
import pandas as pd
def sizeof_fmt(x, pos):
    """Format a byte count as a human-readable axis label.

    Matplotlib ``FuncFormatter`` callback: ``pos`` is the tick position and
    is unused. Negative values yield an empty label.
    """
    if x < 0:
        return ""
    for x_unit in ['bytes', 'kB', 'MB', 'GB', 'TB', 'PB']:
        if x < 1024:
            return "%3.0f %s" % (x, x_unit)
        x /= 1024
    # Original fell off the loop and returned None for huge values; give a
    # final fallback unit instead.
    return "%3.0f EB" % x
# Main function
def main(scenario="same"):
    """Read benchmark CSV rows from data.txt and plot rolling-mean cycles.

    Each line of data.txt is: size, same_new, same_old, middle_new,
    middle_old, start_new, start_old (as produced by the Zig benchmark).

    ``scenario`` selects which data set to plot: "same", "middle" or
    "start". The original assigned the placeholder string "plot you want"
    to ``df_to_plot``, which crashed when indexed below.
    """
    same_data = []
    middle_data = []
    start_data = []
    # `with` guarantees the file is closed even if a line fails to parse.
    with open("data.txt", "r") as file:
        for line in file:
            fields = line.split(",")
            size = int(fields[0])
            same_data.append([size, int(fields[1]), int(fields[2])])
            middle_data.append([size, int(fields[3]), int(fields[4])])
            start_data.append([size, int(fields[5]), int(fields[6])])
    frames = {
        "same": pd.DataFrame(same_data, columns=["size", "branch", "acc"]),
        "middle": pd.DataFrame(middle_data, columns=["size", "branch", "acc"]),
        "start": pd.DataFrame(start_data, columns=["size", "branch", "acc"]),
    }
    df_to_plot = frames[scenario]
    print(df_to_plot)
    # Rolling mean over 100 sizes smooths out measurement noise.
    plt.plot(df_to_plot["size"], df_to_plot["branch"].rolling(100).mean(), label="new eql")
    plt.plot(df_to_plot["size"], df_to_plot["acc"].rolling(100).mean(), label="stdlib")
    # Format the x axis (sizes) with byte units.
    plt.gca().xaxis.set_major_formatter(tkr.FuncFormatter(sizeof_fmt))
    # Label the axes
    plt.xlabel("Size")
    plt.ylabel("Cycles")
    plt.title("-title-")
    plt.legend()
    # Save the plot
    plt.savefig("graph.png")
    plt.show()


main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment