Skip to content

Instantly share code, notes, and snippets.

@Rexicon226
Last active June 17, 2024 14:01
Show Gist options
  • Save Rexicon226/cb433c0a8639345e15224fd16cd66140 to your computer and use it in GitHub Desktop.
optimized memeql benchmark
// zig build-exe benchmark.zig -OReleaseFast -lc
// ./benchmark 4096
const std = @import("std");
// Heap allocator backed by libc malloc — requires linking libc (-lc).
const allocator = std.heap.c_allocator;
// Timed iterations per buffer size, preceded by untimed warmup iterations.
const iterations_per_byte = 1000;
const warmup_iterations = 10;

// Assemble asm_eql.S into this compilation unit so the extern symbol
// below resolves at link time.
comptime {
    asm (@embedFile("./asm_eql.S"));
}

// Hand-written assembly comparison routine defined in asm_eql.S.
extern fn asm_eql(a: [*]const u8, a_len: usize, b: [*]const u8, b_len: usize) bool;
/// Benchmarks `eql` against `stdlib_eql` for every buffer size in
/// [1, max_bytes) and prints one CSV row per size to stdout:
/// size,same_new,same_old,middle_new,middle_old,start_new,start_old
/// Usage: ./benchmark <max_bytes>
pub fn main() !void {
    // Pin the process to a single core (core 0) so rdtsc readings are not
    // skewed by migration between cores.
    const cpu0001: std.os.linux.cpu_set_t = [1]usize{0b0001} ++ ([_]usize{0} ** (16 - 1));
    try std.os.linux.sched_setaffinity(0, &cpu0001);

    const stdout = std.io.getStdOut();
    var progress: std.Progress = .{ .dont_print_on_dumb = true };

    const args = try std.process.argsAlloc(allocator);
    defer std.process.argsFree(allocator, args);
    // Original indexed args[1] unchecked; fail cleanly when the size is missing.
    if (args.len < 2) return error.MissingArgument;
    const max_bytes = try std.fmt.parseInt(usize, args[1], 10);

    const main_progress_node = progress.start("Progress", max_bytes);
    defer main_progress_node.end();

    const ty = u8;
    const new_fn = eql;
    const old_fn = stdlib_eql;

    for (1..max_bytes) |index| {
        // Three scenarios per size: buffers fully equal, first mismatch in
        // the middle, and first mismatch at the very first byte.
        // All six buffers are freed each iteration (the original leaked the
        // middle/start buffers).
        const same_a = try allocator.alloc(ty, index);
        defer allocator.free(same_a);
        @memset(same_a, 0);
        const same_b = try allocator.alloc(ty, index);
        defer allocator.free(same_b);
        @memset(same_b, 0);

        const middle_a = try allocator.alloc(ty, index);
        defer allocator.free(middle_a);
        @memset(middle_a, 0);
        middle_a[index / 2] = 1;
        const middle_b = try allocator.alloc(ty, index);
        defer allocator.free(middle_b);
        @memset(middle_b, 0);

        const start_a = try allocator.alloc(ty, index);
        defer allocator.free(start_a);
        @memset(start_a, 0);
        start_a[0] = 1;
        const start_b = try allocator.alloc(ty, index);
        defer allocator.free(start_b);
        @memset(start_b, 0);

        // The original clflush'd same_buffer_a/b before every section,
        // including the middle/start ones; benchCase flushes the buffers it
        // actually measures.
        const same_new_cycles = benchCase(new_fn, ty, same_a, same_b);
        const same_old_cycles = benchCase(old_fn, ty, same_a, same_b);
        const middle_new_cycles = benchCase(new_fn, ty, middle_a, middle_b);
        const middle_old_cycles = benchCase(old_fn, ty, middle_a, middle_b);
        const start_new_cycles = benchCase(new_fn, ty, start_a, start_b);
        const start_old_cycles = benchCase(old_fn, ty, start_a, start_b);

        try stdout.writer().print("{},{d},{d},{d},{d},{d},{d}\n", .{
            index,
            same_new_cycles / iterations_per_byte,
            same_old_cycles / iterations_per_byte,
            middle_new_cycles / iterations_per_byte,
            middle_old_cycles / iterations_per_byte,
            start_new_cycles / iterations_per_byte,
            start_old_cycles / iterations_per_byte,
        });
        main_progress_node.completeOne();
    }
}

/// Flushes both buffers from the cache, then times `iterations_per_byte`
/// calls of `func(T, a, b)` with rdtsc, discarding the first
/// `warmup_iterations` runs. Returns the accumulated cycle count.
fn benchCase(comptime func: anytype, comptime T: type, a: []const T, b: []const T) u64 {
    clflush(T, a);
    clflush(T, b);
    var cycles: u64 = 0;
    var i: u32 = 0;
    while (i < iterations_per_byte + warmup_iterations) : (i += 1) {
        const t0 = rdtsc();
        std.mem.doNotOptimizeAway(func(T, a, b));
        const t1 = rdtsc();
        // `>=` counts exactly iterations_per_byte samples; the original `>`
        // dropped one sample yet still divided by iterations_per_byte.
        if (i >= warmup_iterations) cycles += (t1 - t0);
    }
    return cycles;
}
/// Reports whether two slices contain equal elements.
/// Byte slices and power-of-two-sized integer slices are dispatched to the
/// optimized byte comparison; everything else falls back to an element loop.
noinline fn eql(comptime T: type, a: []const T, b: []const T) bool {
    if (a.len != b.len) return false;
    if (a.len == 0 or a.ptr == b.ptr) return true;
    if (T == u8) return eqlBytes(a, b);
    if (@typeInfo(T) == .Int and std.math.isPowerOfTwo(@bitSizeOf(T))) {
        return eqlBytes(std.mem.sliceAsBytes(a), std.mem.sliceAsBytes(b));
    }
    // Generic fallback: element-wise comparison.
    var idx: usize = 0;
    while (idx < a.len) : (idx += 1) {
        if (a[idx] != b[idx]) return false;
    }
    return true;
}
/// std.mem.eql heavily optimized for slices of bytes.
/// Strategy: 1-3 bytes use three overlapping scalar loads; 4-16 bytes use
/// four overlapping 4-byte words; lengths up to the SIMD width use two
/// overlapping half-width vector loads; longer inputs are scanned in
/// prefetched 4-vector blocks, then single vectors, then one overlapping
/// vector load at the tail. Differences are OR-accumulated so the checks
/// stay branch-light.
fn eqlBytes(a: []const u8, b: []const u8) bool {
    if (a.len != b.len) return false;
    if (a.len == 0 or a.ptr == b.ptr) return true;

    if (a.len <= 16) {
        if (a.len < 4) {
            // First, last and middle byte together cover lengths 1..3.
            const x = (a[0] ^ b[0]) | (a[a.len - 1] ^ b[a.len - 1]) | (a[a.len / 2] ^ b[a.len / 2]);
            return x == 0;
        }
        // Four (possibly overlapping) 4-byte words cover lengths 4..16.
        var x: u32 = 0;
        for ([_]usize{ 0, a.len - 4, (a.len / 8) * 4, a.len - 4 - ((a.len / 8) * 4) }) |n| {
            x |= @as(u32, @bitCast(a[n..][0..4].*)) ^ @as(u32, @bitCast(b[n..][0..4].*));
        }
        return x == 0;
    }

    const v = std.simd.suggestVectorSize(u8) orelse @sizeOf(usize);
    // NOTE: removed a stray `std.debug.print("", .{});` that was left here —
    // debug residue doing I/O on every call in the hot path.

    // Lengths in (16, v]: two overlapping half-width vector loads.
    inline for (1..6) |s| {
        const n = 16 << s;
        if (n <= v and a.len <= n) {
            const V = @Vector(n / 2, u8);
            var x = @as(V, a[0 .. n / 2].*) ^ @as(V, b[0 .. n / 2].*);
            x |= @as(V, a[a.len - n / 2 ..][0 .. n / 2].*) ^ @as(V, b[a.len - n / 2 ..][0 .. n / 2].*);
            return @reduce(.Or, x) == 0;
        }
    }

    // Longer inputs: 4-vector blocks with prefetch, then single vectors.
    const V = @Vector(v, u8);
    var p: usize = 0;
    var l = a.len;
    inline for ([_]usize{ 4, 1 }) |blk| {
        while (l > v * blk) : ({
            l -= v * blk;
            p += v * blk;
        }) {
            if (blk > 1) {
                @prefetch(a.ptr + p + v * blk, .{ .locality = 3 });
                @prefetch(b.ptr + p + v * blk, .{ .locality = 3 });
            }
            var x: V = @splat(0);
            for (0..blk) |i| x |= @as(V, a[p + (i * v) ..][0..v].*) ^ @as(V, b[p + (i * v) ..][0..v].*);
            if (@reduce(.Or, x) != 0) return false;
        }
    }
    // Final (possibly overlapping) vector load covers the tail.
    return @reduce(.Or, @as(V, a[a.len - v ..][0..v].*) ^ @as(V, b[a.len - v ..][0..v].*)) == 0;
}
// My original idea.
/// Branchless comparison: OR-accumulates the XOR of every element pair and
/// checks the accumulator once at the end.
noinline fn old_eql(comptime T: type, a: []const T, b: []const T) bool {
    if (a.len != b.len) return false;
    if (a.len == 0 or a.ptr == b.ptr) return true;
    var diff: u8 = 0;
    for (a, b) |lhs, rhs| diff |= lhs ^ rhs;
    return diff == 0;
}
// Stdlib implementation.
/// Early-exit comparison: returns false at the first mismatching element.
noinline fn stdlib_eql(comptime T: type, a: []const T, b: []const T) bool {
    if (a.len != b.len) return false;
    if (a.ptr == b.ptr) return true;
    for (0..a.len) |i| {
        if (a[i] != b[i]) return false;
    }
    return true;
}
// Wrapper for the assembly version.
/// Adapts the extern `asm_eql` symbol to the same
/// `(comptime T, []const T, []const T)` shape as the other candidates so it
/// can be benchmarked interchangeably.
/// NOTE(review): `a.ptr` is `[*]const T` while `asm_eql` takes `[*]const u8`,
/// so this presumably only instantiates for T == u8 — confirm callers.
noinline fn asm_eql2(comptime T: type, a: []const T, b: []const T) bool {
    return asm_eql(a.ptr, a.len, b.ptr, b.len);
}
/// X86 cycle counter
/// Reads the time-stamp counter with `rdtscp`, which per the Intel SDM waits
/// for all previous instructions to execute before reading — making it
/// suitable for timing the code that precedes it. EDX receives the high
/// 32 bits, EAX the low 32 bits; ECX is clobbered because the CPU writes
/// IA32_TSC_AUX into it.
inline fn rdtsc() usize {
    var a: u32 = undefined; // high 32 bits (EDX)
    var b: u32 = undefined; // low 32 bits (EAX)
    asm volatile ("rdtscp"
        : [a] "={edx}" (a),
          [b] "={eax}" (b),
        :
        : "ecx"
    );
    return (@as(u64, a) << 32) | b;
}
/// Evicts every cache line backing `slice` from the cache hierarchy so the
/// next access measures a cold read.
/// Fixes two bugs in the original: (1) the chunk count
/// `(len / @bitSizeOf(T)) * 8` rounded len down to a multiple of 8 while the
/// offset advanced 64 bytes per chunk, flushing far past the end of the
/// slice; (2) the `"m" (offset)` operand made `clflush` flush the stack slot
/// holding the pointer variable itself rather than the pointed-to line.
inline fn clflush(comptime T: type, slice: []const T) void {
    const cache_line_size = 64; // x86 cache line size in bytes
    const base: [*]const u8 = @ptrCast(slice.ptr);
    const total_bytes = slice.len * @sizeOf(T);
    var offset: usize = 0;
    while (offset < total_bytes) : (offset += cache_line_size) {
        asm volatile ("clflush (%[ptr])"
            :
            : [ptr] "r" (base + offset),
            : "memory"
        );
    }
}
import matplotlib.pyplot as plt
import matplotlib.ticker as tkr
import pandas as pd
def sizeof_fmt(x, pos):
    """Format a byte count as a human-readable axis label.

    Matplotlib ``FuncFormatter`` callback: ``pos`` is the tick position and
    is unused. Negative values yield an empty label.
    """
    if x < 0:
        return ""
    for x_unit in ['bytes', 'kB', 'MB', 'GB', 'TB', 'PB']:
        if x < 1024:
            return "%3.0f %s" % (x, x_unit)
        x /= 1024
    # Original fell off the loop and returned None for huge values; give a
    # final fallback unit instead.
    return "%3.0f EB" % x
# Main function
def main(scenario="same"):
    """Read benchmark CSV rows from data.txt and plot rolling-mean cycles.

    Each line of data.txt is: size, same_new, same_old, middle_new,
    middle_old, start_new, start_old (as produced by the Zig benchmark).

    ``scenario`` selects which data set to plot: "same", "middle" or
    "start". The original assigned the placeholder string "plot you want"
    to ``df_to_plot``, which crashed when indexed below.
    """
    same_data = []
    middle_data = []
    start_data = []
    # `with` guarantees the file is closed even if a line fails to parse.
    with open("data.txt", "r") as file:
        for line in file:
            fields = line.split(",")
            size = int(fields[0])
            same_data.append([size, int(fields[1]), int(fields[2])])
            middle_data.append([size, int(fields[3]), int(fields[4])])
            start_data.append([size, int(fields[5]), int(fields[6])])
    frames = {
        "same": pd.DataFrame(same_data, columns=["size", "branch", "acc"]),
        "middle": pd.DataFrame(middle_data, columns=["size", "branch", "acc"]),
        "start": pd.DataFrame(start_data, columns=["size", "branch", "acc"]),
    }
    df_to_plot = frames[scenario]
    print(df_to_plot)
    # Rolling mean over 100 sizes smooths out measurement noise.
    plt.plot(df_to_plot["size"], df_to_plot["branch"].rolling(100).mean(), label="new eql")
    plt.plot(df_to_plot["size"], df_to_plot["acc"].rolling(100).mean(), label="stdlib")
    # Format the x axis (sizes) with byte units.
    plt.gca().xaxis.set_major_formatter(tkr.FuncFormatter(sizeof_fmt))
    # Label the axes
    plt.xlabel("Size")
    plt.ylabel("Cycles")
    plt.title("-title-")
    plt.legend()
    # Save the plot
    plt.savefig("graph.png")
    plt.show()


main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment