@Rexicon226
Created August 28, 2024 01:10
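//! Micro-benchmark comparing four memcpy implementations (a folly-style copy,
//! the `memcpy` symbol the binary links against, protty's, and oak's) across
//! buffer sizes 1..max_bytes. Each call is timed with `rdtscp` on a single
//! pinned core and the per-size averages are written to stdout as CSV.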
const std = @import("std");
const iterations_per_byte = 1000;
const warmup_iterations = 10;
pub fn main() !void {
    // Pin the process to core 0 to reduce scheduling noise.
    const cpu0001: std.os.linux.cpu_set_t = [1]usize{0b0001} ++ ([_]usize{0} ** (16 - 1));
    try std.os.linux.sched_setaffinity(0, &cpu0001);

    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();
    const allocator = arena.allocator();

    const stdout = std.io.getStdOut().writer();

    const args = try std.process.argsAlloc(allocator);
    defer std.process.argsFree(allocator, args);
    const max_bytes = try std.fmt.parseInt(usize, args[1], 10);

    const T = u8;
    for (1..max_bytes) |index| {
        const buffer_a = try allocator.alloc(T, index);
        @memset(buffer_a, 0);
        const buffer_b = try allocator.alloc(T, index);
        @memset(buffer_b, 0xAA);

        try stdout.print("{},", .{index});

        inline for (.{
            folly_memcpy,
            rt_memcpy,
            protty_memcpy,
            oak_memcpy,
        }) |func| {
            clflush(T, buffer_a);
            clflush(T, buffer_b);

            var cycles: u64 = 0;
            var i: u32 = 0;
            while (i < iterations_per_byte + warmup_iterations) : (i += 1) {
                const start = rdtsc();
                std.mem.doNotOptimizeAway(func(buffer_a.ptr, buffer_b.ptr, index));
                const end = rdtsc();
                // Only accumulate cycles once the warmup iterations are done.
                if (i >= warmup_iterations) cycles += (end - start);
            }
            try stdout.print("{},", .{cycles / iterations_per_byte});
        }
        try stdout.print("\n", .{});

        allocator.free(buffer_a);
        allocator.free(buffer_b);
    }
}
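// Example invocation (a sketch; the file name and flags below are assumptions,
// and an x86-64 CPU with AVX2 helps the 32-byte vector paths):
//
//   zig build-exe memcpy_bench.zig -O ReleaseFast -mcpu=x86_64_v3
//   ./memcpy_bench 512 > results.csv
//
// Each output row reads: size, folly_memcpy, rt_memcpy, protty_memcpy, oak_memcpy,
// where the implementation columns are average cycles per call at that size.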
/// folly-style memcpy: dispatches on size class and copies overlapping
/// head/tail blocks so every length within a class takes the same path.
export fn folly_memcpy(maybe_dest: ?[*]u8, maybe_src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 {
    if (len == 0) {
        @branchHint(.unlikely);
        return maybe_dest;
    }
    const dest = maybe_dest.?;
    const src = maybe_src.?;

    if (len < 8) {
        @branchHint(.unlikely);
        if (len == 1) {
            @branchHint(.unlikely);
            dest[0] = src[0];
        } else if (len >= 4) {
            @branchHint(.unlikely);
            blockCopy(dest, src, 4, len);
        } else {
            blockCopy(dest, src, 2, len);
        }
        return dest;
    }

    if (len > 32) {
        @branchHint(.unlikely);
        if (len > 256) {
            @branchHint(.unlikely);
            copyMove(dest, src, len);
            return dest;
        }
        copyLong(dest, src, len);
        return dest;
    }

    if (len > 16) {
        @branchHint(.unlikely);
        blockCopy(dest, src, 16, len);
        return dest;
    }
    blockCopy(dest, src, 8, len);
    return dest;
}
inline fn blockCopy(dest: [*]u8, src: [*]const u8, block_size: comptime_int, len: usize) void {
    const first = @as(*align(1) const @Vector(block_size, u8), src[0..block_size]).*;
    const second = @as(*align(1) const @Vector(block_size, u8), src[len - block_size ..][0..block_size]).*;
    dest[0..block_size].* = first;
    dest[len - block_size ..][0..block_size].* = second;
}
inline fn copyLong(dest: [*]u8, src: [*]const u8, len: usize) void {
    var array: [8]@Vector(32, u8) = undefined;
    inline for (.{ 64, 128, 192, 256 }, 0..) |N, i| {
        array[i * 2] = src[(N / 2) - 32 ..][0..32].*;
        array[(i * 2) + 1] = src[len - N / 2 ..][0..32].*;
        if (len <= N) {
            @branchHint(.unlikely);
            for (0..i + 1) |j| {
                dest[j * 32 ..][0..32].* = array[j * 2];
                dest[len - ((j * 32) + 32) ..][0..32].* = array[(j * 2) + 1];
            }
            return;
        }
    }
}
inline fn copyMove(dest: [*]u8, src: [*]const u8, len: usize) void {
    if (@intFromPtr(src) >= @intFromPtr(dest)) {
        @branchHint(.unlikely);
        copyForward(dest, src, len);
    } else if (@intFromPtr(src) + len > @intFromPtr(dest)) {
        @branchHint(.unlikely);
        overlapBwd(dest, src, len);
    } else {
        copyForward(dest, src, len);
    }
}
inline fn copyForward(dest: [*]u8, src: [*]const u8, len: usize) void {
    const tail: @Vector(32, u8) = src[len - 32 ..][0..32].*;
    const N: usize = len & ~@as(usize, 127);
    var i: usize = 0;
    while (i < N) : (i += 128) {
        dest[i..][0..32].* = src[i..][0..32].*;
        dest[i + 32 ..][0..32].* = src[i + 32 ..][0..32].*;
        dest[i + 64 ..][0..32].* = src[i + 64 ..][0..32].*;
        dest[i + 96 ..][0..32].* = src[i + 96 ..][0..32].*;
    }
    if (len - i <= 32) {
        @branchHint(.unlikely);
        dest[len - 32 ..][0..32].* = tail;
    } else {
        copyLong(dest[i..], src[i..], len - i);
    }
}
inline fn overlapBwd(dest: [*]u8, src: [*]const u8, len: usize) void {
    var array: [5]@Vector(32, u8) = undefined;
    array[0] = src[len - 32 ..][0..32].*;
    inline for (1..5) |i| array[i] = src[(i - 1) << 5 ..][0..32].*;

    const end: usize = (@intFromPtr(dest) + len - 32) & 31;
    const range = len - end;
    var s = src + range;
    var d = dest + range;
    while (@intFromPtr(s) > @intFromPtr(src + 128)) {
        // zig fmt: off
        const first  = @as(*align(1) const @Vector(32, u8), @ptrCast(s - 32)).*;
        const second = @as(*align(1) const @Vector(32, u8), @ptrCast(s - 64)).*;
        const third  = @as(*align(1) const @Vector(32, u8), @ptrCast(s - 96)).*;
        const fourth = @as(*align(1) const @Vector(32, u8), @ptrCast(s - 128)).*;

        @as(*align(32) @Vector(32, u8), @alignCast(@ptrCast(d - 32))).*  = first;
        @as(*align(32) @Vector(32, u8), @alignCast(@ptrCast(d - 64))).*  = second;
        @as(*align(32) @Vector(32, u8), @alignCast(@ptrCast(d - 96))).*  = third;
        @as(*align(32) @Vector(32, u8), @alignCast(@ptrCast(d - 128))).* = fourth;
        // zig fmt: on
        s -= 128;
        d -= 128;
    }

    inline for (array[1..], 0..) |vec, i| dest[i * 32 ..][0..32].* = vec;
    dest[len - 32 ..][0..32].* = array[0];
}
/// Whatever `memcpy` symbol the binary links against (libc or compiler-rt).
const rt_memcpy = @extern(
    *const fn (noalias ?[*]u8, noalias ?[*]const u8, len: usize) callconv(.C) ?[*]u8,
    .{ .name = "memcpy", .linkage = .strong },
);
export fn protty_memcpy(noalias dst: ?[*]u8, noalias src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 {
    @setRuntimeSafety(false);
    if (len <= 16) {
        if (len < 4) {
            // handles sizes under a u32.
            if (len > 0) {
                copy_blocks(dst, src, 1, [_]usize{ 0, len / 2, len - 1 });
            }
        } else {
            // handles sizes from u32 to Vector(16).
            const mid = (len / 8) * 4;
            copy_blocks(dst, src, 4, [_]usize{ 0, mid, len - 4, len - 4 - mid });
        }
    } else if (len <= 64) {
        // handles sizes from Vector(16) to cache_line.
        const mid = (len / 32) * 16;
        copy_blocks(dst, src, 16, [_]usize{ 0, mid, len - 16, len - 16 - mid });
    } else {
        // handles sizes over a cache_line.
        for (0..(len - 1) / 128) |i| {
            std.mem.doNotOptimizeAway(i); // prevent LLVM from bloating loop via auto-vectorization.
            copy_blocks(dst, src, 128, [_]usize{i * 128});
        }
        copy_blocks(dst, src, 64, [_]usize{ len -| 128, len - 64 });
    }
    return dst;
}
inline fn copy_blocks(dst: ?[*]u8, src: ?[*]const u8, comptime size: usize, offsets: anytype) void {
    var blocks: [offsets.len]@Vector(size, u8) = undefined; // Vector over [N]u8 for good codegen.
    for (offsets, &blocks) |offset, *block| block.* = src.?[offset..][0..size].*;
    for (offsets, blocks) |offset, block| dst.?[offset..][0..size].* = block;
}
/// The copy unit for oak_memcpy: the target's suggested byte vector,
/// or `usize` when the target has no usable SIMD registers.
const CopyType = if (std.simd.suggestVectorLength(u8)) |vec_size|
    @Type(.{ .Vector = .{
        .child = u8,
        .len = vec_size,
    } })
else
    usize;
export fn oak_memcpy(noalias dest: ?[*]u8, noalias src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 {
    if (len <= 16) {
        if (len <= 4) {
            if (len <= 2) {
                if (len == 0) return dest;
                memcpy_range2(1, dest.?, src.?, len);
            } else {
                memcpy_range2(2, dest.?, src.?, len);
            }
        } else if (len <= 8) {
            memcpy_range2(4, dest.?, src.?, len);
        } else {
            memcpy_range2(8, dest.?, src.?, len);
        }
        return dest;
    }

    const unroll_count = 2;
    const alignment = @alignOf(CopyType);
    const size = @sizeOf(CopyType);

    if (comptime 5 <= std.math.log2(unroll_count * size)) {
        inline for (5..std.math.log2(unroll_count * size) + 1) |p| {
            const limit = 1 << p;
            if (len <= limit) {
                memcpy_range2(limit / 2, dest.?, src.?, len);
                return dest;
            }
        }
    }
    std.debug.assert(unroll_count * size < len);

    // we know that `len > 2 * size` and `size >= alignment`
    // so we can safely align `s` to `alignment`
    dest.?[0..size].* = src.?[0..size].*;
    const alignment_offset = alignment - @intFromPtr(src.?) % alignment;
    const n = len - alignment_offset;
    const d = dest.? + alignment_offset;
    const s = src.? + alignment_offset;

    if (@intFromPtr(d) % alignment == 0) {
        memcpy_aligned(@alignCast(@ptrCast(d)), @alignCast(@ptrCast(s)), n, unroll_count);
    } else {
        memcpy_unaligned(@ptrCast(d), @alignCast(@ptrCast(s)), n, unroll_count);
    }
    dest.?[len - size ..][0..size].* = src.?[len - size ..][0..size].*;
    return dest;
}
// inline is needed to prevent LLVM from turning this into an infinitely recursive call to memcpy
inline fn memcpy_aligned(
    noalias dest: [*]CopyType,
    noalias src: [*]const CopyType,
    max_bytes: usize,
    comptime unroll_count: comptime_int,
) void {
    memcpy_blocks(dest, src, max_bytes, unroll_count);
}

inline fn memcpy_unaligned(
    noalias dest: [*]align(1) CopyType,
    noalias src: [*]const CopyType,
    max_bytes: usize,
    comptime unroll_count: comptime_int,
) void {
    memcpy_blocks(dest, src, max_bytes, unroll_count);
}
/// Copies a multiple of `@sizeOf(T)` bytes from `src` to `dest`, where `T` is
/// the child type of `src` and `dest`. No more than `max_bytes` will be copied
/// (`max_bytes` need not be a multiple of `@sizeOf(T)`) but `max_bytes` must
/// be at least `@sizeOf(T)`.
inline fn memcpy_blocks(
    noalias dest: anytype,
    noalias src: anytype,
    max_bytes: usize,
    comptime unroll_count: comptime_int,
) void {
    comptime std.debug.assert(unroll_count > 0);
    const T = @typeInfo(@TypeOf(dest)).Pointer.child;
    comptime std.debug.assert(T == @typeInfo(@TypeOf(src)).Pointer.child);
    const loop_count = max_bytes / (@sizeOf(T) * unroll_count);
    for (0..loop_count) |i| {
        inline for (dest[i * unroll_count ..][0..unroll_count], src[i * unroll_count ..][0..unroll_count]) |*d, s| {
            d.* = s;
        }
    }
    const tail_start = (max_bytes / @sizeOf(T)) - (unroll_count - 1);
    inline for (dest[tail_start..][0 .. unroll_count - 1], src[tail_start..][0 .. unroll_count - 1]) |*d, s| {
        d.* = s;
    }
}
/// Copies blocks of length `copy_len` from `src[0..len]` to `dest[0..len]` at
/// the start and end of those respective slices.
inline fn memcpy_range2(
    comptime copy_len: comptime_int,
    noalias dest: [*]u8,
    noalias src: [*]const u8,
    len: usize,
) void {
    comptime std.debug.assert(std.math.isPowerOfTwo(copy_len));
    const size = @sizeOf(CopyType);
    const last = len - copy_len;
    if (copy_len > size) { // comptime-known
        // we do these copies 1 CopyType at a time to prevent LLVM from turning this into a call to memcpy
        const d: [*]align(1) CopyType = @ptrCast(dest);
        const s: [*]align(1) const CopyType = @ptrCast(src);
        const count = @divExact(copy_len, size);
        inline for (d[0..count], s[0..count]) |*r, v| {
            r.* = v;
        }
        const dl: [*]align(1) CopyType = @ptrCast(dest + last);
        const sl: [*]align(1) const CopyType = @ptrCast(src + last);
        inline for (dl[0..count], sl[0..count]) |*r, v| {
            r.* = v;
        }
    } else {
        dest[0..copy_len].* = src[0..copy_len].*;
        dest[last..][0..copy_len].* = src[last..][0..copy_len].*;
    }
}
/// Reads the x86 time-stamp counter via `rdtscp`; the CPU id it writes to ecx is discarded.
inline fn rdtsc() usize {
    var a: u32 = undefined;
    var b: u32 = undefined;
    asm volatile ("rdtscp"
        : [a] "={edx}" (a),
          [b] "={eax}" (b),
        :
        : "ecx"
    );
    return (@as(u64, a) << 32) | b;
}
inline fn clflush(comptime T: type, slice: []const T) void {
    // Evict the buffer from cache, issuing one `clflush` per 64-byte cache line.
    const elems_per_line = 64 / @sizeOf(T);
    const lines = (slice.len + elems_per_line - 1) / elems_per_line;
    for (0..lines) |chunk| {
        const offset = slice.ptr + (chunk * elems_per_line);
        asm volatile ("clflush %[ptr]"
            :
            : [ptr] "m" (offset),
            : "memory"
        );
    }
}
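// A minimal correctness sketch (not benchmarked): checks that the Zig
// implementations above copy the same bytes as @memcpy for every length up to
// 300, which exercises all of their size classes. rt_memcpy is left out since
// it is just the linked memcpy symbol. Run with `zig test` on x86-64.
test "memcpy implementations copy the same bytes as @memcpy" {
    var src: [300]u8 = undefined;
    for (&src, 0..) |*byte, i| byte.* = @truncate(i *% 31 +% 7);
    const s: [*]const u8 = &src;

    inline for (.{ folly_memcpy, protty_memcpy, oak_memcpy }) |func| {
        for (0..src.len + 1) |len| {
            var expected = [_]u8{0} ** 300;
            var actual = [_]u8{0} ** 300;
            const d: [*]u8 = &actual;
            @memcpy(expected[0..len], src[0..len]);
            _ = func(d, s, len);
            // The full arrays are compared so stray writes past `len` are caught too.
            try std.testing.expectEqualSlices(u8, &expected, &actual);
        }
    }
}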