Created
August 28, 2024 01:10
-
-
Save Rexicon226/3e88371a56ee498b2888c12935eef2d2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const std = @import("std");

// Timed iterations per buffer size; the reported figure is the mean over these.
const iterations_per_byte = 1000;
// Untimed iterations run first to warm caches and the branch predictor.
const warmup_iterations = 10;
/// Benchmark driver: for every buffer size in 1..max_bytes (exclusive upper
/// bound), times each memcpy implementation and prints one CSV row of
/// `size,cycles_folly,cycles_rt,cycles_protty,cycles_oak,`.
/// Usage: `prog <max_bytes>`. Linux/x86_64 only (sched_setaffinity, rdtscp).
pub fn main() !void {
    // Pin the process to a single core (CPU 0) so rdtscp readings are not
    // perturbed by migration between cores.
    const cpu0001: std.os.linux.cpu_set_t = [1]usize{0b0001} ++ ([_]usize{0} ** (16 - 1));
    try std.os.linux.sched_setaffinity(0, &cpu0001);

    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();
    const allocator = arena.allocator();

    const stdout = std.io.getStdOut().writer();

    const args = try std.process.argsAlloc(allocator);
    defer std.process.argsFree(allocator, args);
    // Robustness: fail with a clear error instead of index-out-of-bounds
    // when the size argument is missing.
    if (args.len < 2) return error.MissingArgument;
    const max_bytes = try std.fmt.parseInt(usize, args[1], 10);

    const T = u8;
    // Note: exclusive range — sizes 1..max_bytes-1 are benchmarked.
    for (1..max_bytes) |index| {
        const buffer_a = try allocator.alloc(T, index);
        @memset(buffer_a, 0);
        const buffer_b = try allocator.alloc(T, index);
        // BUG FIX: initialize the *source* buffer. The original memset wrote
        // buffer_a a second time and left buffer_b (the copy source below)
        // undefined.
        @memset(buffer_b, 0xAA);
        try stdout.print("{},", .{index});
        inline for (.{
            folly_memcpy,
            rt_memcpy,
            protty_memcpy,
            oak_memcpy,
        }) |func| {
            // Evict both buffers so each candidate starts cache-cold.
            clflush(T, buffer_a);
            clflush(T, buffer_b);
            var cycles: u64 = 0;
            var i: u32 = 0;
            while (i < iterations_per_byte + warmup_iterations) : (i += 1) {
                const start = rdtsc();
                std.mem.doNotOptimizeAway(func(buffer_a.ptr, buffer_b.ptr, index));
                const end = rdtsc();
                // BUG FIX: `>=` accumulates exactly iterations_per_byte
                // samples; the original `>` skipped one sample yet still
                // divided by iterations_per_byte, biasing the mean low.
                if (i >= warmup_iterations) cycles += (end - start);
            }
            try stdout.print("{},", .{cycles / iterations_per_byte});
        }
        try stdout.print("\n", .{});
        allocator.free(buffer_a);
        allocator.free(buffer_b);
    }
}
/// C-ABI memcpy in the style of folly's memcpy: dispatches on `len` to pairs
/// of possibly-overlapping block copies (small sizes) or vectorized loops
/// (large sizes). Returns `dest`, like libc memcpy. Null pointers are only
/// tolerated when `len == 0`.
export fn folly_memcpy(maybe_dest: ?[*]u8, maybe_src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 {
    if (len == 0) {
        @branchHint(.unlikely);
        return maybe_dest;
    }
    const dest = maybe_dest.?;
    const src = maybe_src.?;
    if (len < 8) {
        @branchHint(.unlikely);
        if (len == 1) {
            @branchHint(.unlikely);
            dest[0] = src[0];
        } else if (len >= 4) {
            @branchHint(.unlikely);
            // 4..7 bytes: two 4-byte blocks anchored at both ends.
            blockCopy(dest, src, 4, len);
        } else {
            // 2..3 bytes: two 2-byte blocks anchored at both ends.
            blockCopy(dest, src, 2, len);
        }
        return dest;
    }
    if (len > 32) {
        @branchHint(.unlikely);
        if (len > 256) {
            @branchHint(.unlikely);
            // Very large copies go through the direction-aware mover.
            copyMove(dest, src, len);
            return dest;
        }
        // 33..256 bytes: staged 32-byte vector blocks.
        copyLong(dest, src, len);
        return dest;
    }
    if (len > 16) {
        @branchHint(.unlikely);
        // 17..32 bytes: two 16-byte blocks anchored at both ends.
        blockCopy(dest, src, 16, len);
        return dest;
    }
    // 8..16 bytes: two 8-byte blocks anchored at both ends.
    blockCopy(dest, src, 8, len);
    return dest;
}
/// Copy `len` bytes (where block_size <= len <= 2 * block_size) as two
/// fixed-size blocks: one anchored at the start of the range and one at the
/// end. The blocks overlap whenever len < 2 * block_size; that is safe
/// because both loads complete before either store.
inline fn blockCopy(dest: [*]u8, src: [*]const u8, block_size: comptime_int, len: usize) void {
    const Block = @Vector(block_size, u8);
    const head: Block = src[0..block_size].*;
    const tail: Block = src[len - block_size ..][0..block_size].*;
    dest[0..block_size].* = head;
    dest[len - block_size ..][0..block_size].* = tail;
}
/// Copy 33..256 bytes using 32-byte vectors. For each size class N in
/// {64, 128, 192, 256} it stages a block from the front half and its mirror
/// from the back; once `len <= N`, all staged blocks are stored. Every load
/// happens before the first store, so overlapping head/tail blocks read
/// consistent data.
inline fn copyLong(dest: [*]u8, src: [*]const u8, len: usize) void {
    // Staging area: even slots hold front blocks, odd slots back blocks.
    var array: [8]@Vector(32, u8) = undefined;
    inline for (.{ 64, 128, 192, 256 }, 0..) |N, i| {
        array[i * 2] = src[(N / 2) - 32 ..][0..32].*;
        array[(i * 2) + 1] = src[len - N / 2 ..][0..32].*;
        if (len <= N) {
            @branchHint(.unlikely);
            // Store i+1 front blocks from the start and i+1 back blocks
            // mirrored from the end of the destination.
            for (0..i + 1) |j| {
                dest[j * 32 ..][0..32].* = array[j * 2];
                dest[len - ((j * 32) + 32) ..][0..32].* = array[(j * 2) + 1];
            }
            return;
        }
    }
}
/// Dispatch a large copy to the correct direction. A backward copy is only
/// required when the source starts below the destination AND the two ranges
/// overlap; every other arrangement is safe to copy front-to-back.
inline fn copyMove(dest: [*]u8, src: [*]const u8, len: usize) void {
    const dest_addr = @intFromPtr(dest);
    const src_addr = @intFromPtr(src);
    const needs_backward = src_addr < dest_addr and src_addr + len > dest_addr;
    if (needs_backward) {
        overlapBwd(dest, src, len);
    } else {
        copyForward(dest, src, len);
    }
}
/// Forward copy for len > 256. Copies 128 bytes per iteration as four
/// 32-byte vectors, then finishes the remainder with either the pre-loaded
/// `tail` vector or a `copyLong` call for the mid-size leftover.
inline fn copyForward(dest: [*]u8, src: [*]const u8, len: usize) void {
    // Load the final 32 bytes up front, before any store could clobber them
    // in a forward-overlapping src/dest arrangement.
    const tail: @Vector(32, u8) = src[len - 32 ..][0..32].*;
    // N = len rounded down to a multiple of 128 (the unrolled stride).
    const N: usize = len & ~@as(usize, 127);
    var i: usize = 0;
    while (i < N) : (i += 128) {
        dest[i..][0..32].* = src[i..][0..32].*;
        dest[i + 32 ..][0..32].* = src[i + 32 ..][0..32].*;
        dest[i + 64 ..][0..32].* = src[i + 64 ..][0..32].*;
        dest[i + 96 ..][0..32].* = src[i + 96 ..][0..32].*;
    }
    if (len - i <= 32) {
        @branchHint(.unlikely);
        // Remainder fits in one vector: store the staged tail.
        dest[len - 32 ..][0..32].* = tail;
    } else {
        // 33..127 bytes left: reuse the mid-size staged-block path.
        copyLong(dest[i..], src[i..], len - i);
    }
}
/// Backward copy for overlapping ranges where src < dest < src + len.
/// Stages the last 32 bytes and the first 128 bytes of the source in
/// registers, copies the middle backwards in 32-byte-aligned 128-byte chunks,
/// then writes the staged head and tail blocks last.
inline fn overlapBwd(dest: [*]u8, src: [*]const u8, len: usize) void {
    // array[0] = final 32 source bytes; array[1..5] = first 128 source bytes.
    var array: [5]@Vector(32, u8) = undefined;
    array[0] = src[len - 32 ..][0..32].*;
    inline for (1..5) |i| array[i] = src[(i - 1) << 5 ..][0..32].*;
    // Misalignment (mod 32) of the last destination block; trimming it makes
    // the backward destination stores below 32-byte aligned.
    const end: usize = (@intFromPtr(dest) + len - 32) & 31;
    const range = len - end;
    var s = src + range;
    var d = dest + range;
    // Walk backwards 128 bytes at a time, stopping before the region already
    // staged in `array[1..5]`.
    while (@intFromPtr(s) > @intFromPtr(src + 128)) {
        // zig fmt: off
        const first  = @as(*align(1) const @Vector(32, u8), @ptrCast(s - 32)).*;
        const second = @as(*align(1) const @Vector(32, u8), @ptrCast(s - 64)).*;
        const third  = @as(*align(1) const @Vector(32, u8), @ptrCast(s - 96)).*;
        const fourth = @as(*align(1) const @Vector(32, u8), @ptrCast(s - 128)).*;
        @as(*align(32) @Vector(32, u8), @alignCast(@ptrCast(d - 32))).*  = first;
        @as(*align(32) @Vector(32, u8), @alignCast(@ptrCast(d - 64))).*  = second;
        @as(*align(32) @Vector(32, u8), @alignCast(@ptrCast(d - 96))).*  = third;
        @as(*align(32) @Vector(32, u8), @alignCast(@ptrCast(d - 128))).* = fourth;
        // zig fmt: on
        s -= 128;
        d -= 128;
    }
    // Store the staged head blocks, then the tail block last (it may overlap
    // bytes written above).
    inline for (array[1..], 0..) |vec, i| dest[i * 32 ..][0..32].* = vec;
    dest[len - 32 ..][0..32].* = array[0];
}
/// The C runtime's own memcpy, resolved at link time; included as a baseline
/// against which the hand-written implementations are measured.
const rt_memcpy = @extern(
    *const fn (noalias ?[*]u8, noalias ?[*]const u8, len: usize) callconv(.C) ?[*]u8,
    .{ .name = "memcpy", .linkage = .strong },
);
/// Branch-light memcpy: every size class is covered by a fixed set of
/// possibly-overlapping block copies (see `copy_blocks`), with no per-byte
/// loop. Returns `dst`, like libc memcpy.
export fn protty_memcpy(noalias dst: ?[*]u8, noalias src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 {
    @setRuntimeSafety(false);
    if (len <= 16) {
        if (len < 4) {
            // handles sizes under a u32.
            if (len > 0) {
                // 1..3 bytes: first, middle and last byte together cover the range.
                copy_blocks(dst, src, 1, [_]usize{ 0, len / 2, len - 1 });
            }
        } else {
            // handles sizes from u32 to Vector(16).
            // Offsets are chosen so four overlapping 4-byte blocks cover any
            // length in 4..16.
            const mid = (len / 8) * 4;
            copy_blocks(dst, src, 4, [_]usize{ 0, mid, len - 4, len - 4 - mid });
        }
    } else if (len <= 64) {
        // handles sizes from Vector(16) to cache_line.
        // Same scheme, scaled up: four overlapping 16-byte blocks.
        const mid = (len / 32) * 16;
        copy_blocks(dst, src, 16, [_]usize{ 0, mid, len - 16, len - 16 - mid });
    } else {
        // handles sizes over a cache_line.
        for (0..(len - 1) / 128) |i| {
            std.mem.doNotOptimizeAway(i); // prevent LLVM from bloating loop via auto-vectorization.
            copy_blocks(dst, src, 128, [_]usize{i * 128});
        }
        // Tail: two 64-byte blocks covering the final (up to) 128 bytes.
        // `-|` saturates at 0 for len <= 128.
        copy_blocks(dst, src, 64, [_]usize{ len -| 128, len - 64 });
    }
    return dst;
}
/// Copy one `size`-byte block from `src` to `dst` at each offset in
/// `offsets`. All source blocks are staged in locals before the first store,
/// so blocks within a single call may overlap one another safely.
inline fn copy_blocks(dst: ?[*]u8, src: ?[*]const u8, comptime size: usize, offsets: anytype) void {
    // Vector element type (rather than [size]u8) for better codegen.
    var staged: [offsets.len]@Vector(size, u8) = undefined;
    for (&staged, offsets) |*slot, offset| slot.* = src.?[offset..][0..size].*;
    for (staged, offsets) |slot, offset| dst.?[offset..][0..size].* = slot;
}
/// Widest efficient copy unit for the target: one SIMD byte vector when the
/// target has usable vector registers, otherwise a plain `usize`.
/// `@Vector(n, u8)` replaces the original `@Type(.{ .Vector = ... })`
/// reification — it yields the identical type, reads directly, and does not
/// depend on the exact spelling of `std.builtin.Type` field names (which has
/// changed across Zig releases).
const CopyType = if (std.simd.suggestVectorLength(u8)) |vec_size|
    @Vector(vec_size, u8)
else
    usize;
/// memcpy that handles every size up to `2 * @sizeOf(CopyType)` with a pair
/// of overlapping power-of-two blocks (`memcpy_range2`), and larger sizes
/// with a source-aligned unrolled vector loop. Returns `dest`, like libc
/// memcpy.
export fn oak_memcpy(noalias dest: ?[*]u8, noalias src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 {
    if (len <= 16) {
        if (len <= 4) {
            if (len <= 2) {
                if (len == 0) return dest;
                memcpy_range2(1, dest.?, src.?, len); // 1..2 bytes
            } else {
                memcpy_range2(2, dest.?, src.?, len); // 3..4 bytes
            }
        } else if (len <= 8) {
            memcpy_range2(4, dest.?, src.?, len); // 5..8 bytes
        } else {
            memcpy_range2(8, dest.?, src.?, len); // 9..16 bytes
        }
        return dest;
    }
    const unroll_count = 2;
    const alignment = @alignOf(CopyType);
    const size = @sizeOf(CopyType);
    // 17 .. unroll_count * size bytes: pick the smallest power-of-two limit
    // that fits and copy two overlapping half-limit blocks.
    if (comptime 5 <= std.math.log2(unroll_count * size)) {
        inline for (5..std.math.log2(unroll_count * size) + 1) |p| {
            const limit = 1 << p;
            if (len <= limit) {
                memcpy_range2(limit / 2, dest.?, src.?, len);
                return dest;
            }
        }
    }
    std.debug.assert(unroll_count * size < len);
    // we know that `len > 2 * size` and `size >= alignment`
    // so we can safely align `s` to `alignment`
    dest.?[0..size].* = src.?[0..size].*;
    const alignment_offset = alignment - @intFromPtr(src.?) % alignment;
    const n = len - alignment_offset;
    const d = dest.? + alignment_offset;
    const s = src.? + alignment_offset;
    // Only the source is aligned by construction; pick the loop variant that
    // matches the destination's runtime alignment.
    if (@intFromPtr(d) % alignment == 0) {
        memcpy_aligned(@alignCast(@ptrCast(d)), @alignCast(@ptrCast(s)), n, unroll_count);
    } else {
        memcpy_unaligned(@ptrCast(d), @alignCast(@ptrCast(s)), n, unroll_count);
    }
    // The block loop copies whole CopyTypes only; re-copy the final `size`
    // bytes to cover any sub-CopyType remainder.
    dest.?[len - size ..][0..size].* = src.?[len - size ..][0..size].*;
    return dest;
}
// inline is needed to prevent llvm making an infinitely recursive call to memcpy
/// Copy between CopyType-aligned pointers; thin wrapper whose signature fixes
/// the pointer alignment so `memcpy_blocks` can emit aligned stores.
inline fn memcpy_aligned(
    noalias dest: [*]CopyType,
    noalias src: [*]const CopyType,
    max_bytes: usize,
    comptime unroll_count: comptime_int,
) void {
    memcpy_blocks(dest, src, max_bytes, unroll_count);
}
/// Copy where only the source is CopyType-aligned; the align(1) destination
/// pointer in the signature makes `memcpy_blocks` emit unaligned stores.
inline fn memcpy_unaligned(
    noalias dest: [*]align(1) CopyType,
    noalias src: [*]const CopyType,
    max_bytes: usize,
    comptime unroll_count: comptime_int,
) void {
    memcpy_blocks(dest, src, max_bytes, unroll_count);
}
/// Copies a multiple of `@sizeOf(T)` bytes from `src` to `dest`, where `T` is
/// the child type of `src` and `dest`. No more than `max_bytes` will be copied
/// (`max_bytes` need not be a multiple of `@sizeOf(T)`) but `max_bytes` must
/// be at least `@sizeOf(T)`.
inline fn memcpy_blocks(
    noalias dest: anytype,
    noalias src: anytype,
    max_bytes: usize,
    comptime unroll_count: comptime_int,
) void {
    comptime std.debug.assert(unroll_count > 0);
    const T = @typeInfo(@TypeOf(dest)).Pointer.child;
    comptime std.debug.assert(T == @typeInfo(@TypeOf(src)).Pointer.child);
    // Main loop: `unroll_count` element copies per iteration, manually
    // unrolled via `inline for`.
    const loop_count = max_bytes / (@sizeOf(T) * unroll_count);
    for (0..loop_count) |i| {
        inline for (dest[i * unroll_count ..][0..unroll_count], src[i * unroll_count ..][0..unroll_count]) |*d, s| {
            d.* = s;
        }
    }
    // Tail: copy the last `unroll_count - 1` whole elements the unrolled loop
    // may have missed (these may re-copy elements already written, which is
    // harmless for noalias src/dest).
    const tail_start = (max_bytes / @sizeOf(T)) - (unroll_count - 1);
    inline for (dest[tail_start..][0 .. unroll_count - 1], src[tail_start..][0 .. unroll_count - 1]) |*d, s| {
        d.* = s;
    }
}
/// copy blocks of length `copy_len` from `src[0..len]` to `dest[0..len]` at
/// the start and end of those respective slices. Requires
/// copy_len <= len <= 2 * copy_len; the two blocks overlap when
/// len < 2 * copy_len, which is intentional.
inline fn memcpy_range2(
    comptime copy_len: comptime_int,
    noalias dest: [*]u8,
    noalias src: [*]const u8,
    len: usize,
) void {
    comptime std.debug.assert(std.math.isPowerOfTwo(copy_len));
    const size = @sizeOf(CopyType);
    const last = len - copy_len;
    if (copy_len > size) { // comptime-known
        // we do these copies 1 CopyType at a time to prevent llvm turning this into a call to memcpy
        const d: [*]align(1) CopyType = @ptrCast(dest);
        const s: [*]align(1) const CopyType = @ptrCast(src);
        const count = @divExact(copy_len, size);
        // Leading block: `copy_len` bytes from the front.
        inline for (d[0..count], s[0..count]) |*r, v| {
            r.* = v;
        }
        // Trailing block: `copy_len` bytes ending exactly at `len`.
        const dl: [*]align(1) CopyType = @ptrCast(dest + last);
        const sl: [*]align(1) const CopyType = @ptrCast(src + last);
        inline for (dl[0..count], sl[0..count]) |*r, v| {
            r.* = v;
        }
    } else {
        // Block fits within a single CopyType; plain array copies suffice.
        dest[0..copy_len].* = src[0..copy_len].*;
        dest[last..][0..copy_len].* = src[last..][0..copy_len].*;
    }
}
/// X86 cycle counter.
/// Reads the time-stamp counter via `rdtscp`, which (unlike plain `rdtsc`)
/// waits for all prior instructions to execute before sampling. edx:eax hold
/// the high:low 32 bits; ecx receives IA32_TSC_AUX and is declared clobbered.
/// FIX: return `u64` — the original declared `usize` while always building a
/// 64-bit value, which only compiles on 64-bit targets (identical on x86_64).
inline fn rdtsc() u64 {
    var hi: u32 = undefined;
    var lo: u32 = undefined;
    asm volatile ("rdtscp"
        : [hi] "={edx}" (hi),
          [lo] "={eax}" (lo),
        :
        : "ecx"
    );
    return (@as(u64, hi) << 32) | lo;
}
/// Flush every cache line backing `slice` so the next benchmark iteration
/// starts cache-cold.
///
/// FIXES two bugs in the original:
///  1. The loop ran `(slice.len / @bitSizeOf(T)) * 8` times with a stride of
///     `@bitSizeOf(T) * 8` *elements* — for u8 that flushed roughly
///     `slice.len` lines at 64-byte spacing, addressing far past the buffer.
///  2. The `"m" (offset)` operand made the memory operand the stack slot
///     holding the local pointer, so the buffer itself was never flushed.
inline fn clflush(comptime T: type, slice: []const T) void {
    const cache_line_size = 64; // bytes; line size on current x86 CPUs
    const bytes = std.mem.sliceAsBytes(slice);
    var offset: usize = 0;
    while (offset < bytes.len) : (offset += cache_line_size) {
        // Pass the address in a register and dereference it in the template;
        // clflush then targets the buffer line, not the pointer's home slot.
        asm volatile ("clflush (%[ptr])"
            :
            : [ptr] "r" (bytes.ptr + offset),
            : "memory"
        );
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment