@Rexicon226
Created August 28, 2024 01:10
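//! Micro-benchmark comparing four memcpy implementations (a folly-style copy,
//! the `memcpy` symbol the binary links against, protty's, and oak's) across
//! buffer sizes 1..max_bytes. Each call is timed with `rdtscp` on a single
//! pinned core and the per-size averages are written to stdout as CSV.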
const std = @import("std");
const iterations_per_byte = 1000;
const warmup_iterations = 10;
pub fn main() !void {
    // Pin the process to core 0 to reduce scheduling noise.
    const cpu0001: std.os.linux.cpu_set_t = [1]usize{0b0001} ++ ([_]usize{0} ** (16 - 1));
    try std.os.linux.sched_setaffinity(0, &cpu0001);

    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();
    const allocator = arena.allocator();

    const stdout = std.io.getStdOut().writer();

    const args = try std.process.argsAlloc(allocator);
    defer std.process.argsFree(allocator, args);
    const max_bytes = try std.fmt.parseInt(usize, args[1], 10);

    const T = u8;
    for (1..max_bytes) |index| {
        const buffer_a = try allocator.alloc(T, index);
        @memset(buffer_a, 0);
        const buffer_b = try allocator.alloc(T, index);
        @memset(buffer_b, 0xAA);

        try stdout.print("{},", .{index});

        inline for (.{
            folly_memcpy,
            rt_memcpy,
            protty_memcpy,
            oak_memcpy,
        }) |func| {
            clflush(T, buffer_a);
            clflush(T, buffer_b);

            var cycles: u64 = 0;
            var i: u32 = 0;
            while (i < iterations_per_byte + warmup_iterations) : (i += 1) {
                const start = rdtsc();
                std.mem.doNotOptimizeAway(func(buffer_a.ptr, buffer_b.ptr, index));
                const end = rdtsc();
                // Only accumulate cycles once the warmup iterations are done.
                if (i >= warmup_iterations) cycles += (end - start);
            }
            try stdout.print("{},", .{cycles / iterations_per_byte});
        }
        try stdout.print("\n", .{});

        allocator.free(buffer_a);
        allocator.free(buffer_b);
    }
}
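// Example invocation (a sketch; the file name and flags below are assumptions,
// and an x86-64 CPU with AVX2 helps the 32-byte vector paths):
//
//   zig build-exe memcpy_bench.zig -O ReleaseFast -mcpu=x86_64_v3
//   ./memcpy_bench 512 > results.csv
//
// Each output row reads: size, folly_memcpy, rt_memcpy, protty_memcpy, oak_memcpy,
// where the implementation columns are average cycles per call at that size.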
/// folly-style memcpy: dispatches on size class and copies overlapping
/// head/tail blocks so every length within a class takes the same path.
export fn folly_memcpy(maybe_dest: ?[*]u8, maybe_src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 {
    if (len == 0) {
        @branchHint(.unlikely);
        return maybe_dest;
    }
    const dest = maybe_dest.?;
    const src = maybe_src.?;

    if (len < 8) {
        @branchHint(.unlikely);
        if (len == 1) {
            @branchHint(.unlikely);
            dest[0] = src[0];
        } else if (len >= 4) {
            @branchHint(.unlikely);
            blockCopy(dest, src, 4, len);
        } else {
            blockCopy(dest, src, 2, len);
        }
        return dest;
    }

    if (len > 32) {
        @branchHint(.unlikely);
        if (len > 256) {
            @branchHint(.unlikely);
            copyMove(dest, src, len);
            return dest;
        }
        copyLong(dest, src, len);
        return dest;
    }

    if (len > 16) {
        @branchHint(.unlikely);
        blockCopy(dest, src, 16, len);
        return dest;
    }
    blockCopy(dest, src, 8, len);
    return dest;
}
inline fn blockCopy(dest: [*]u8, src: [*]const u8, block_size: comptime_int, len: usize) void {
    const first = @as(*align(1) const @Vector(block_size, u8), src[0..block_size]).*;
    const second = @as(*align(1) const @Vector(block_size, u8), src[len - block_size ..][0..block_size]).*;
    dest[0..block_size].* = first;
    dest[len - block_size ..][0..block_size].* = second;
}
inline fn copyLong(dest: [*]u8, src: [*]const u8, len: usize) void {
    var array: [8]@Vector(32, u8) = undefined;
    inline for (.{ 64, 128, 192, 256 }, 0..) |N, i| {
        array[i * 2] = src[(N / 2) - 32 ..][0..32].*;
        array[(i * 2) + 1] = src[len - N / 2 ..][0..32].*;
        if (len <= N) {
            @branchHint(.unlikely);
            for (0..i + 1) |j| {
                dest[j * 32 ..][0..32].* = array[j * 2];
                dest[len - ((j * 32) + 32) ..][0..32].* = array[(j * 2) + 1];
            }
            return;
        }
    }
}
inline fn copyMove(dest: [*]u8, src: [*]const u8, len: usize) void {
    if (@intFromPtr(src) >= @intFromPtr(dest)) {
        @branchHint(.unlikely);
        copyForward(dest, src, len);
    } else if (@intFromPtr(src) + len > @intFromPtr(dest)) {
        @branchHint(.unlikely);
        overlapBwd(dest, src, len);
    } else {
        copyForward(dest, src, len);
    }
}
inline fn copyForward(dest: [*]u8, src: [*]const u8, len: usize) void {
    const tail: @Vector(32, u8) = src[len - 32 ..][0..32].*;
    const N: usize = len & ~@as(usize, 127);
    var i: usize = 0;
    while (i < N) : (i += 128) {
        dest[i..][0..32].* = src[i..][0..32].*;
        dest[i + 32 ..][0..32].* = src[i + 32 ..][0..32].*;
        dest[i + 64 ..][0..32].* = src[i + 64 ..][0..32].*;
        dest[i + 96 ..][0..32].* = src[i + 96 ..][0..32].*;
    }
    if (len - i <= 32) {
        @branchHint(.unlikely);
        dest[len - 32 ..][0..32].* = tail;
    } else {
        copyLong(dest[i..], src[i..], len - i);
    }
}
inline fn overlapBwd(dest: [*]u8, src: [*]const u8, len: usize) void {
    var array: [5]@Vector(32, u8) = undefined;
    array[0] = src[len - 32 ..][0..32].*;
    inline for (1..5) |i| array[i] = src[(i - 1) << 5 ..][0..32].*;

    const end: usize = (@intFromPtr(dest) + len - 32) & 31;
    const range = len - end;
    var s = src + range;
    var d = dest + range;
    while (@intFromPtr(s) > @intFromPtr(src + 128)) {
        // zig fmt: off
        const first  = @as(*align(1) const @Vector(32, u8), @ptrCast(s - 32)).*;
        const second = @as(*align(1) const @Vector(32, u8), @ptrCast(s - 64)).*;
        const third  = @as(*align(1) const @Vector(32, u8), @ptrCast(s - 96)).*;
        const fourth = @as(*align(1) const @Vector(32, u8), @ptrCast(s - 128)).*;

        @as(*align(32) @Vector(32, u8), @alignCast(@ptrCast(d - 32))).*  = first;
        @as(*align(32) @Vector(32, u8), @alignCast(@ptrCast(d - 64))).*  = second;
        @as(*align(32) @Vector(32, u8), @alignCast(@ptrCast(d - 96))).*  = third;
        @as(*align(32) @Vector(32, u8), @alignCast(@ptrCast(d - 128))).* = fourth;
        // zig fmt: on
        s -= 128;
        d -= 128;
    }

    inline for (array[1..], 0..) |vec, i| dest[i * 32 ..][0..32].* = vec;
    dest[len - 32 ..][0..32].* = array[0];
}
/// Whatever `memcpy` symbol the binary links against (libc or compiler-rt).
const rt_memcpy = @extern(
    *const fn (noalias ?[*]u8, noalias ?[*]const u8, len: usize) callconv(.C) ?[*]u8,
    .{ .name = "memcpy", .linkage = .strong },
);
export fn protty_memcpy(noalias dst: ?[*]u8, noalias src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 {
    @setRuntimeSafety(false);
    if (len <= 16) {
        if (len < 4) {
            // handles sizes under a u32.
            if (len > 0) {
                copy_blocks(dst, src, 1, [_]usize{ 0, len / 2, len - 1 });
            }
        } else {
            // handles sizes from u32 to Vector(16).
            const mid = (len / 8) * 4;
            copy_blocks(dst, src, 4, [_]usize{ 0, mid, len - 4, len - 4 - mid });
        }
    } else if (len <= 64) {
        // handles sizes from Vector(16) to cache_line.
        const mid = (len / 32) * 16;
        copy_blocks(dst, src, 16, [_]usize{ 0, mid, len - 16, len - 16 - mid });
    } else {
        // handles sizes over a cache_line.
        for (0..(len - 1) / 128) |i| {
            std.mem.doNotOptimizeAway(i); // prevent LLVM from bloating loop via auto-vectorization.
            copy_blocks(dst, src, 128, [_]usize{i * 128});
        }
        copy_blocks(dst, src, 64, [_]usize{ len -| 128, len - 64 });
    }
    return dst;
}
inline fn copy_blocks(dst: ?[*]u8, src: ?[*]const u8, comptime size: usize, offsets: anytype) void {
    var blocks: [offsets.len]@Vector(size, u8) = undefined; // Vector over [N]u8 for good codegen.
    for (offsets, &blocks) |offset, *block| block.* = src.?[offset..][0..size].*;
    for (offsets, blocks) |offset, block| dst.?[offset..][0..size].* = block;
}
/// The copy unit for oak_memcpy: the target's suggested byte vector,
/// or `usize` when the target has no usable SIMD registers.
const CopyType = if (std.simd.suggestVectorLength(u8)) |vec_size|
    @Type(.{ .Vector = .{
        .child = u8,
        .len = vec_size,
    } })
else
    usize;
export fn oak_memcpy(noalias dest: ?[*]u8, noalias src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 {
    if (len <= 16) {
        if (len <= 4) {
            if (len <= 2) {
                if (len == 0) return dest;
                memcpy_range2(1, dest.?, src.?, len);
            } else {
                memcpy_range2(2, dest.?, src.?, len);
            }
        } else if (len <= 8) {
            memcpy_range2(4, dest.?, src.?, len);
        } else {
            memcpy_range2(8, dest.?, src.?, len);
        }
        return dest;
    }

    const unroll_count = 2;
    const alignment = @alignOf(CopyType);
    const size = @sizeOf(CopyType);

    if (comptime 5 <= std.math.log2(unroll_count * size)) {
        inline for (5..std.math.log2(unroll_count * size) + 1) |p| {
            const limit = 1 << p;
            if (len <= limit) {
                memcpy_range2(limit / 2, dest.?, src.?, len);
                return dest;
            }
        }
    }
    std.debug.assert(unroll_count * size < len);

    // we know that `len > 2 * size` and `size >= alignment`
    // so we can safely align `s` to `alignment`
    dest.?[0..size].* = src.?[0..size].*;
    const alignment_offset = alignment - @intFromPtr(src.?) % alignment;
    const n = len - alignment_offset;
    const d = dest.? + alignment_offset;
    const s = src.? + alignment_offset;

    if (@intFromPtr(d) % alignment == 0) {
        memcpy_aligned(@alignCast(@ptrCast(d)), @alignCast(@ptrCast(s)), n, unroll_count);
    } else {
        memcpy_unaligned(@ptrCast(d), @alignCast(@ptrCast(s)), n, unroll_count);
    }
    dest.?[len - size ..][0..size].* = src.?[len - size ..][0..size].*;
    return dest;
}
// inline is needed to prevent LLVM from turning this into an infinitely recursive call to memcpy
inline fn memcpy_aligned(
    noalias dest: [*]CopyType,
    noalias src: [*]const CopyType,
    max_bytes: usize,
    comptime unroll_count: comptime_int,
) void {
    memcpy_blocks(dest, src, max_bytes, unroll_count);
}

inline fn memcpy_unaligned(
    noalias dest: [*]align(1) CopyType,
    noalias src: [*]const CopyType,
    max_bytes: usize,
    comptime unroll_count: comptime_int,
) void {
    memcpy_blocks(dest, src, max_bytes, unroll_count);
}
/// Copies a multiple of `@sizeOf(T)` bytes from `src` to `dest`, where `T` is
/// the child type of `src` and `dest`. No more than `max_bytes` will be copied
/// (`max_bytes` need not be a multiple of `@sizeOf(T)`) but `max_bytes` must
/// be at least `@sizeOf(T)`.
inline fn memcpy_blocks(
    noalias dest: anytype,
    noalias src: anytype,
    max_bytes: usize,
    comptime unroll_count: comptime_int,
) void {
    comptime std.debug.assert(unroll_count > 0);
    const T = @typeInfo(@TypeOf(dest)).Pointer.child;
    comptime std.debug.assert(T == @typeInfo(@TypeOf(src)).Pointer.child);
    const loop_count = max_bytes / (@sizeOf(T) * unroll_count);
    for (0..loop_count) |i| {
        inline for (dest[i * unroll_count ..][0..unroll_count], src[i * unroll_count ..][0..unroll_count]) |*d, s| {
            d.* = s;
        }
    }
    const tail_start = (max_bytes / @sizeOf(T)) - (unroll_count - 1);
    inline for (dest[tail_start..][0 .. unroll_count - 1], src[tail_start..][0 .. unroll_count - 1]) |*d, s| {
        d.* = s;
    }
}
/// Copies blocks of length `copy_len` from `src[0..len]` to `dest[0..len]` at
/// the start and end of those respective slices.
inline fn memcpy_range2(
    comptime copy_len: comptime_int,
    noalias dest: [*]u8,
    noalias src: [*]const u8,
    len: usize,
) void {
    comptime std.debug.assert(std.math.isPowerOfTwo(copy_len));
    const size = @sizeOf(CopyType);
    const last = len - copy_len;
    if (copy_len > size) { // comptime-known
        // we do these copies 1 CopyType at a time to prevent LLVM from turning this into a call to memcpy
        const d: [*]align(1) CopyType = @ptrCast(dest);
        const s: [*]align(1) const CopyType = @ptrCast(src);
        const count = @divExact(copy_len, size);
        inline for (d[0..count], s[0..count]) |*r, v| {
            r.* = v;
        }
        const dl: [*]align(1) CopyType = @ptrCast(dest + last);
        const sl: [*]align(1) const CopyType = @ptrCast(src + last);
        inline for (dl[0..count], sl[0..count]) |*r, v| {
            r.* = v;
        }
    } else {
        dest[0..copy_len].* = src[0..copy_len].*;
        dest[last..][0..copy_len].* = src[last..][0..copy_len].*;
    }
}
/// Reads the x86 time-stamp counter via `rdtscp`; the CPU id it writes to ecx is discarded.
inline fn rdtsc() usize {
    var a: u32 = undefined;
    var b: u32 = undefined;
    asm volatile ("rdtscp"
        : [a] "={edx}" (a),
          [b] "={eax}" (b),
        :
        : "ecx"
    );
    return (@as(u64, a) << 32) | b;
}
inline fn clflush(comptime T: type, slice: []const T) void {
    // Evict the buffer from cache, issuing one `clflush` per 64-byte cache line.
    const elems_per_line = 64 / @sizeOf(T);
    const lines = (slice.len + elems_per_line - 1) / elems_per_line;
    for (0..lines) |chunk| {
        const offset = slice.ptr + (chunk * elems_per_line);
        asm volatile ("clflush %[ptr]"
            :
            : [ptr] "m" (offset),
            : "memory"
        );
    }
}
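// A minimal correctness sketch (not benchmarked): checks that the Zig
// implementations above copy the same bytes as @memcpy for every length up to
// 300, which exercises all of their size classes. rt_memcpy is left out since
// it is just the linked memcpy symbol. Run with `zig test` on x86-64.
test "memcpy implementations copy the same bytes as @memcpy" {
    var src: [300]u8 = undefined;
    for (&src, 0..) |*byte, i| byte.* = @truncate(i *% 31 +% 7);
    const s: [*]const u8 = &src;

    inline for (.{ folly_memcpy, protty_memcpy, oak_memcpy }) |func| {
        for (0..src.len + 1) |len| {
            var expected = [_]u8{0} ** 300;
            var actual = [_]u8{0} ** 300;
            const d: [*]u8 = &actual;
            @memcpy(expected[0..len], src[0..len]);
            _ = func(d, s, len);
            // The full arrays are compared so stray writes past `len` are caught too.
            try std.testing.expectEqualSlices(u8, &expected, &actual);
        }
    }
}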