Created
June 19, 2021 18:21
-
-
Save rygorous/e4991ed243a3c7ffa58ab0d74c266baa to your computer and use it in GitHub Desktop.
Bytewise remapping
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <immintrin.h>
#include <stdint.h>
// Short integer aliases used by the kernels in this file.
// The names suggest 8/16/32/64-bit widths; the actual widths of the
// underlying built-in types depend on the platform ABI.
typedef unsigned char U8;       // raw byte
typedef unsigned short U16;
typedef unsigned int U32;
typedef unsigned long long U64;
typedef intptr_t SINTa;         // signed, pointer-sized; used for counts and indices
struct KernelState | |
{ | |
U8 *output; | |
const U8 *input; | |
const U8 *lut; | |
SINTa count; | |
}; | |
// Simple reference version.
//
// Writes output[i] = lut[input[i]] for every i in [0, count), one byte at a
// time. A non-positive count performs no work. Always returns true.
//
// The output pointer is marked __restrict, so the output buffer must not
// overlap the input buffer or the lookup table.
static bool remap_ref(KernelState *s)
{
    U8 * __restrict outp = s->output;
    const U8 *inp = s->input;
    const U8 *lut = s->lut;
    SINTa count = s->count;

    for (SINTa i = 0; i < count; i++)
    {
        outp[i] = lut[inp[i]];
    }

    return true;
}
// AVX2 version of remap_ref: output[i] = lut[input[i]] for i in [0, count),
// processing 32 bytes per iteration. Always returns true.
//
// How it works: the 256-entry LUT is split into a low half (indices 0..127)
// and a high half (indices 128..255), each consisting of eight 16-byte rows.
// Instead of storing the rows directly, remap_tab stores the XOR of each row
// with the previous row of the same half (the first row is stored as-is).
// For an index v within a half, XOR-ing together the shuffled delta rows
// 0..(v >> 4) telescopes to the original row (v >> 4), i.e. to lut[v].
// _mm256_shuffle_epi8 produces zero for any lane whose index byte has its
// top bit set, so a row stops contributing as soon as the running index
// (decremented by 16 per pass with signed saturation, so it can never wrap
// back to a non-negative value) goes negative.
//
// Requirements: s->lut must have 256 readable bytes; the output buffer must
// not overlap the input buffer or the LUT (output is accessed via __restrict).
static bool remap_avx2(KernelState *s)
{
    // Set up remapping table: remap_tab[0..7] are the XOR deltas for the low
    // half of the LUT, remap_tab[8..15] the deltas for the high half.
    __m128i cur0 = _mm_setzero_si128();
    __m128i cur8 = _mm_setzero_si128();
    __m128i remap_tab[16];

    for (int i = 0; i < 8; i++)
    {
        __m128i b0 = _mm_loadu_si128((const __m128i *) (s->lut + 0 + i*16));
        __m128i b8 = _mm_loadu_si128((const __m128i *) (s->lut + 128 + i*16));

        remap_tab[i + 0] = _mm_xor_si128(cur0, b0);
        remap_tab[i + 8] = _mm_xor_si128(cur8, b8);
        cur0 = b0;
        cur8 = b8;
    }

    // Perform the remap
    U8 * __restrict outp = s->output;
    const U8 *inp = s->input;
    const U8 *lut = s->lut;
    SINTa count = s->count;
    SINTa i = 0;

    const __m256i flip_top_bit = _mm256_set1_epi8(-0x80);
    const __m256i row_step = _mm256_set1_epi8(0x10);

    // Vector loop: only runs while a full 32-byte group remains, so it never
    // reads or writes past the end of the buffers.
    while (count - i >= 32)
    {
        // inds0 is "negative" (top bit set) for inputs >= 128, so those lanes
        // only ever produce zero in out0. inds8 flips the top bit, so it is
        // the mirror image: active for inputs >= 128, zero for inputs < 128.
        __m256i inds0 = _mm256_loadu_si256((const __m256i *) (inp + i));
        __m256i inds8 = _mm256_xor_si256(inds0, flip_top_bit);

        // First pass
        __m256i out0 = _mm256_shuffle_epi8(_mm256_broadcastsi128_si256(remap_tab[0]), inds0);
        __m256i out8 = _mm256_shuffle_epi8(_mm256_broadcastsi128_si256(remap_tab[8]), inds8);

        // Each pass moves on to the next 16-entry row: step the indices down
        // by 16 (saturating) and XOR in that row's delta for lanes whose
        // index is still non-negative.
#define PASS(n) \
        do { \
            inds0 = _mm256_subs_epi8(inds0, row_step); \
            inds8 = _mm256_subs_epi8(inds8, row_step); \
            out0 = _mm256_xor_si256(out0, _mm256_shuffle_epi8(_mm256_broadcastsi128_si256(remap_tab[(n) + 0]), inds0)); \
            out8 = _mm256_xor_si256(out8, _mm256_shuffle_epi8(_mm256_broadcastsi128_si256(remap_tab[(n) + 8]), inds8)); \
        } while (0)

        // Remaining passes
        PASS(1);
        PASS(2);
        PASS(3);
        PASS(4);
        PASS(5);
        PASS(6);
        PASS(7);

#undef PASS

        // For every lane exactly one of out0/out8 is non-zero-capable, so OR
        // merges the two halves.
        __m256i result = _mm256_or_si256(out0, out8);
        _mm256_storeu_si256((__m256i *) (outp + i), result);

        i += 32;
    }

    // Scalar tail: the remaining (count % 32) bytes, handled exactly like
    // remap_ref.
    for (; i < count; i++)
    {
        outp[i] = lut[inp[i]];
    }

    return true;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment