Skip to content

Instantly share code, notes, and snippets.

@tpruvot
Created October 29, 2016 19:27
Show Gist options
  • Save tpruvot/01014e7854d7e7841f8c90637dc38f1f to your computer and use it in GitHub Desktop.
Save tpruvot/01014e7854d7e7841f8c90637dc38f1f to your computer and use it in GitHub Desktop.
#include "common.h"
#include "blake2bcl.h"
#define tree0_ptr(heap, r) ((__global bucket0 *)(heap + r))
#define tree1_ptr(heap, r) ((__global bucket1 *)(heap + r))
uint32_t tree_bucket(tree t)
{
const uint32_t bucketMask = ((1u<<BUCKBITS)-1);
return t & bucketMask;
}
uint32_t tree_slotid0(tree t)
{
const uint32_t slotMask = ((1u<<SLOTBITS)-1);
return (t >> BUCKBITS) & SLOTMASK;
}
uint32_t tree_slotid1(tree t)
{
const uint32_t slotMask = ((1u<<SLOTBITS)-1);
return (t >> (BUCKBITS+SLOTBITS)) & SLOTMASK;
}
uint32_t tree_xhash(tree t)
{
return t >> (2*SLOTBITS + BUCKBITS);
}
uint32_t tree_getindex(const tree t)
{
const uint32_t bucketMask = ((1u<<BUCKBITS)-1);
const uint32_t slotMask = ((1u<<SLOTBITS)-1);
return ((t & bucketMask) << SLOTBITS) | ((t & (slotMask << BUCKBITS)) >> BUCKBITS);
}
void tree_setindex(tree *t, uint32_t idx)
{
const uint32_t bucketMask = ((1u<<BUCKBITS)-1);
const uint32_t slotMask = ((1u<<SLOTBITS)-1);
(*t) &= ~(bucketMask | (slotMask << BUCKBITS));
(*t) |= (idx >> SLOTBITS);
(*t) |= ((idx & slotMask) << BUCKBITS);
}
void tree_setxhash(tree *t, uint32_t xhash)
{
const uint32_t xhashMask = ((1u << RESTBITS)-1);
(*t) &= ~(xhashMask << (2*SLOTBITS + BUCKBITS));
(*t) |= (xhash << (2*SLOTBITS + BUCKBITS));
}
tree tree_create3(uint32_t bucketId, uint32_t s0, uint32_t s1)
{
return bucketId | (s0 << BUCKBITS) | (s1 << (BUCKBITS+SLOTBITS));
}
tree tree_create4(uint32_t bucketId, uint32_t s0, uint32_t s1, uint32_t xhash)
{
return bucketId | (s0 << BUCKBITS) | (s1 << (BUCKBITS+SLOTBITS)) | (xhash << (2*SLOTBITS+BUCKBITS));;
}
// size (in bytes) of hash in round 0 <= r < WK
uint32_t hashsize(const uint32_t r)
{
#ifdef XINTREE
const uint32_t hashbits = WN - (r+1) * DIGITBITS;
#else
const uint32_t hashbits = WN - (r+1) * DIGITBITS + RESTBITS;
#endif
return (hashbits + 7) / 8;
}
uint32_t hashwords(uint32_t bytes)
{
return (bytes + 3) / 4;
}
htlayout htlayout_create_2(uint32_t r)
{
htlayout R;
R.prevhashunits = 0;
R.dunits = 0;
uint32_t nexthashbytes = hashsize(r);
R.nexthashunits = hashwords(nexthashbytes);
R.prevbo = 0;
R.nextbo = R.nexthashunits * sizeof(hashunit) - nexthashbytes; // 0-3
if (r) {
uint32_t prevhashbytes = hashsize(r-1);
R.prevhashunits = hashwords(prevhashbytes);
R.prevbo = R.prevhashunits * sizeof(hashunit) - prevhashbytes; // 0-3
R.dunits = R.prevhashunits - R.nexthashunits;
}
return R;
}
uint32_t htlayout_getxhash0(uint32_t prevbo, __global const slot0 *pslot)
{
#ifdef XINTREE
return tree_xhash(pslot->attr);
#elif WN == 200 && RESTBITS == 4
return pslot->hash->bytes[prevbo] >> 4;
#elif WN == 200 && RESTBITS == 8
return (pslot->hash->bytes[prevbo] & 0xf) << 4 | pslot->hash->bytes[prevbo+1] >> 4;
#elif WN == 144 && RESTBITS == 4
return pslot->hash->bytes[prevbo] & 0xf;
#elif WN == 200 && RESTBITS == 6
return (pslot->hash->bytes[prevbo] & 0x3) << 4 | pslot->hash->bytes[prevbo+1] >> 4;
#else
#error non implemented
#endif
}
uint32_t htlayout_getxhash1(uint32_t prevbo, __global const slot1 *pslot)
{
#ifdef XINTREE
return tree_xhash(pslot->attr);
#elif WN == 200 && RESTBITS == 4
return pslot->hash->bytes[prevbo] & 0xf;
#elif WN == 200 && RESTBITS == 8
return pslot->hash->bytes[prevbo];
#elif WN == 144 && RESTBITS == 4
return pslot->hash->bytes[prevbo] & 0xf;
#elif WN == 200 && RESTBITS == 6
return pslot->hash->bytes[prevbo] & 0x3f;
#else
#error non implemented
#endif
}
bool htlayout_equal(uint32_t prevhashunits, __global const hashunit *hash0, __global const hashunit *hash1)
{
return hash0[prevhashunits-1].word == hash1[prevhashunits-1].word;
}
void collisiondata_clear(collisiondata *data)
{
#ifdef XBITMAP
// memset(xhashmap, 0, NRESTS * sizeof(u64));
for (unsigned i = 0; i < NRESTS; i++)
data->xhashmap[i] = 0;
#else
// memset(nxhashslots, 0, NRESTS * sizeof(xslot));
for (unsigned i = 0; i < NRESTS; i++)
data->nxhashslots[i] = 0;
#endif
}
bool collisiondata_addslot(collisiondata *data, uint32_t s1, uint32_t xh)
{
#ifdef XBITMAP
data->xmap = data->xhashmap[xh];
data->xhashmap[xh] |= (uint64_t)1 << s1;
data->s0 = ~0;
return true;
#else
data->n1 = (uint32_t)data->nxhashslots[xh]++;
if (data->n1 >= XFULL)
return false;
data->xx = data->xhashslots[xh];
data->xx[data->n1] = s1;
data->n0 = 0;
return true;
#endif
}
bool collisiondata_nextcollision(collisiondata *data)
{
#ifdef XBITMAP
return data->xmap != 0;
#else
return data->n0 < data->n1;
#endif
}
uint64_t __ffsll(uint64_t x)
{
return x ? (64 - clz(x & -x)) : 0;
}
uint32_t collisiondata_slot(collisiondata *data) {
#ifdef XBITMAP
const uint32_t ffs = __ffsll(xmap);
data->s0 += ffs;
data->xmap >>= ffs;
return data->s0;
#else
return (uint32_t)data->xx[data->n0++];
#endif
}
uint32_t equi_getnslots(__global bsizes *nslots, const uint32_t r, const uint32_t bid)
{
__global uint32_t *nslot = &nslots[r&1][bid];
const uint32_t n = min(*nslot, NSLOTS);
*nslot = 0;
return n;
}
void equi_orderindices(__global uint32_t *indices, uint32_t size)
{
if (indices[0] > indices[size]) {
for (uint32_t i = 0; i < size; i++) {
const uint32_t tmp = indices[i];
indices[i] = indices[size+i];
indices[size+i] = tmp;
}
}
}
void local_orderindices(uint32_t *indices, uint32_t size)
{
if (indices[0] > indices[size]) {
for (uint32_t i = 0; i < size; i++) {
const uint32_t tmp = indices[i];
indices[i] = indices[size+i];
indices[size+i] = tmp;
}
}
}
void equi_listindices1(__global uint32_t *heap0,
__global uint32_t *heap1,
const tree t,
__global uint32_t *indices)
{
const __global bucket0 *buck = &tree0_ptr(heap0, 0)[tree_bucket(t)];
const uint32_t size = 1 << 0;
indices[0] = tree_getindex((*buck)[tree_slotid0(t)].attr);
indices[size] = tree_getindex((*buck)[tree_slotid1(t)].attr);
equi_orderindices(indices, size);
}
void equi_listindices2(__global uint32_t *heap0,
__global uint32_t *heap1,
const tree t,
__global uint32_t *indices)
{
const __global bucket1 *buck = &tree1_ptr(heap1, 0)[tree_bucket(t)];
const uint32_t size = 1 << 1;
equi_listindices1(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
equi_listindices1(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
equi_orderindices(indices, size);
}
void equi_listindices3(__global uint32_t *heap0,
__global uint32_t *heap1,
const tree t,
__global uint32_t *indices)
{
const __global bucket0 *buck = &tree0_ptr(heap0, 1)[tree_bucket(t)];
const uint32_t size = 1 << 2;
equi_listindices2(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
equi_listindices2(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
equi_orderindices(indices, size);
}
void equi_listindices4(__global uint32_t *heap0,
__global uint32_t *heap1,
const tree t,
__global uint32_t *indices)
{
const __global bucket1 *buck = &tree1_ptr(heap1, 1)[tree_bucket(t)];
const uint32_t size = 1 << 3;
equi_listindices3(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
equi_listindices3(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
equi_orderindices(indices, size);
}
void equi_listindices5(__global uint32_t *heap0,
__global uint32_t *heap1,
const tree t,
__global uint32_t *indices)
{
const __global bucket0 *buck = &tree0_ptr(heap0, 2)[tree_bucket(t)];
const uint32_t size = 1 << 4;
equi_listindices4(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
equi_listindices4(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
equi_orderindices(indices, size);
}
void equi_listindices6(__global uint32_t *heap0,
__global uint32_t *heap1,
const tree t,
__global uint32_t *indices)
{
const __global bucket1 *buck = &tree1_ptr(heap1, 2)[tree_bucket(t)];
const uint32_t size = 1 << 5;
equi_listindices5(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
equi_listindices5(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
equi_orderindices(indices, size);
}
void equi_listindices7(__global uint32_t *heap0,
__global uint32_t *heap1,
const tree t,
__global uint32_t *indices)
{
const __global bucket0 *buck = &tree0_ptr(heap0, 3)[tree_bucket(t)];
const uint32_t size = 1 << 6;
equi_listindices6(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
equi_listindices6(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
equi_orderindices(indices, size);
}
void equi_listindices8(__global uint32_t *heap0,
__global uint32_t *heap1,
const tree t,
__global uint32_t *indices)
{
const __global bucket1 *buck = &tree1_ptr(heap1, 3)[tree_bucket(t)];
const uint32_t size = 1 << 7;
equi_listindices7(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
equi_listindices7(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
equi_orderindices(indices, size);
}
void equi_listindices9(__global uint32_t *heap0,
__global uint32_t *heap1,
const tree t,
__global uint32_t *indices)
{
const __global bucket0 *buck = &tree0_ptr(heap0, 4)[tree_bucket(t)];
const uint32_t size = 1 << 8;
equi_listindices8(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
equi_listindices8(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
equi_orderindices(indices, size);
}
void local_listindices1(__global uint32_t *heap0,
__global uint32_t *heap1,
const tree t,
uint32_t *indices)
{
const __global bucket0 *buck = &tree0_ptr(heap0, 0)[tree_bucket(t)];
const uint32_t size = 1 << 0;
indices[0] = tree_getindex((*buck)[tree_slotid0(t)].attr);
indices[size] = tree_getindex((*buck)[tree_slotid1(t)].attr);
local_orderindices(indices, size);
}
void local_listindices2(__global uint32_t *heap0,
__global uint32_t *heap1,
const tree t,
uint32_t *indices)
{
const __global bucket1 *buck = &tree1_ptr(heap1, 0)[tree_bucket(t)];
const uint32_t size = 1 << 1;
local_listindices1(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
local_listindices1(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
local_orderindices(indices, size);
}
void local_listindices3(__global uint32_t *heap0,
__global uint32_t *heap1,
const tree t,
uint32_t *indices)
{
const __global bucket0 *buck = &tree0_ptr(heap0, 1)[tree_bucket(t)];
const uint32_t size = 1 << 2;
local_listindices2(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
local_listindices2(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
local_orderindices(indices, size);
}
void local_listindices4(__global uint32_t *heap0,
__global uint32_t *heap1,
const tree t,
uint32_t *indices)
{
const __global bucket1 *buck = &tree1_ptr(heap1, 1)[tree_bucket(t)];
const uint32_t size = 1 << 3;
local_listindices3(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
local_listindices3(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
local_orderindices(indices, size);
}
void local_listindices5(__global uint32_t *heap0,
__global uint32_t *heap1,
const tree t,
uint32_t *indices)
{
const __global bucket0 *buck = &tree0_ptr(heap0, 2)[tree_bucket(t)];
const uint32_t size = 1 << 4;
local_listindices4(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
local_listindices4(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
local_orderindices(indices, size);
}
void local_listindices6(__global uint32_t *heap0,
__global uint32_t *heap1,
const tree t,
uint32_t *indices)
{
const __global bucket1 *buck = &tree1_ptr(heap1, 2)[tree_bucket(t)];
const uint32_t size = 1 << 5;
local_listindices5(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
local_listindices5(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
local_orderindices(indices, size);
}
void local_listindices7(__global uint32_t *heap0,
__global uint32_t *heap1,
const tree t,
uint32_t *indices)
{
const __global bucket0 *buck = &tree0_ptr(heap0, 3)[tree_bucket(t)];
const uint32_t size = 1 << 6;
local_listindices6(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
local_listindices6(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
local_orderindices(indices, size);
}
void local_listindices8(__global uint32_t *heap0,
__global uint32_t *heap1,
const tree t,
uint32_t *indices)
{
const __global bucket1 *buck = &tree1_ptr(heap1, 3)[tree_bucket(t)];
const uint32_t size = 1 << 7;
local_listindices7(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
local_listindices7(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
local_orderindices(indices, size);
}
void local_listindices9(__global uint32_t *heap0,
__global uint32_t *heap1,
const tree t,
uint32_t *indices)
{
const __global bucket0 *buck = &tree0_ptr(heap0, 4)[tree_bucket(t)];
const uint32_t size = 1 << 8;
local_listindices8(heap0, heap1, (*buck)[tree_slotid0(t)].attr, indices);
local_listindices8(heap0, heap1, (*buck)[tree_slotid1(t)].attr, indices+size);
local_orderindices(indices, size);
}
// proper dupe test is a little costly on GPU, so allow false negatives
bool equi_probdupe(uint32_t *prf) {
unsigned short susp[PROOFSIZE];
for (unsigned i = 0; i < PROOFSIZE; i++)
susp[i] = 0xFFFF;
for (unsigned i = 0; i < PROOFSIZE; i++) {
uint32_t bin = prf[i] & (PROOFSIZE-1);
unsigned short msb = prf[i] >> WK;
if (msb == susp[bin])
return true;
susp[bin] = msb;
}
return false;
}
void equi_candidate(__global uint32_t *heap0,
__global uint32_t *heap1,
__global proof *sols,
__global uint32_t *nsols,
const tree t)
{
proof prf;
#if WK==9
local_listindices9(heap0, heap1, t, (uint32_t*)&prf);
#elif WK==5
local_listindices5(heap0, heap1, t, (uint32_t*)&prf);
#else
#error not implemented
#endif
if (equi_probdupe(prf))
return;
uint32_t soli = atomic_inc(nsols);
if (soli < MAXSOLS)
#if WK==9
equi_listindices9(heap0, heap1, t, sols[soli]);
#elif WK==5
equi_listindices5(heap0, heap1, t, sols[soli]);
#else
#error not implemented
#endif
}
__kernel void digitH(__global blake2b_state *blake2bState,
__global const uint32_t *heap0,
__global bsizes *nslots)
{
uint8_t hash[HASHOUT];
blake2b_state state;
// equi::htlayout htl(eq, 0);
htlayout htl = htlayout_create_2(0);
const uint32_t hashbytes = hashsize(0);
// const uint32_t id = blockIdx.x * blockDim.x + threadIdx.x;
const uint32_t id = get_global_id(0);
for (uint32_t block = id; block < NBLOCKS; block += get_global_size(0)) {
state = *blake2bState;
blake2b_gpu_hash(&state, block, hash, HASHOUT);
for (uint32_t i = 0; i < HASHESPERBLAKE; i++) {
const uint8_t *ph = hash + i * WN/8;
#if BUCKBITS == 16 && RESTBITS == 4
const uint32_t bucketid = ((uint32_t)ph[0] << 8) | ph[1];
#ifdef XINTREE
const uint32_t xhash = ph[2] >> 4;
#endif
#elif BUCKBITS == 14 && RESTBITS == 6
const uint32_t bucketid = ((uint32_t)ph[0] << 6) | ph[1] >> 2;
#elif BUCKBITS == 12 && RESTBITS == 8
const uint32_t bucketid = ((uint32_t)ph[0] << 4) | ph[1] >> 4;
#elif BUCKBITS == 20 && RESTBITS == 4
const uint32_t bucketid = ((((uint32_t)ph[0] << 8) | ph[1]) << 4) | ph[2] >> 4;
#ifdef XINTREE
const uint32_t xhash = ph[2] & 0xf;
#endif
#elif BUCKBITS == 12 && RESTBITS == 4
const uint32_t bucketid = ((uint32_t)ph[0] << 4) | ph[1] >> 4;
const uint32_t xhash = ph[1] & 0xf;
#else
#error not implemented
#endif
const uint32_t slot = atomic_inc(&nslots[0][bucketid]);
if (slot >= NSLOTS)
continue;
tree leaf;
tree_setindex(&leaf, block*HASHESPERBLAKE+i);
#ifdef XINTREE
tree_setxhash(&leaf, xhash);
#endif
__global slot0 *s = &tree0_ptr(heap0, 0)[bucketid][slot];
s->attr = leaf;
// memcpy(s.hash->bytes+htl.nextbo, ph+WN/8-hashbytes, hashbytes);
for (unsigned i = 0; i < hashbytes; i++)
((__global uint8_t*)s->hash->bytes+htl.nextbo)[i] = ((uint8_t*)(ph+WN/8-hashbytes))[i];
}
}
}
__kernel void digitOdd(const uint32_t r,
__global uint32_t *heap0,
__global uint32_t *heap1,
__global bsizes *nslots)
{
// equi::htlayout htl(eq, r);
// htlayout htl = htlayout_create(eq, r);
htlayout htl = htlayout_create_2(r);
collisiondata cd;
// const uint32_t id = blockIdx.x * blockDim.x + threadIdx.x;
const uint32_t id = get_global_id(0);
for (uint32_t bucketid = id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
// cd.clear();
collisiondata_clear(&cd);
// __global slot0 *buck = htl.hta.trees0[(r-1)/2][bucketid]; // optimize by updating previous buck?!
__global slot0 *buck = tree0_ptr(heap0, (r-1)/2)[bucketid]; // optimize by updating previous buck?!
uint32_t bsize = equi_getnslots(nslots, r-1, bucketid); // optimize by putting bucketsize with block?!
for (uint32_t s1 = 0; s1 < bsize; s1++) {
__global const slot0 *pslot1 = buck + s1; // optimize by updating previous pslot1?!
if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1)))
continue;
for (; collisiondata_nextcollision(&cd); ) {
const uint32_t s0 = collisiondata_slot(&cd);
__global const slot0 *pslot0 = buck + s0;
if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
continue;
uint32_t xorbucketid;
uint32_t xhash;
__global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
#if WN == 200 && BUCKBITS == 16 && RESTBITS == 4 && defined(XINTREE)
xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8)
| (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4
| (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
xhash &= 0xf;
#elif WN == 144 && BUCKBITS == 20 && RESTBITS == 4
xorbucketid = ((((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 8)
| (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2])) << 4)
| (xhash = bytes0[htl.prevbo+3] ^ bytes1[htl.prevbo+3]) >> 4;
xhash &= 0xf;
#elif WN == 96 && BUCKBITS == 12 && RESTBITS == 4
xorbucketid = ((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 4)
| (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
xhash &= 0xf;
#elif WN == 200 && BUCKBITS == 14 && RESTBITS == 6
xorbucketid = ((((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) & 0xf) << 8)
| (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2])) << 2
| (bytes0[htl.prevbo+3] ^ bytes1[htl.prevbo+3]) >> 6;
#else
#error not implemented
#endif
const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]);
if (xorslot >= NSLOTS)
continue;
#ifdef XINTREE
tree xort = tree_create4(bucketid, s0, s1, xhash);
#else
tree xort = tree_create3(bucketid, s0, s1);
#endif
// __global slot1 *xs = &htl.hta.trees1[r/2][xorbucketid][xorslot];
__global slot1 *xs = &tree1_ptr(heap1, r/2)[xorbucketid][xorslot];
xs->attr = xort;
for (uint32_t i = htl.dunits; i < htl.prevhashunits; i++)
xs->hash[i-htl.dunits].word = pslot0->hash[i].word ^ pslot1->hash[i].word;
}
}
}
}
__kernel void digitEven(const uint32_t r,
__global uint32_t *heap0,
__global uint32_t *heap1,
__global bsizes *nslots)
{
// equi::htlayout htl(eq, r);
// htlayout htl = htlayout_create(eq, r);
htlayout htl = htlayout_create_2(r);
collisiondata cd;
// const uint32_t id = blockIdx.x * blockDim.x + threadIdx.x;
const uint32_t id = get_global_id(0);
for (uint32_t bucketid = id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
// cd.clear();
collisiondata_clear(&cd);
// __global slot1 *buck = htl.hta.trees1[(r-1)/2][bucketid]; // OPTIMIZE BY UPDATING PREVIOUS
__global slot1 *buck = tree1_ptr(heap1, (r-1)/2)[bucketid]; // OPTIMIZE BY UPDATING PREVIOUS
uint32_t bsize = equi_getnslots(nslots, r-1, bucketid);
for (uint32_t s1 = 0; s1 < bsize; s1++) {
__global const slot1 *pslot1 = buck + s1; // OPTIMIZE BY UPDATING PREVIOUS
if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1)))
continue;
for (; collisiondata_nextcollision(&cd); ) {
const uint32_t s0 = collisiondata_slot(&cd);
__global const slot1 *pslot0 = buck + s0;
if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
continue;
uint32_t xorbucketid;
uint32_t xhash;
__global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
#if WN == 200 && BUCKBITS == 16 && RESTBITS == 4 && defined(XINTREE)
xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8)
| (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]);
xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
#elif WN == 144 && BUCKBITS == 20 && RESTBITS == 4
xorbucketid = ((((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 8)
| (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2])) << 4)
| (bytes0[htl.prevbo+3] ^ bytes1[htl.prevbo+3]) >> 4;
#elif WN == 96 && BUCKBITS == 12 && RESTBITS == 4
xorbucketid = ((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 4)
| (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
#elif WN == 200 && BUCKBITS == 14 && RESTBITS == 6
xorbucketid = ((uint32_t)(bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]) << 6)
| (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 2;
#else
#error not implemented
#endif
const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]);
if (xorslot >= NSLOTS)
continue;
#ifdef XINTREE
tree xort = tree_create4(bucketid, s0, s1, xhash);
#else
tree xort = tree_create3(bucketid, s0, s1);
#endif
// __global slot0 *xs = &htl.hta.trees0[r/2][xorbucketid][xorslot];
__global slot0 *xs = &tree0_ptr(heap0, r/2)[xorbucketid][xorslot];
xs->attr = xort;
for (uint32_t i=htl.dunits; i < htl.prevhashunits; i++)
xs->hash[i-htl.dunits].word = pslot0->hash[i].word ^ pslot1->hash[i].word;
}
}
}
}
#ifdef UNROLL
__kernel void digit_1(__global uint32_t *heap0,
__global uint32_t *heap1,
__global bsizes *nslots)
{
htlayout htl = htlayout_create_2(1);
collisiondata cd;
const uint32_t id = get_global_id(0);
for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
collisiondata_clear(&cd);
__global slot0 *buck = tree0_ptr(heap0, 0)[bucketid];
uint32_t bsize = equi_getnslots(nslots, 0, bucketid);
for (uint32_t s1 = 0; s1 < bsize; s1++) {
__global const slot0 *pslot1 = buck + s1;
if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1)))
continue;
for (; collisiondata_nextcollision(&cd); ) {
const uint32_t s0 = collisiondata_slot(&cd);
__global const slot0 *pslot0 = buck + s0;
if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
continue;
uint32_t xorbucketid;
uint32_t xhash;
__global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8)
| (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4
| (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
xhash &= 0xf;
const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]);
if (xorslot >= NSLOTS)
continue;
tree xort = tree_create4(bucketid, s0, s1, xhash);
// __global slot1 *xs = &htl.hta.trees1[0][xorbucketid][xorslot];
__global slot1 *xs = &tree1_ptr(heap1, 0)[xorbucketid][xorslot];
xs->attr = xort;
xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word;
xs->hash[1].word = pslot0->hash[2].word ^ pslot1->hash[2].word;
xs->hash[2].word = pslot0->hash[3].word ^ pslot1->hash[3].word;
xs->hash[3].word = pslot0->hash[4].word ^ pslot1->hash[4].word;
xs->hash[4].word = pslot0->hash[5].word ^ pslot1->hash[5].word;
}
}
}
}
__kernel void digit_2(__global uint32_t *heap0,
__global uint32_t *heap1,
__global bsizes *nslots) {
htlayout htl = htlayout_create_2(2);
collisiondata cd;
const uint32_t id = get_global_id(0);
for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
collisiondata_clear(&cd);
// __global slot1 *buck = htl.hta.trees1[0][bucketid];
__global slot1 *buck = tree1_ptr(heap1, 0)[bucketid];
uint32_t bsize = equi_getnslots(nslots, 1, bucketid);
for (uint32_t s1 = 0; s1 < bsize; s1++) {
__global const slot1 *pslot1 = buck + s1;
if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1)))
continue;
for (; collisiondata_nextcollision(&cd); ) {
const uint32_t s0 = collisiondata_slot(&cd);
__global const slot1 *pslot0 = buck + s0;
if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
continue;
uint32_t xorbucketid;
uint32_t xhash;
__global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8)
| (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]);
xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]);
if (xorslot >= NSLOTS)
continue;
tree xort = tree_create4(bucketid, s0, s1, xhash);
// __global slot0 *xs = &htl.hta.trees0[1][xorbucketid][xorslot];
__global slot0 *xs = &tree0_ptr(heap0, 1)[xorbucketid][xorslot];
xs->attr = xort;
xs->hash[0].word = pslot0->hash[0].word ^ pslot1->hash[0].word;
xs->hash[1].word = pslot0->hash[1].word ^ pslot1->hash[1].word;
xs->hash[2].word = pslot0->hash[2].word ^ pslot1->hash[2].word;
xs->hash[3].word = pslot0->hash[3].word ^ pslot1->hash[3].word;
xs->hash[4].word = pslot0->hash[4].word ^ pslot1->hash[4].word;
}
}
}
}
__kernel void digit_3(__global uint32_t *heap0,
__global uint32_t *heap1,
__global bsizes *nslots) {
htlayout htl = htlayout_create_2(3);
collisiondata cd;
const uint32_t id = get_global_id(0);
for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
collisiondata_clear(&cd);
// __global slot0 *buck = htl.hta.trees0[1][bucketid];
__global slot0 *buck = tree0_ptr(heap0, 1)[bucketid];
uint32_t bsize = equi_getnslots(nslots, 2, bucketid);
for (uint32_t s1 = 0; s1 < bsize; s1++) {
__global const slot0 *pslot1 = buck + s1;
if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1)))
continue;
for (; collisiondata_nextcollision(&cd); ) {
const uint32_t s0 = collisiondata_slot(&cd);
__global const slot0 *pslot0 = buck + s0;
if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
continue;
uint32_t xorbucketid;
uint32_t xhash;
__global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8)
| (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4
| (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
xhash &= 0xf;
const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]);
if (xorslot >= NSLOTS)
continue;
tree xort = tree_create4(bucketid, s0, s1, xhash);
// __global slot1 *xs = &htl.hta.trees1[1][xorbucketid][xorslot];
__global slot1 *xs = &tree1_ptr(heap1, 1)[xorbucketid][xorslot];
xs->attr = xort;
xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word;
xs->hash[1].word = pslot0->hash[2].word ^ pslot1->hash[2].word;
xs->hash[2].word = pslot0->hash[3].word ^ pslot1->hash[3].word;
xs->hash[3].word = pslot0->hash[4].word ^ pslot1->hash[4].word;
}
}
}
}
__kernel void digit_4(__global uint32_t *heap0,
__global uint32_t *heap1,
__global bsizes *nslots) {
htlayout htl = htlayout_create_2(4);
collisiondata cd;
const uint32_t id = get_global_id(0);
for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
collisiondata_clear(&cd);
// __global slot1 *buck = htl.hta.trees1[1][bucketid];
__global slot1 *buck = tree1_ptr(heap1, 1)[bucketid];
uint32_t bsize = equi_getnslots(nslots, 3, bucketid);
for (uint32_t s1 = 0; s1 < bsize; s1++) {
__global const slot1 *pslot1 = buck + s1;
if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1)))
continue;
for (; collisiondata_nextcollision(&cd); ) {
const uint32_t s0 = collisiondata_slot(&cd);
__global const slot1 *pslot0 = buck + s0;
if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
continue;
uint32_t xorbucketid;
uint32_t xhash;
__global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8)
| (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]);
xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]);
if (xorslot >= NSLOTS)
continue;
tree xort = tree_create4(bucketid, s0, s1, xhash);
// __global slot0 *xs = &htl.hta.trees0[2][xorbucketid][xorslot];
__global slot0 *xs = &tree0_ptr(heap0, 2)[xorbucketid][xorslot];
xs->attr = xort;
xs->hash[0].word = pslot0->hash[0].word ^ pslot1->hash[0].word;
xs->hash[1].word = pslot0->hash[1].word ^ pslot1->hash[1].word;
xs->hash[2].word = pslot0->hash[2].word ^ pslot1->hash[2].word;
xs->hash[3].word = pslot0->hash[3].word ^ pslot1->hash[3].word;
}
}
}
}
__kernel void digit_5(__global uint32_t *heap0,
__global uint32_t *heap1,
__global bsizes *nslots) {
htlayout htl = htlayout_create_2(5);
collisiondata cd;
const uint32_t id = get_global_id(0);
for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
collisiondata_clear(&cd);
// __global slot0 *buck = htl.hta.trees0[2][bucketid];
__global slot0 *buck = tree0_ptr(heap0, 2)[bucketid];
uint32_t bsize = equi_getnslots(nslots, 4, bucketid);
for (uint32_t s1 = 0; s1 < bsize; s1++) {
__global const slot0 *pslot1 = buck + s1;
if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1)))
continue;
for (; collisiondata_nextcollision(&cd); ) {
const uint32_t s0 = collisiondata_slot(&cd);
__global const slot0 *pslot0 = buck + s0;
if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
continue;
uint32_t xorbucketid;
uint32_t xhash;
__global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8)
| (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4
| (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
xhash &= 0xf;
const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]);
if (xorslot >= NSLOTS)
continue;
tree xort = tree_create4(bucketid, s0, s1, xhash);
// __global slot1 *xs = &htl.hta.trees1[2][xorbucketid][xorslot];
__global slot1 *xs = &tree1_ptr(heap1, 2)[xorbucketid][xorslot];
xs->attr = xort;
xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word;
xs->hash[1].word = pslot0->hash[2].word ^ pslot1->hash[2].word;
xs->hash[2].word = pslot0->hash[3].word ^ pslot1->hash[3].word;
}
}
}
}
__kernel void digit_6(__global uint32_t *heap0,
__global uint32_t *heap1,
__global bsizes *nslots) {
htlayout htl = htlayout_create_2(6);
collisiondata cd;
const uint32_t id = get_global_id(0);
for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
collisiondata_clear(&cd);
// __global slot1 *buck = htl.hta.trees1[2][bucketid];
__global slot1 *buck = tree1_ptr(heap1, 2)[bucketid];
uint32_t bsize = equi_getnslots(nslots, 5, bucketid);
for (uint32_t s1 = 0; s1 < bsize; s1++) {
__global const slot1 *pslot1 = buck + s1;
if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1)))
continue;
for (; collisiondata_nextcollision(&cd); ) {
const uint32_t s0 = collisiondata_slot(&cd);
__global const slot1 *pslot0 = buck + s0;
if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
continue;
uint32_t xorbucketid;
uint32_t xhash;
__global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8)
| (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]);
xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]);
if (xorslot >= NSLOTS)
continue;
tree xort = tree_create4(bucketid, s0, s1, xhash);
// __global slot0 *xs = &htl.hta.trees0[3][xorbucketid][xorslot];
__global slot0 *xs = &tree0_ptr(heap0, 3)[xorbucketid][xorslot];
xs->attr = xort;
xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word;
xs->hash[1].word = pslot0->hash[2].word ^ pslot1->hash[2].word;
}
}
}
}
__kernel void digit_7(__global uint32_t *heap0,
__global uint32_t *heap1,
__global bsizes *nslots) {
htlayout htl = htlayout_create_2(7);
collisiondata cd;
const uint32_t id = get_global_id(0);
for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
collisiondata_clear(&cd);
// __global slot0 *buck = htl.hta.trees0[3][bucketid];
__global slot0 *buck = tree0_ptr(heap0, 3)[bucketid];
uint32_t bsize = equi_getnslots(nslots, 6, bucketid);
for (uint32_t s1 = 0; s1 < bsize; s1++) {
__global const slot0 *pslot1 = buck + s1;
if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1)))
continue;
for (; collisiondata_nextcollision(&cd); ) {
const uint32_t s0 = collisiondata_slot(&cd);
__global const slot0 *pslot0 = buck + s0;
if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
continue;
uint32_t xorbucketid;
uint32_t xhash;
__global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
xorbucketid = ((((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) & 0xf) << 8)
| (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1])) << 4
| (xhash = bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
xhash &= 0xf;
const uint32_t xorslot = atomic_inc(&nslots[1][xorbucketid]);
if (xorslot >= NSLOTS)
continue;
tree xort = tree_create4(bucketid, s0, s1, xhash);
// __global slot1 *xs = &htl.hta.trees1[3][xorbucketid][xorslot];
__global slot1 *xs = &tree1_ptr(heap1, 3)[xorbucketid][xorslot];
xs->attr = xort;
xs->hash[0].word = pslot0->hash[0].word ^ pslot1->hash[0].word;
xs->hash[1].word = pslot0->hash[1].word ^ pslot1->hash[1].word;
}
}
}
}
__kernel void digit_8(__global uint32_t *heap0,
__global uint32_t *heap1,
__global bsizes *nslots) {
htlayout htl = htlayout_create_2(8);
collisiondata cd;
const uint32_t id = get_global_id(0);
for (uint32_t bucketid=id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
collisiondata_clear(&cd);
// __global slot1 *buck = htl.hta.trees1[3][bucketid];
__global slot1 *buck = tree1_ptr(heap1, 3)[bucketid];
uint32_t bsize = equi_getnslots(nslots, 7, bucketid);
for (uint32_t s1 = 0; s1 < bsize; s1++) {
__global const slot1 *pslot1 = buck + s1; // OPTIMIZE BY UPDATING PREVIOUS
if (!collisiondata_addslot(&cd, s1, htlayout_getxhash1(htl.prevbo, pslot1)))
continue;
for (; collisiondata_nextcollision(&cd); ) {
const uint32_t s0 = collisiondata_slot(&cd);
__global const slot1 *pslot0 = buck + s0;
if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash))
continue;
uint32_t xorbucketid;
uint32_t xhash;
__global const uint8_t *bytes0 = pslot0->hash->bytes, *bytes1 = pslot1->hash->bytes;
xorbucketid = ((uint32_t)(bytes0[htl.prevbo] ^ bytes1[htl.prevbo]) << 8)
| (bytes0[htl.prevbo+1] ^ bytes1[htl.prevbo+1]);
xhash = (bytes0[htl.prevbo+2] ^ bytes1[htl.prevbo+2]) >> 4;
const uint32_t xorslot = atomic_inc(&nslots[0][xorbucketid]);
if (xorslot >= NSLOTS)
continue;
tree xort = tree_create4(bucketid, s0, s1, xhash);
// __global slot0 *xs = &htl.hta.trees0[4][xorbucketid][xorslot];
__global slot0 *xs = &tree0_ptr(heap0, 4)[xorbucketid][xorslot];
xs->attr = xort;
xs->hash[0].word = pslot0->hash[1].word ^ pslot1->hash[1].word;
}
}
}
}
#endif //UNROLL
__kernel void digitK(__global uint32_t *heap0,
__global uint32_t *heap1,
__global bsizes *nslots,
__global proof *sols,
__global uint32_t *nsols) {
collisiondata cd;
htlayout htl = htlayout_create_2(WK);
const uint32_t id = get_global_id(0);
for (uint32_t bucketid = id; bucketid < NBUCKETS; bucketid += get_global_size(0)) {
collisiondata_clear(&cd);
__global slot0 *buck = tree0_ptr(heap0, (WK-1)/2)[bucketid];
uint32_t bsize = equi_getnslots(nslots, WK-1, bucketid);
for (uint32_t s1 = 0; s1 < bsize; s1++) {
__global const slot0 *pslot1 = buck + s1;
if (!collisiondata_addslot(&cd, s1, htlayout_getxhash0(htl.prevbo, pslot1))) // assume WK odd
continue;
for (; collisiondata_nextcollision(&cd); ) {
const uint32_t s0 = collisiondata_slot(&cd);
__global const slot0 *pslot0 = buck + s0;
if (htlayout_equal(htl.prevhashunits, pslot0->hash, pslot1->hash)) {
tree xort = tree_create3(bucketid, s0, s1);
equi_candidate(heap0, heap1, sols, nsols, xort);
}
}
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment