Created
May 18, 2025 11:10
-
-
Save KodrAus/f4be6b812b29ef421d8be20e365b628e to your computer and use it in GitHub Desktop.
A crummy little hashset
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use std::collections::HashSet; | |
/** | |
A simple, reasonably space-efficient, stack-based datastructure that tracks whether a given string key has been seen before. | |
This is not a general-purpose hash set. It uses non-cryptographic FNV hashing and limited space, so is very prone to collisions. | |
If a collision is detected then the contents of the filter can be spilled into a regular `HashSet<&str>`. | |
For small numbers of values, it can be used to avoid or defer allocation. | |
The filter accepts two generic parameters: | |
- `VALUES`: The maximum number of values that can be stored in the filter. | |
- `SLOTS`: The number of slots in the filter. The more slots, the less likely collisions are. This value can be much larger than `VALUES`. | |
The number of unique values that can be stored before a collision is roughly logarithmic over `SLOTS`. | |
Annecdotally, 10 slots will observe a collision after ~3 values, 50 slots ~8, 100 slots ~12, 150 slots ~14, and 200 slots ~17. | |
*/ | |
pub struct Filter<'a, const VALUES: usize, const SLOTS: usize> { | |
// Slots are more compact than values, so we can improve the quality of our filter | |
// by allowing space for many slots, even if only up to `VALUES` of them will ever | |
// be used | |
slots: [Slot; SLOTS], | |
// All values between `0..len` are guaranteed to be non-null | |
values: [Value<'a>; VALUES], | |
len: usize, | |
} | |
/** | |
A value in a `Filter`. | |
Values save a little space by only supporting strings with a length that fits in a `u8`. | |
*/ | |
#[derive(Clone, Copy)] | |
struct Value<'a> { | |
ptr: *const u8, | |
len: u8, | |
_marker: std::marker::PhantomData<&'a str>, | |
} | |
impl<'a> Value<'a> { | |
/** | |
Try construct a value from a given string. | |
If the value is too big to fit then this method will return `None`. | |
*/ | |
fn new(value: &'a str) -> Option<Self> { | |
if value.len() > u8::MAX as usize { | |
return None; | |
} | |
Some(Value { | |
ptr: value.as_bytes().as_ptr(), | |
len: value.as_bytes().len() as u8, | |
_marker: Default::default(), | |
}) | |
} | |
/** | |
Construct an empty value. | |
It is not valid to call `get` on a null value. | |
*/ | |
fn null() -> Self { | |
Value { | |
ptr: std::ptr::null::<u8>(), | |
len: 0, | |
_marker: Default::default(), | |
} | |
} | |
/** | |
Get the underlying string value. | |
# Safety | |
Callers must ensure the value is not null. | |
*/ | |
unsafe fn as_str_unchecked(&self) -> &'a str { | |
unsafe { | |
std::str::from_utf8_unchecked(std::slice::from_raw_parts(self.ptr, self.len as usize)) | |
} | |
} | |
} | |
/** | |
A slot in a `Filter`. | |
*/ | |
#[derive(Clone, Copy)] | |
struct Slot { | |
idx: u8, | |
} | |
impl Slot { | |
/** | |
Create a new slot for a given index. | |
The index must be less than (and not equal to) `u8::MAX`. | |
*/ | |
fn new(idx: usize) -> Self { | |
debug_assert!(idx < u8::MAX as usize); | |
Slot { | |
idx: idx as u8, | |
} | |
} | |
/** | |
Create a null slot. | |
*/ | |
fn null() -> Self { | |
Slot { | |
idx: u8::MAX, | |
} | |
} | |
/** | |
Whether the slot is null. | |
It is valid to get the value of a null slot, but it won't be usable. | |
*/ | |
fn is_null(&self) -> bool { | |
self.idx == u8::MAX | |
} | |
/** | |
Get the index stored in the slot. | |
*/ | |
fn idx(&self) -> usize { | |
self.idx as usize | |
} | |
} | |
impl<'a, const VALUES: usize, const SLOTS: usize> Filter<'a, VALUES, SLOTS> { | |
/** | |
Create a new, empty filter. | |
*/ | |
pub fn new() -> Self { | |
assert!(VALUES < u8::MAX as usize); | |
Filter { | |
slots: [Slot::null(); SLOTS], | |
values: [Value::null(); VALUES], | |
len: 0, | |
} | |
} | |
/** | |
Insert a value into the filter, returning `true` if this is the first time we've seen it, or `false` if it's not. | |
If a collision is detected, or the value is too big to fit in the filter, then this method returns `None`. | |
At this point, the filter needs to be spilled into a `HashSet<&str>`. | |
*/ | |
pub fn insert(&mut self, value: &'a str) -> Option<bool> { | |
// Compute the FNV hash of the input | |
let mut hash = 0xcbf29ce484222325u64; | |
for b in value.as_bytes() { | |
hash = hash * 0x100000001b3u64; | |
hash = hash ^ (*b as u64); | |
} | |
let slot = &mut self.slots[(hash as usize) % SLOTS]; | |
if slot.is_null() { | |
// This is the first time we're seeing this value | |
let Some(incoming) = Value::new(value) else { | |
// If the value is too large to fit in our filter then spill | |
return None; | |
}; | |
let value = &mut self.values[self.len]; | |
*value = incoming; | |
*slot = Slot::new(self.len); | |
self.len += 1; | |
Some(true) | |
} else { | |
// The value may have been seen before | |
let idx = slot.idx(); | |
// SAFETY: `idx` points to a non-null value | |
if unsafe { self.values[idx].as_str_unchecked() } != value { | |
// If we found a collision then spill | |
return None; | |
} | |
Some(false) | |
} | |
} | |
/** | |
Spill the filter into a regular hash set that can be used to continue checking. | |
*/ | |
pub fn spill(&self) -> HashSet<&'a str> { | |
let mut spilled = HashSet::<&'a str>::new(); | |
for value in self.values[..self.len].iter() { | |
// SAFETY: all values up to `self.len` are guaranteed to be non-null | |
spilled.insert(unsafe { value.as_str_unchecked() }); | |
} | |
spilled | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment