Skip to content

Instantly share code, notes, and snippets.

@KodrAus
Created May 18, 2025 11:10
Show Gist options
  • Save KodrAus/f4be6b812b29ef421d8be20e365b628e to your computer and use it in GitHub Desktop.
Save KodrAus/f4be6b812b29ef421d8be20e365b628e to your computer and use it in GitHub Desktop.
A crummy little hashset
use std::collections::HashSet;
/**
A simple, reasonably space-efficient, stack-based datastructure that tracks whether a given string key has been seen before.
This is not a general-purpose hash set. It uses non-cryptographic FNV hashing and limited space, so is very prone to collisions.
If a collision is detected then the contents of the filter can be spilled into a regular `HashSet<&str>`.
For small numbers of values, it can be used to avoid or defer allocation.
The filter accepts two generic parameters:
- `VALUES`: The maximum number of values that can be stored in the filter.
- `SLOTS`: The number of slots in the filter. The more slots, the less likely collisions are. This value can be much larger than `VALUES`.
The number of unique values that can be stored before a collision is roughly logarithmic over `SLOTS`.
Annecdotally, 10 slots will observe a collision after ~3 values, 50 slots ~8, 100 slots ~12, 150 slots ~14, and 200 slots ~17.
*/
pub struct Filter<'a, const VALUES: usize, const SLOTS: usize> {
// Slots are more compact than values, so we can improve the quality of our filter
// by allowing space for many slots, even if only up to `VALUES` of them will ever
// be used
slots: [Slot; SLOTS],
// All values between `0..len` are guaranteed to be non-null
values: [Value<'a>; VALUES],
len: usize,
}
/**
A value in a `Filter`.
Values save a little space by only supporting strings with a length that fits in a `u8`.
*/
#[derive(Clone, Copy)]
struct Value<'a> {
ptr: *const u8,
len: u8,
_marker: std::marker::PhantomData<&'a str>,
}
impl<'a> Value<'a> {
/**
Try construct a value from a given string.
If the value is too big to fit then this method will return `None`.
*/
fn new(value: &'a str) -> Option<Self> {
if value.len() > u8::MAX as usize {
return None;
}
Some(Value {
ptr: value.as_bytes().as_ptr(),
len: value.as_bytes().len() as u8,
_marker: Default::default(),
})
}
/**
Construct an empty value.
It is not valid to call `get` on a null value.
*/
fn null() -> Self {
Value {
ptr: std::ptr::null::<u8>(),
len: 0,
_marker: Default::default(),
}
}
/**
Get the underlying string value.
# Safety
Callers must ensure the value is not null.
*/
unsafe fn as_str_unchecked(&self) -> &'a str {
unsafe {
std::str::from_utf8_unchecked(std::slice::from_raw_parts(self.ptr, self.len as usize))
}
}
}
/**
A slot in a `Filter`.
*/
#[derive(Clone, Copy)]
struct Slot {
idx: u8,
}
impl Slot {
/**
Create a new slot for a given index.
The index must be less than (and not equal to) `u8::MAX`.
*/
fn new(idx: usize) -> Self {
debug_assert!(idx < u8::MAX as usize);
Slot {
idx: idx as u8,
}
}
/**
Create a null slot.
*/
fn null() -> Self {
Slot {
idx: u8::MAX,
}
}
/**
Whether the slot is null.
It is valid to get the value of a null slot, but it won't be usable.
*/
fn is_null(&self) -> bool {
self.idx == u8::MAX
}
/**
Get the index stored in the slot.
*/
fn idx(&self) -> usize {
self.idx as usize
}
}
impl<'a, const VALUES: usize, const SLOTS: usize> Filter<'a, VALUES, SLOTS> {
/**
Create a new, empty filter.
*/
pub fn new() -> Self {
assert!(VALUES < u8::MAX as usize);
Filter {
slots: [Slot::null(); SLOTS],
values: [Value::null(); VALUES],
len: 0,
}
}
/**
Insert a value into the filter, returning `true` if this is the first time we've seen it, or `false` if it's not.
If a collision is detected, or the value is too big to fit in the filter, then this method returns `None`.
At this point, the filter needs to be spilled into a `HashSet<&str>`.
*/
pub fn insert(&mut self, value: &'a str) -> Option<bool> {
// Compute the FNV hash of the input
let mut hash = 0xcbf29ce484222325u64;
for b in value.as_bytes() {
hash = hash * 0x100000001b3u64;
hash = hash ^ (*b as u64);
}
let slot = &mut self.slots[(hash as usize) % SLOTS];
if slot.is_null() {
// This is the first time we're seeing this value
let Some(incoming) = Value::new(value) else {
// If the value is too large to fit in our filter then spill
return None;
};
let value = &mut self.values[self.len];
*value = incoming;
*slot = Slot::new(self.len);
self.len += 1;
Some(true)
} else {
// The value may have been seen before
let idx = slot.idx();
// SAFETY: `idx` points to a non-null value
if unsafe { self.values[idx].as_str_unchecked() } != value {
// If we found a collision then spill
return None;
}
Some(false)
}
}
/**
Spill the filter into a regular hash set that can be used to continue checking.
*/
pub fn spill(&self) -> HashSet<&'a str> {
let mut spilled = HashSet::<&'a str>::new();
for value in self.values[..self.len].iter() {
// SAFETY: all values up to `self.len` are guaranteed to be non-null
spilled.insert(unsafe { value.as_str_unchecked() });
}
spilled
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment