Skip to content

Instantly share code, notes, and snippets.

@scottchiefbaker
Last active March 5, 2025 21:44
Show Gist options
  • Save scottchiefbaker/e4c87de8d6c1ad4a33b1ffe5a3ce707f to your computer and use it in GitHub Desktop.
Save scottchiefbaker/e4c87de8d6c1ad4a33b1ffe5a3ce707f to your computer and use it in GitHub Desktop.
Quick benchmark to compare 32-bit and 64-bit PRNGs on an ESP32
#include <stdint.h>
// State words for xoshiro512plusplus()
static uint64_t s[8];
// State words for xoroshiro64starstar()
static uint32_t r[2];
// Running counter state for splitmix64()
static uint64_t sm;
// State words for xoshiro256plus()
static uint64_t fs[4];
// PCG uses a structure: `state` is the generator state, `inc` is the
// increment added on every step (pcg32() ORs it with 1 before use)
typedef struct { uint64_t state; uint64_t inc; } pcg32_random_t;
// Global PCG instance that pcg32() reads and advances
pcg32_random_t rng;
// millis() value at which the current timed benchmark run ends
uint64_t next_out = 0;
// Number of generator calls made during the current timed run
uint32_t count = 0;
// Arduino entry point: bring up the serial port and seed every generator.
// Each state word is filled from its own rdtsc_rand64() call, in the order
// s[], r[], fs[], sm, rng.state, rng.inc.
void setup() {
  Serial.begin(115200);
  delay(1000);

  // xoshiro512++ state
  for (unsigned w = 0; w < sizeof(s) / sizeof(s[0]); ++w) {
    s[w] = rdtsc_rand64();
  }

  // xoroshiro64** state (only the low 32 bits of each seed are kept)
  for (unsigned w = 0; w < sizeof(r) / sizeof(r[0]); ++w) {
    r[w] = rdtsc_rand64();
  }

  // xoshiro256+ state
  for (unsigned w = 0; w < sizeof(fs) / sizeof(fs[0]); ++w) {
    fs[w] = rdtsc_rand64();
  }

  // splitmix64 counter and the PCG state/increment pair
  sm        = rdtsc_rand64();
  rng.state = rdtsc_rand64();
  rng.inc   = rdtsc_rand64();

  next_out = millis() + 1000;
}
// Number of generator calls made between two reads of the millisecond clock.
// Checking millis() after every single call adds the cost of the clock read
// to each iteration, which can mask the differences between the generators.
#define PRNG_BENCH_BATCH 256u

// Time one generator for roughly `duration_ms` milliseconds and print the
// number of calls made and the resulting output rate in bytes per second.
//   label       - string literal naming the generator in the report line
//   out_bytes   - bytes produced by one call
//   sink        - volatile lvalue that receives every result, so the compiler
//                 cannot discard the output computation as dead code
//   call        - the generator call expression
//   duration_ms - target run time in milliseconds
// A macro is used (rather than a helper function) so each generator is still
// called directly, with its own signature and return type.
// The rate is computed from the elapsed time actually measured, because a
// batched run may overshoot the deadline by up to one batch.
#define PRNG_BENCH(label, out_bytes, sink, call, duration_ms)                      \
  do {                                                                             \
    const uint32_t bench_start = millis();                                         \
    uint32_t bench_elapsed;                                                        \
    count = 0;                                                                     \
    next_out = millis() + (duration_ms);                                           \
    while (next_out > millis()) {                                                  \
      for (uint32_t bench_i = 0; bench_i < PRNG_BENCH_BATCH; bench_i++) {          \
        (sink) = (call);                                                           \
      }                                                                            \
      count += PRNG_BENCH_BATCH;                                                   \
    }                                                                              \
    bench_elapsed = millis() - bench_start;                                        \
    if (bench_elapsed == 0) {                                                      \
      bench_elapsed = (duration_ms);                                               \
    }                                                                              \
    Serial.printf("Generated %u " label " = %0.1f b/s\r\n", (unsigned)count,       \
                  (count * (double)(out_bytes)) / (bench_elapsed / 1000.0));       \
    count = 0;                                                                     \
  } while (0)

// Arduino main loop: wait five seconds, then benchmark each PRNG in turn for
// `md` milliseconds and report one line per generator on the serial port.
void loop() {
  const uint32_t md = 1000;

  // Results are written here so every call's output is observably used.
  volatile uint32_t sink32 = 11;
  volatile uint64_t sink64 = 11;

  delay(5000);
  Serial.printf("\r\n");

  PRNG_BENCH("x64**",  4, sink32, xoroshiro64starstar(), md);
  PRNG_BENCH("x256+",  8, sink64, xoshiro256plus(),      md);
  PRNG_BENCH("x512++", 8, sink64, xoshiro512plusplus(),  md);
  PRNG_BENCH("sm64",   8, sink64, splitmix64(),          md);
  PRNG_BENCH("pcg32",  4, sink32, pcg32(),               md);
  PRNG_BENCH("pcg64",  8, sink64, pcg64(&rng),           md);

  next_out = millis() + 1000;
}
///////////////////////////////////////////////////////////
// rdtsc_rand
///////////////////////////////////////////////////////////
// Read a fast-moving counter for the platform being compiled for:
//   Windows           - the __rdtsc() intrinsic
//   AArch64           - the cntvct_el0 counter register
//   Arduino           - micros()
//   other GCC / Clang - the x86 rdtsc instruction (EDX:EAX combined)
// Any other target is rejected at compile time.
uint64_t get_rdtsc() {
#if defined(_WIN32) || defined(_WIN64)
  return __rdtsc();
#elif defined(__aarch64__)
  uint64_t ticks;
  __asm__ volatile ("mrs %0, cntvct_el0" : "=r" (ticks));
  return ticks;
#elif defined(ARDUINO)
  return micros();
#elif defined(__GNUC__) || defined(__clang__)
  uint32_t lo, hi;
  __asm__ volatile ("rdtsc" : "=a"(lo), "=d"(hi));
  // rdtsc returns the low half in EAX and the high half in EDX
  return ((uint64_t)hi << 32) | lo;
#else
#error "Unsupported platform"
#endif
}
// Multiply-shift hash: three xor-shift steps (30, 27, 31 bits) with a
// multiplication after each of the first two.
// (Original author's note: passes SmallCrush and PractRand up to 128GB.)
// The multiplier is the odd 64-bit golden-ratio constant, 2^64 / phi. It is
// not a prime: its decimal value 11400714819323198485 is divisible by 5.
// Note that an input of 0 hashes to 0.
static uint64_t hash_msh(uint64_t x) {
  const uint64_t multiplier = 0x9e3779b97f4a7c15;

  x = (x ^ (x >> 30)) * multiplier;
  x = (x ^ (x >> 27)) * multiplier;
  return x ^ (x >> 31);
}
// Produce an unsigned 64-bit seed value by running the current get_rdtsc()
// counter reading through hash_msh().
static uint64_t rdtsc_rand64() {
  return hash_msh(get_rdtsc());
}
///////////////////////////////////////////////////////////
// PRNGs
///////////////////////////////////////////////////////////
// Rotate the 32-bit value x left by k bits.
// The count is reduced modulo 32, so k == 0 (or any multiple of 32) returns x
// unchanged instead of shifting by the full type width, which is undefined
// behaviour. For k in 1..31 the result is the same as the plain
// (x << k) | (x >> (32 - k)) form.
static inline uint32_t rotl(const uint32_t x, int k) {
  const unsigned n = (unsigned)k & 31u;
  return (x << n) | (x >> ((32u - n) & 31u));
}
// Rotate the 64-bit value x left by k bits.
// The count is reduced modulo 64, so k == 0 (or any multiple of 64) returns x
// unchanged instead of shifting by the full type width, which is undefined
// behaviour. For k in 1..63 the result is the same as the plain
// (x << k) | (x >> (64 - k)) form.
static inline uint64_t rotl(const uint64_t x, int k) {
  const unsigned n = (unsigned)k & 63u;
  return (x << n) | (x >> ((64u - n) & 63u));
}
//////////////////////////////////////////////////////////////////
// xoroshiro64**: returns 32 bits and advances the two-word state in r[].
// The output is derived from r[0] only, as it was before the state update.
uint32_t xoroshiro64starstar(void) {
  const uint32_t lo = r[0];
  const uint32_t mixed = r[1] ^ lo;

  // State update with the (a, b, c) = (26, 9, 13) rotate/shift constants
  r[0] = rotl(lo, 26) ^ mixed ^ (mixed << 9);
  r[1] = rotl(mixed, 13);

  // "**" scrambler: multiply, rotate, multiply
  return rotl(lo * 0x9E3779BB, 5) * 5;
}
//////////////////////////////////////////////////////////////////
// xoshiro256+: returns 64 bits and advances the four-word state in fs[].
// The output is the sum of the first and last state words taken before the
// update. The state is worked on in locals and written back at the end.
uint64_t xoshiro256plus(void) {
  uint64_t a = fs[0];
  uint64_t b = fs[1];
  uint64_t c = fs[2];
  uint64_t d = fs[3];

  const uint64_t sum = a + d;
  const uint64_t shifted = b << 17;

  // The order of these xors is part of the algorithm; do not reorder.
  c ^= a;
  d ^= b;
  b ^= c;
  a ^= d;
  c ^= shifted;
  d = rotl(d, 45);

  fs[0] = a;
  fs[1] = b;
  fs[2] = c;
  fs[3] = d;
  return sum;
}
//////////////////////////////////////////////////////////////////
uint64_t xoshiro512plusplus(void) {
const uint64_t result = rotl(s[0] + s[2], 17) + s[2];
const uint64_t t = s[1] << 11;
s[2] ^= s[0];
s[5] ^= s[1];
s[1] ^= s[2];
s[7] ^= s[3];
s[3] ^= s[4];
s[4] ^= s[5];
s[0] ^= s[6];
s[6] ^= s[7];
s[6] ^= t;
s[7] = rotl(s[7], 21);
return result;
}
//////////////////////////////////////////////////////////////////
// splitmix64: adds a fixed constant to the global counter `sm`, then returns
// a mixed copy of the new counter value (64 bits per call).
uint64_t splitmix64() {
  sm += 0x9e3779b97f4a7c15;

  uint64_t mix = sm;
  mix ^= mix >> 30;
  mix *= 0xbf58476d1ce4e5b9;
  mix ^= mix >> 27;
  mix *= 0x94d049bb133111eb;
  mix ^= mix >> 31;
  return mix;
}
//////////////////////////////////////////////////////////////////
// pcg32: returns 32 bits from the global `rng` and advances its state.
uint32_t pcg32() {
  const uint64_t prev = rng.state;

  // LCG step; the increment is forced odd with |1
  rng.state = prev * 6364136223846793005ULL + (rng.inc | 1);

  // Output function (XSH RR), computed from the pre-step state:
  // xorshift the high bits down to 32 bits, then rotate right by the top
  // five bits of the old state.
  const uint32_t xs = (uint32_t)(((prev >> 18u) ^ prev) >> 27u);
  const uint32_t rot = (uint32_t)(prev >> 59u);
  return (xs >> rot) | (xs << ((-rot) & 31));
}
//////////////////////////////////////////////////////////////////
// pcg64: builds a 64-bit value from two consecutive pcg32() draws; the first
// draw becomes the upper 32 bits and the second the lower 32 bits.
// NOTE(review): the `rng` argument is never used. Both draws come from
// pcg32(), which is called without it, so passing a different generator
// here has no effect.
uint64_t pcg64(pcg32_random_t* rng) {
  (void)rng;

  const uint64_t upper = pcg32();
  const uint64_t lower = pcg32();
  return (upper << 32) | lower;
}
@scottchiefbaker
Copy link
Author

On my 32bit ESP32-C3 I'm seeing:

| PRNG | Iterations per second | Output bits | Bytes per second |
| --- | --- | --- | --- |
| pcg32 | 487802 | 32 | 1951266.7 b/s |
| xoroshiro64** | 516023 | 32 | 2050966.7 b/s |
| xoshiro256+ | 487808 | 64 | 3878726.7 b/s |
| xoshiro512++ | 441735 | 64 | 3514373.3 b/s |
| splitmix64 | 462290 | 64 | 3677033.3 b/s |
| pcg64 | 416297 | 64 | 3313060.0 b/s |

There is very little difference in iteration rate between the PRNGs that use 64-bit operations and those that use 32-bit operations. Even on limited hardware like this, it makes sense to use a 64-bit PRNG because each call returns twice as many bytes.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment