Skip to content

Instantly share code, notes, and snippets.

@pps83
Last active February 13, 2025 06:07
Show Gist options
  • Save pps83/33704502b160708626c3de2d5a9bf6a9 to your computer and use it in GitHub Desktop.
Save pps83/33704502b160708626c3de2d5a9bf6a9 to your computer and use it in GitHub Desktop.
umul128
#include <stdint.h>
#include <intrin.h>
// based on XXH_mult64to128 from xxHash https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h#L4464
uint64_t umul128(uint64_t x, uint64_t y, uint64_t* hi)
{
#if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) \
&& defined(__SIZEOF_INT128__) \
|| (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
// gcc/clang __uint128_t
__uint128_t const product = (__uint128_t)x * (__uint128_t)y;
*hi = (uint64_t)(product >> 64);
return (uint64_t)(product);
#elif defined(_M_X64) // msvc _umul128 method for x64
#ifndef _MSC_VER
# pragma intrinsic(_umul128)
#endif
return _umul128(x, y, hi);
#elif defined(_M_ARM64) || defined(_M_ARM64EC) // msvc __umulh method for arm64
#ifndef _MSC_VER
# pragma intrinsic(__umulh)
#endif
* hi = __umulh(x, y);
return = x * y;
#else
/*
* Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
*
* This is a fast and simple grade school multiply, which is shown below
* with base 10 arithmetic instead of base 0x100000000.
*
* 9 3 // D2 lhs = 93
* x 7 5 // D2 rhs = 75
* ----------
* 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15
* 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45
* 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21
* + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63
* ---------
* 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27
* + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67
* ---------
* 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975
*
* The reasons for adding the products like this are:
* 1. It avoids manual carry tracking. Just like how
* (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.
* This avoids a lot of complexity.
*
* 2. It hints for, and on Clang, compiles to, the powerful UMAAL
* instruction available in ARM's Digital Signal Processing extension
* in 32-bit ARMv6 and later, which is shown below:
*
* void UMAAL(uint32_t *RdLo, uint32_t *RdHi, uint32_t Rn, uint32_t Rm)
* {
* uint64_t product = (uint64_t)*RdLo * (uint64_t)*RdHi + Rn + Rm;
* *RdLo = (uint32_t)(product & 0xFFFFFFFF);
* *RdHi = (uint32_t)(product >> 32);
* }
*
* This instruction was designed for efficient long multiplication, and
* allows this to be calculated in only 4 instructions at speeds
* comparable to some 64-bit ALUs.
*
* 3. It isn't terrible on other platforms. Usually this will be a couple
* of 32-bit ADD/ADCs.
*/
#if defined(_MSC_VER) && defined(_M_IX86)
# define mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
#else
/*
* Downcast + upcast is usually better than masking on older compilers like
* GCC 4.2 (especially 32-bit ones), all without affecting newer compilers.
*
* The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands
* and perform a full 64x64 multiply -- entirely redundant on 32-bit.
*/
# define mult32to64(x, y) ((uint64_t)(uint32_t)(x) * (uint64_t)(uint32_t)(y))
#endif
/* First calculate all of the cross products. */
uint64_t const lo_lo = mult32to64(x & 0xFFFFFFFF, y & 0xFFFFFFFF);
uint64_t const hi_lo = mult32to64(x >> 32, y & 0xFFFFFFFF);
uint64_t const lo_hi = mult32to64(x & 0xFFFFFFFF, y >> 32);
uint64_t const hi_hi = mult32to64(x >> 32, y >> 32);
/* Now add the products together. These will never overflow. */
uint64_t const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
uint64_t const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
uint64_t const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
*hi = upper;
return lower;
#undef mult32to64
#endif
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment