Last active
February 13, 2025 06:07
-
-
Save pps83/33704502b160708626c3de2d5a9bf6a9 to your computer and use it in GitHub Desktop.
umul128
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdint.h> | |
#include <intrin.h> | |
// based on XXH_mult64to128 from xxHash https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h#L4464 | |
uint64_t umul128(uint64_t x, uint64_t y, uint64_t* hi) | |
{ | |
#if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) \ | |
&& defined(__SIZEOF_INT128__) \ | |
|| (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) | |
// gcc/clang __uint128_t | |
__uint128_t const product = (__uint128_t)x * (__uint128_t)y; | |
*hi = (uint64_t)(product >> 64); | |
return (uint64_t)(product); | |
#elif defined(_M_X64) // msvc _umul128 method for x64 | |
#ifndef _MSC_VER | |
# pragma intrinsic(_umul128) | |
#endif | |
return _umul128(x, y, hi); | |
#elif defined(_M_ARM64) || defined(_M_ARM64EC) // msvc __umulh method for arm64 | |
#ifndef _MSC_VER | |
# pragma intrinsic(__umulh) | |
#endif | |
* hi = __umulh(x, y); | |
return = x * y; | |
#else | |
/* | |
* Portable scalar method. Optimized for 32-bit and 64-bit ALUs. | |
* | |
* This is a fast and simple grade school multiply, which is shown below | |
* with base 10 arithmetic instead of base 0x100000000. | |
* | |
* 9 3 // D2 lhs = 93 | |
* x 7 5 // D2 rhs = 75 | |
* ---------- | |
* 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15 | |
* 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45 | |
* 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21 | |
* + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63 | |
* --------- | |
* 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27 | |
* + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67 | |
* --------- | |
* 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975 | |
* | |
* The reasons for adding the products like this are: | |
* 1. It avoids manual carry tracking. Just like how | |
* (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX. | |
* This avoids a lot of complexity. | |
* | |
* 2. It hints for, and on Clang, compiles to, the powerful UMAAL | |
* instruction available in ARM's Digital Signal Processing extension | |
* in 32-bit ARMv6 and later, which is shown below: | |
* | |
* void UMAAL(uint32_t *RdLo, uint32_t *RdHi, uint32_t Rn, uint32_t Rm) | |
* { | |
* uint64_t product = (uint64_t)*RdLo * (uint64_t)*RdHi + Rn + Rm; | |
* *RdLo = (uint32_t)(product & 0xFFFFFFFF); | |
* *RdHi = (uint32_t)(product >> 32); | |
* } | |
* | |
* This instruction was designed for efficient long multiplication, and | |
* allows this to be calculated in only 4 instructions at speeds | |
* comparable to some 64-bit ALUs. | |
* | |
* 3. It isn't terrible on other platforms. Usually this will be a couple | |
* of 32-bit ADD/ADCs. | |
*/ | |
#if defined(_MSC_VER) && defined(_M_IX86) | |
# define mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y)) | |
#else | |
/* | |
* Downcast + upcast is usually better than masking on older compilers like | |
* GCC 4.2 (especially 32-bit ones), all without affecting newer compilers. | |
* | |
* The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands | |
* and perform a full 64x64 multiply -- entirely redundant on 32-bit. | |
*/ | |
# define mult32to64(x, y) ((uint64_t)(uint32_t)(x) * (uint64_t)(uint32_t)(y)) | |
#endif | |
/* First calculate all of the cross products. */ | |
uint64_t const lo_lo = mult32to64(x & 0xFFFFFFFF, y & 0xFFFFFFFF); | |
uint64_t const hi_lo = mult32to64(x >> 32, y & 0xFFFFFFFF); | |
uint64_t const lo_hi = mult32to64(x & 0xFFFFFFFF, y >> 32); | |
uint64_t const hi_hi = mult32to64(x >> 32, y >> 32); | |
/* Now add the products together. These will never overflow. */ | |
uint64_t const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; | |
uint64_t const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; | |
uint64_t const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); | |
*hi = upper; | |
return lower; | |
#undef mult32to64 | |
#endif | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment