pps83 · February 13, 2025 06:07
diff --git a/umul128.cpp b/umul128.cpp
 #include <stdint.h>
 #include <intrin.h>

 // based on XXH_mult64to128 from xxHash https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h#L4464
 uint64_t umul128(uint64_t x, uint64_t y, uint64_t* hi)
 {
 #if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) \
    && defined(__SIZEOF_INT128__) \
    || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
    // gcc/clang __uint128_t
    __uint128_t const product = (__uint128_t)x * (__uint128_t)y;
    *hi = (uint64_t)(product >> 64);
    return (uint64_t)(product);
 #elif defined(_M_X64) // msvc _umul128 method for x64
 #ifndef _MSC_VER
 #   pragma intrinsic(_umul128)
 #endif
    return _umul128(x, y, hi);
 #elif defined(_M_ARM64) || defined(_M_ARM64EC) // msvc __umulh method for arm64
 #ifndef _MSC_VER
 #   pragma intrinsic(__umulh)
 #endif
    * hi = __umulh(x, y);
    return = x * y;
 #else
    /*
     * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
     *
     * This is a fast and simple grade school multiply, which is shown below
     * with base 10 arithmetic instead of base 0x100000000.
     *
     *           9 3 // D2 lhs = 93
     *         x 7 5 // D2 rhs = 75
     *     ----------
     *           1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15
     *         4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45
     *         2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21
     *     + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63
     *     ---------
     *         2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27
     *     + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67
     *     ---------
     *       6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975
     *
     * The reasons for adding the products like this are:
     *  1. It avoids manual carry tracking. Just like how
     *     (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.
     *     This avoids a lot of complexity.
     *
     *  2. It hints for, and on Clang, compiles to, the powerful UMAAL
     *     instruction available in ARM's Digital Signal Processing extension
     *     in 32-bit ARMv6 and later, which is shown below:
     *
     *         void UMAAL(uint32_t *RdLo, uint32_t *RdHi, uint32_t Rn, uint32_t Rm)
     *         {
     *             uint64_t product = (uint64_t)*RdLo * (uint64_t)*RdHi + Rn + Rm;
     *             *RdLo = (uint32_t)(product & 0xFFFFFFFF);
     *             *RdHi = (uint32_t)(product >> 32);
     *         }
     *
     *     This instruction was designed for efficient long multiplication, and
     *     allows this to be calculated in only 4 instructions at speeds
     *     comparable to some 64-bit ALUs.
     *
     *  3. It isn't terrible on other platforms. Usually this will be a couple
     *     of 32-bit ADD/ADCs.
     */

 #if defined(_MSC_VER) && defined(_M_IX86)
 #    define mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
 #else
     /*
      * Downcast + upcast is usually better than masking on older compilers like
      * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers.
      *
      * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands
      * and perform a full 64x64 multiply -- entirely redundant on 32-bit.
      */
 #    define mult32to64(x, y) ((uint64_t)(uint32_t)(x) * (uint64_t)(uint32_t)(y))
 #endif

      /* First calculate all of the cross products. */
    uint64_t const lo_lo = mult32to64(x & 0xFFFFFFFF, y & 0xFFFFFFFF);
    uint64_t const hi_lo = mult32to64(x >> 32, y & 0xFFFFFFFF);
    uint64_t const lo_hi = mult32to64(x & 0xFFFFFFFF, y >> 32);
    uint64_t const hi_hi = mult32to64(x >> 32, y >> 32);
    /* Now add the products together. These will never overflow. */
    uint64_t const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
    uint64_t const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
    uint64_t const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
    *hi = upper;
    return lower;
 #undef mult32to64
 #endif
 }
	#include <stdint.h>
	#include <intrin.h>

	// based on XXH_mult64to128 from xxHash https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h#L4464
	uint64_t umul128(uint64_t x, uint64_t y, uint64_t* hi)
	{
	#if (defined(__GNUC__) \|\| defined(__clang__)) && !defined(__wasm__) \
	&& defined(__SIZEOF_INT128__) \
	\|\| (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
	// gcc/clang __uint128_t
	__uint128_t const product = (__uint128_t)x * (__uint128_t)y;
	*hi = (uint64_t)(product >> 64);
	return (uint64_t)(product);
	#elif defined(_M_X64) // msvc _umul128 method for x64
	#ifndef _MSC_VER
	# pragma intrinsic(_umul128)
	#endif
	return _umul128(x, y, hi);
	#elif defined(_M_ARM64) \|\| defined(_M_ARM64EC) // msvc __umulh method for arm64
	#ifndef _MSC_VER
	# pragma intrinsic(__umulh)
	#endif
	* hi = __umulh(x, y);
	return = x * y;
	#else
	/*
	* Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
	*
	* This is a fast and simple grade school multiply, which is shown below
	* with base 10 arithmetic instead of base 0x100000000.
	*
	* 9 3 // D2 lhs = 93
	* x 7 5 // D2 rhs = 75
	* ----------
	* 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15
	* 4 5 \| // D2 hi_lo = (93 / 10) * (75 % 10) = 45
	* 2 1 \| // D2 lo_hi = (93 % 10) * (75 / 10) = 21
	* + 6 3 \| \| // D2 hi_hi = (93 / 10) * (75 / 10) = 63
	* ---------
	* 2 7 \| // D2 cross = (15 / 10) + (45 % 10) + 21 = 27
	* + 6 7 \| \| // D2 upper = (27 / 10) + (45 / 10) + 63 = 67
	* ---------
	* 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975
	*
	* The reasons for adding the products like this are:
	* 1. It avoids manual carry tracking. Just like how
	* (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.
	* This avoids a lot of complexity.
	*
	* 2. It hints for, and on Clang, compiles to, the powerful UMAAL
	* instruction available in ARM's Digital Signal Processing extension
	* in 32-bit ARMv6 and later, which is shown below:
	*
	* void UMAAL(uint32_t RdLo, uint32_t RdHi, uint32_t Rn, uint32_t Rm)
	* {
	* uint64_t product = (uint64_t)RdLo (uint64_t)*RdHi + Rn + Rm;
	* *RdLo = (uint32_t)(product & 0xFFFFFFFF);
	* *RdHi = (uint32_t)(product >> 32);
	* }
	*
	* This instruction was designed for efficient long multiplication, and
	* allows this to be calculated in only 4 instructions at speeds
	* comparable to some 64-bit ALUs.
	*
	* 3. It isn't terrible on other platforms. Usually this will be a couple
	* of 32-bit ADD/ADCs.
	*/

	#if defined(_MSC_VER) && defined(_M_IX86)
	# define mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
	#else
	/*
	* Downcast + upcast is usually better than masking on older compilers like
	* GCC 4.2 (especially 32-bit ones), all without affecting newer compilers.
	*
	* The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands
	* and perform a full 64x64 multiply -- entirely redundant on 32-bit.
	*/
	# define mult32to64(x, y) ((uint64_t)(uint32_t)(x) * (uint64_t)(uint32_t)(y))
	#endif

	/* First calculate all of the cross products. */
	uint64_t const lo_lo = mult32to64(x & 0xFFFFFFFF, y & 0xFFFFFFFF);
	uint64_t const hi_lo = mult32to64(x >> 32, y & 0xFFFFFFFF);
	uint64_t const lo_hi = mult32to64(x & 0xFFFFFFFF, y >> 32);
	uint64_t const hi_hi = mult32to64(x >> 32, y >> 32);
	/* Now add the products together. These will never overflow. */
	uint64_t const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
	uint64_t const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
	uint64_t const lower = (cross << 32) \| (lo_lo & 0xFFFFFFFF);
	*hi = upper;
	return lower;
	#undef mult32to64
	#endif
	}