Created
September 7, 2021 08:15
-
-
Save dalibor-drgon/006d6907189501d0ee20c455c085fbb9 to your computer and use it in GitHub Desktop.
Reciprocal algorithms
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Copyright (c) 2015, Intel Corporation Redistribution and use in source and | |
binary forms, with or without modification, are permitted provided that the | |
following conditions are met: | |
* Redistributions of source code must retain the above copyright notice, this | |
list of conditions and the following disclaimer. | |
* Redistributions in binary form must reproduce the above copyright notice, | |
this list of conditions and the following disclaimer in the documentation | |
and/or other materials provided with the distribution. | |
* Neither the name of Intel Corporation nor the names of its contributors may | |
be used to endorse or promote products derived from this software without | |
specific prior written permission. | |
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE | |
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ | |
/* | |
**************************************************************************** | |
**************************************************************************** | |
** THIS FILE CONTAINS EMULATION ROUTINES FOR THE UNDERLYING ALGORITHMS OF:** | |
** VRCP14PD Compute Approximate Reciprocals of Packed Float64 Values ** | |
** with relative error of less than 2^(-14) ** | |
** VRCP14SD Compute Approximate Reciprocal of Scalar Float64 Value ** | |
** with relative error of less than 2^(-14) ** | |
** VRCP14PS Compute Approximate Reciprocals of Packed Float32 Values ** | |
** with relative error of less than 2^(-14) ** | |
** VRCP14SS Compute Approximate Reciprocal of Scalar Float32 Value ** | |
** with relative error of less than 2^(-14) ** | |
** VRSQRT14PD Compute Approximate Reciprocals of Square Roots ** | |
** of Packed Float64 Values with relative error of less than 2^(-14)** | |
** VRSQRT14SD Compute Approximate Reciprocal of Square Root ** | |
** of Scalar Float64 Value with relative error of less than 2^(-14) ** | |
** VRSQRT14PS Compute Approximate Reciprocals of Square Roots ** | |
** of Packed Float32 Values with relative error of less than 2^(-14)** | |
** VRSQRT14SS Compute Approximate Reciprocal of Square Root ** | |
** of Scalar Float32 Value with relative error of less than 2^(-14) ** | |
** ** | |
** THE CORRESPONDING EMULATION ROUTINES (ONLY SCALAR VERSIONS) ARE: ** | |
** RCP14S - 14-BIT RECIPROCAL APPROXIMATION FOR Float32 ** | |
** RCP14D - 14-BIT RECIPROCAL APPROXIMATION FOR Float64 ** | |
** RSQRT14S - 14-BIT RECIPROCAL SQUARE ROOT APPROXIMATION FOR Float32 ** | |
** RSQRT14D - 14-BIT RECIPROCAL SQUARE ROOT APPROXIMATION FOR Float64 ** | |
**************************************************************************** | |
**************************************************************************** | |
*/ | |
#include <math.h> | |
typedef union { | |
unsigned int u; | |
float f; | |
} type32; | |
typedef union { | |
unsigned long long u; | |
double f; | |
} type64; | |
#define FP32_SIGN_BIT 0x80000000 | |
#define FP32_EXP_MASK 0x7f800000 | |
#define FP32_SIGNIF_MASK 0x007fffff | |
#define FP32_HIDDEN_BIT 0x00800000 | |
#define FP32_QNAN_BIT 0x00400000 | |
#define FP32_DEFAULT_NAN_AS_UINT32 0xffc00000 | |
#define FP32_PLUS_ZERO_AS_UINT32 0x00000000 | |
#define FP32_PLUS_ONE_AS_UINT32 0x3f800000 | |
#define FP32_PLUS_TWO_AS_UINT32 0x40000000 | |
#define FP32_SMALL_DEN_THRESHOLD 0x00200000 | |
#define FP32_SIGNIF_SHIFT_THRESHOLD 0x00800001 | |
#define FP32_LOW_31_BITS 0x7fffffff | |
#define FP32_SIGN_AND_TOP_OF_SIGNIF 0x80700000 | |
#define FP32_SIGN_AND_SIGNIF 0x807fffff | |
#define FP32_CLEAR_LOW_8_BITS 0xffffff00 | |
#define FP32_CLEAR_LOW_7_BITS 0xffffff80 | |
#define FP32_BIT_15 0x00008000 | |
#define FP32_BIT_6 0x00000040 | |
#define FP64_SIGN_BIT 0x8000000000000000ull | |
#define FP64_EXP_MASK 0x7ff0000000000000ull | |
#define FP64_SIGNIF_MASK 0x000fffffffffffffull | |
#define FP64_HIDDEN_BIT 0x0010000000000000ull | |
#define FP64_QNAN_BIT 0x0008000000000000ull | |
#define FP64_DEFAULT_NAN_AS_UINT64 0xfff8000000000000ull | |
#define FP64_PLUS_ZERO_AS_UINT64 0x0000000000000000ull | |
#define FP64_PLUS_ONE_AS_UINT64 0x3ff0000000000000ull | |
#define FP64_PLUS_TWO_AS_UINT64 0x4000000000000000ull | |
#define FP64_SMALL_DEN_THRESHOLD 0x0004000000000000ull | |
#define FP64_SIGNIF_SHIFT_THRESHOLD 0x0010000000000001ull | |
#define FP64_EXPON_DELTA_18 0x0120000000000000ull | |
#define FP64_EXPON_DELTA_19 0x0130000000000000ull | |
#define FP64_LOW_63_BITS 0x7fffffffffffffffull | |
#define FP64_SIGN_AND_TOP_OF_SIGNIF 0x800e000000000000ull | |
#define FP64_SIGN_AND_SIGNIF 0x800fffffffffffffull | |
#define FP64_CLEAR_LOW_37_BITS 0xffffffe000000000ull | |
#define FP64_CLEAR_LOW_36_BITS 0xfffffff000000000ull | |
/* | |
**************************************************************************** | |
**************************************************************************** | |
********************** EMULATION ROUTINES FOR RCP14 ************************ | |
**************************************************************************** | |
**************************************************************************** | |
*/ | |
// These reference functions have to be compiled in Linux with DAZ and FTZ | |
// off (e.g. with the Intel compiler: icc -c -no-ftz RECIP14.c). They have to | |
// be run with the rounding mode set to round-to-nearest, and with | |
// floating-point exceptions masked. | |
/* EXAMPLE: icc -no-ftz main.c RECIP14.c, where main.c is shown below | |
#include <stdio.h> | |
typedef union { | |
unsigned int u; | |
float f; | |
} type32; | |
typedef union { | |
unsigned long long u; | |
double f; | |
} type64; | |
extern void RCP14S (unsigned int mxcsr, type32 *dst, type32 src); | |
extern void RCP14D (unsigned int mxcsr, type64 *dst, type64 src); | |
main () { | |
type32 dst32, src32; | |
type64 dst64, src64; | |
unsigned int mxcsr = 0x00000000; | |
printf ("MXCSR = %8.8x\n", mxcsr); | |
src32.f = 3.0; | |
RCP14S (mxcsr, &dst32, src32); | |
printf ("RCP14S(%f = %8.8x HEX) = (%f = %8.8x HEX)\n", src32.f, src32.u, | |
dst32.f, dst32.u); | |
src64.f = 3.0; | |
RCP14D (mxcsr, &dst64, src64); | |
printf ("RCP14D(%f = %16.16llx HEX) = (%f = %16.16llx HEX)\n", src64.f, | |
src64.u, dst64.f, dst64.u); | |
return (0); | |
} | |
*/ | |
// Note on the RCP14 algorithm, which calculates | |
// rcp14(x)=reciprocal approximation of x: | |
// if (x is in (1.0, 2.0)) | |
// i = int_floor((x-1)*64) | |
// y[i] = 1 + 1/128 + i/64 | |
// rcp14(x) = truncate_to_17_ms_bits(a[i]-b[i]*(x-y[i])) | |
// a[i] = RCP14_Coeff[2*i+1] 18 bits; b[i] = RCP14_Coeff[2*i] 10 bits (at most) | |
unsigned int RCP14_Coeff[128] = { | |
1009, 260119, 977, 256148, 949, 252296, 921, 248558, | |
893, 244929, 869, 241405, 843, 237981, 821, 234652, | |
797, 231416, 777, 228266, 755, 225202, 735, 222220, | |
717, 219314, 699, 216485, 681, 213727, 663, 211038, | |
647, 208417, 631, 205859, 617, 203364, 601, 200929, | |
587, 198551, 573, 196229, 561, 193960, 547, 191743, | |
535, 189576, 523, 187458, 513, 185387, 501, 183360, | |
491, 181377, 479, 179439, 469, 177540, 459, 175681, | |
451, 173860, 441, 172077, 433, 170330, 423, 168618, | |
415, 166940, 407, 165295, 399, 163682, 391, 162101, | |
385, 160550, 377, 159027, 369, 157535, 363, 156069, | |
357, 154631, 349, 153219, 343, 151832, 337, 150470, | |
331, 149133, 325, 147819, 319, 146528, 315, 145260, | |
309, 144012, 303, 142787, 299, 141582, 293, 140397, | |
289, 139232, 285, 138085, 279, 136959, 275, 135853, | |
271, 134763, 267, 133689, 263, 132631, 259, 131589 | |
}; | |
void | |
RCP14S (unsigned int mxcsr, type32 *dst, type32 arg) { | |
unsigned int i, a, b; // i, a[i], b[i] - used when 1 < x < 2 | |
float x, y, xmy, rcp14; // x (input), y, x - y, rcp14 (result) | |
double da, db, dxmy, drcp14, dpr; // double versions, for exact computations | |
unsigned long long ui64; // used for bit manipulation of double values | |
unsigned int ui32;// used for bit manipulation of float values | |
unsigned int bexp; // biased exponent | |
unsigned int sign; // sign bit | |
unsigned int signif; // significand | |
type32 dst1, arg1; // destination and argument for recuresive calls | |
unsigned int ftz = mxcsr & FP32_BIT_15; // bit 15 of MXCSR | |
unsigned int daz = mxcsr & FP32_BIT_6; // bit 6 of MXCSR | |
// the following uses the IEEE 754-2008 encoding for 32-bit and 64-bit binary | |
// floating-point values | |
x = fabs (arg.f); // absolute value of the argument | |
if (isnan (x)) { // if the argument is NaN | |
dst->u = arg.u | FP32_QNAN_BIT; // rcp14 = quietized input NaN | |
} else if (isinf (x)) { // if the argument is infinity | |
dst->u = arg.u & FP32_SIGN_BIT; // rcp14 = zero of the same sign | |
} else if ((arg.u & FP32_LOW_31_BITS) == 0x0) { // if the argument is zero | |
dst->u = (arg.u & FP32_SIGN_BIT) | FP32_EXP_MASK; | |
// rcp14 = infinity of same sign | |
} else if (!isnormal (x) && daz) { | |
dst->u = (arg.u & FP32_SIGN_BIT) | FP32_EXP_MASK; | |
// rcp14 = infinity of same sign | |
} else if (!isnormal (x) && (arg.u & FP32_LOW_31_BITS) <= FP32_SMALL_DEN_THRESHOLD) { | |
// if the argument is a 'small' denormal | |
dst->u = (arg.u & FP32_SIGN_BIT) | FP32_EXP_MASK; | |
// rcp14 = infinity of same sign | |
} else if (!isnormal (x)) { // 'large' denormal and at least 0x00200001 in | |
// magnitude; rcp14 does not overflow | |
bexp = 0xfc; // start with 125 as the biased exponent for the reciprocal | |
// (one below 126) | |
sign = arg.u & FP32_SIGN_BIT; | |
arg1.u = arg.u & FP32_SIGNIF_MASK; // remove sign and exponent, to prepare | |
// for normalization | |
while ((arg1.u & FP32_HIDDEN_BIT) == 0x0) { | |
arg1.u = arg1.u << 1; | |
bexp++; | |
} | |
arg1.u = (arg1.u & FP32_SIGNIF_MASK) | FP32_PLUS_ONE_AS_UINT32; | |
// bring argument between | |
// 1.0 and 2.0 | |
RCP14S (mxcsr, &dst1, arg1); | |
if (dst1.u == FP32_PLUS_ONE_AS_UINT32) bexp++; | |
// to account for the fact that the reciprocal is not < 1.0 in this case | |
dst->u = sign | (bexp << 23) | (dst1.u & FP32_SIGNIF_MASK); // rcp14 | |
} else if ((arg.u & FP32_SIGNIF_MASK) == 0x0) { | |
// if it is an exact power of 2 | |
bexp = (arg.u & FP32_EXP_MASK) >> 23; | |
if (bexp == 0xfe) { // denormal reciprocal | |
dst->u = (arg.u & FP32_SIGN_BIT) | FP32_QNAN_BIT; // rcp14 | |
if (ftz) dst->u = dst->u & FP32_SIGN_BIT; // zero of the same sign | |
} else { // rcp14 is not a denormal | |
bexp = (0x7f - (bexp - 0x7f)) << 23; | |
dst->u = (arg.u & FP32_SIGN_AND_TOP_OF_SIGNIF) | bexp; // rcp14 | |
} | |
} else if ((arg.u & FP32_EXP_MASK) == FP32_PLUS_ONE_AS_UINT32) { | |
// if 1 < |x| < 2) | |
sign = arg.u & FP32_SIGN_BIT; | |
// a[i] = RCP14_Coeff[2*i+1]; b[i] = RCP14_Coeff[2*i] | |
i = floor ((x - 1.0) * 64); // exact | |
y = 1.0 + 1.0/128 + i/64.0; // 8 bits (exact) | |
ui32 = *((unsigned int *)&x); // x viewed as an unsigned int | |
ui32 = ui32 & FP32_CLEAR_LOW_7_BITS; // remove the 7 least significant bits | |
x = *((float *)&ui32); // x with significand truncated to 17 bits | |
xmy = x - y; // fits in 23 bits; exact | |
a = RCP14_Coeff[2*i+1]; // b[i], up to 18 bits | |
b = RCP14_Coeff[2*i]; // a[i], up to 10 bits | |
da = (double)a; // double precision versions of a[i] and b[i], for exact | |
// computation | |
db = 256 * (double)b; // multiply by 2^8 to make up for the difference | |
// between 18 bits in a and 10 bits in b | |
dxmy = (double)xmy; // x - y as a double | |
dpr = (db * dxmy); // product b[i] * (x - y), exact in double precision | |
drcp14 = da - dpr; // rcp14 = a[i] - b[i] * (x - y), exact in | |
// double precision | |
ui64 = *((unsigned long long *)&drcp14); // rcp14, viewed as a | |
// 64-bit integer | |
ui64 = ui64 & FP64_CLEAR_LOW_36_BITS; // truncate to 28 bits = | |
// 1-bit sign + 11-bit exp + 16-bit fraction | |
ui64 = ui64 - FP64_EXPON_DELTA_18; // divide by 2^18 | |
drcp14 = *((double *)&ui64); // rcp14 in double precision format | |
rcp14 = (float)drcp14; // rcp14 is single precision; exact conversion | |
dst->f = rcp14; | |
dst->u = sign | dst->u; // rcp14 | |
} else { // normal not between 1.0 and 2.0 in absolute value, | |
// and not a power of 2 | |
sign = arg.u & FP32_SIGN_BIT; | |
bexp = (arg.u & FP32_EXP_MASK) >> 23; | |
signif = FP32_HIDDEN_BIT | (arg.u & FP32_SIGNIF_MASK); | |
if (bexp == 0xfe) { // denormal reciprocal, and the significand | |
// must be shifted by 2 | |
arg1.u = (arg.u & FP32_SIGN_AND_SIGNIF) | FP32_PLUS_ONE_AS_UINT32; | |
// bring argument between 1.0 and 2.0 | |
RCP14S (mxcsr, &dst1, arg1); | |
signif = FP32_HIDDEN_BIT | (dst1.u & FP32_SIGNIF_MASK); | |
dst->u = (dst1.u & FP32_SIGN_BIT) | (signif >> 2); // rcp14; the new | |
// biased exponent is 0x0 | |
} else if (bexp == 0xfd && signif >= FP32_SIGNIF_SHIFT_THRESHOLD) { | |
// denormal reciprocal, | |
// and the significand must be shifted by 1 | |
arg1.u = (arg.u & FP32_SIGN_AND_SIGNIF) | FP32_PLUS_ONE_AS_UINT32; | |
// bring argument between 1.0 and 2.0 | |
RCP14S (mxcsr, &dst1, arg1); | |
signif = FP32_HIDDEN_BIT | (dst1.u & FP32_SIGNIF_MASK); | |
dst->u = (dst1.u & FP32_SIGN_BIT) | (signif >> 1); // rcp14; the new | |
// biased exponent is 0x0 | |
} else { | |
bexp = 0xfe - bexp; | |
arg1.u = (arg.u & FP32_SIGN_AND_SIGNIF) | FP32_PLUS_ONE_AS_UINT32; | |
// bring argument between 1.0 and 2.0 | |
RCP14S (mxcsr, &dst1, arg1); | |
dst->u = (dst1.u & FP32_SIGN_AND_SIGNIF) | ((bexp - 1) << 23); // rcp14 | |
} | |
if (!isnormal (dst->f) && ftz) { | |
dst->u = dst->u & FP32_SIGN_BIT; // zero of the same sign | |
} | |
} | |
} | |
void | |
RCP14D (unsigned int mxcsr, type64 *dst, type64 arg) { | |
unsigned int i, a, b; // i, a[i], b[i] - used when 1 < x < 2 | |
double x, y, xmy, rcp14; // x (input), y, x - y, rcp14 (result) | |
double da, db, dpr; // double versions for exact computations | |
unsigned long long ui64; // used for bit manipulation of double values | |
unsigned long long int bexp; // biased exponent | |
unsigned long long int sign; // sign bit | |
unsigned int long long signif; // significand | |
type64 dst1, arg1; // destination and argument for recuresive calls | |
unsigned int ftz = mxcsr & FP32_BIT_15; // bit 15 of MXCSR | |
unsigned int daz = mxcsr & FP32_BIT_6; // bit 6 of MXCSR | |
// the following uses the IEEE 754-2008 encoding for 32-bit and 64-bit | |
// binary floating-point values | |
x = fabs (arg.f); // absolute value of the argument | |
if (isnan (x)) { // if the argument is NaN | |
dst->u = arg.u | FP64_QNAN_BIT; // rcp14 = quietized input NaN | |
} else if (isinf (x)) { // if the argument is infinity | |
dst->u = arg.u & FP64_SIGN_BIT; // rcp14 = zero of the same sign | |
} else if ((arg.u & FP64_LOW_63_BITS) == FP64_PLUS_ZERO_AS_UINT64) { | |
// if the arg. is zero | |
dst->u = (arg.u & FP64_SIGN_BIT) | FP64_EXP_MASK; // rcp14 | |
// = infinity of the same sign | |
} else if (!isnormal (x) && daz) { | |
dst->u = (arg.u & FP64_SIGN_BIT) | FP64_EXP_MASK; | |
// rcp14 = infinity of same sign | |
} else if (!isnormal (x) && (arg.u & FP64_LOW_63_BITS) <= | |
FP64_SMALL_DEN_THRESHOLD) { // if the argument is a 'small' denormal | |
dst->u = (arg.u & FP64_SIGN_BIT) | FP64_EXP_MASK; | |
// rcp14 = infinity of the same sign | |
} else if (!isnormal (x)) { // 'large' denormal and at least | |
// 0x0004000000000001ull in magnitude; rcp14 does not overflow | |
bexp = 0x7fc; // start with 1021 as the biased exponent for the reciprocal | |
// (one below 1022) | |
sign = arg.u & FP64_SIGN_BIT; | |
arg1.u = arg.u & FP64_SIGNIF_MASK; // remove sign and exponent, to | |
// prepare for normalization | |
while ((arg1.u & FP64_HIDDEN_BIT) == 0x0) { | |
arg1.u = arg1.u << 1; | |
bexp++; | |
} | |
arg1.u = (arg1.u & FP64_SIGNIF_MASK) | FP64_PLUS_ONE_AS_UINT64; | |
// bring argument between 1.0 and 2.0 | |
RCP14D (mxcsr, &dst1, arg1); | |
if (dst1.u == FP64_PLUS_ONE_AS_UINT64) bexp++; // to account for the fact | |
// that the reciprocal is not < 1.0 in this case | |
dst->u = sign | (bexp << 52) | (dst1.u & FP64_SIGNIF_MASK); // rcp14 | |
} else if ((arg.u & FP64_SIGNIF_MASK) == 0x0) { // if it is | |
// an exact power of 2 | |
bexp = (arg.u & FP64_EXP_MASK) >> 52; | |
if (bexp == 0x7fe) { // denormal reciprocal | |
dst->u = (arg.u & FP64_SIGN_BIT) | FP64_QNAN_BIT; // rcp14 | |
if (ftz) dst->u = dst->u & FP32_SIGN_BIT; // zero of the same sign | |
} else { // rcp14 is not a denormal | |
bexp = (0x3ff - (bexp - 0x3ff)) << 52; | |
dst->u = (arg.u & FP64_SIGN_AND_TOP_OF_SIGNIF) | bexp; // rcp14 | |
} | |
} else if ((arg.u & FP64_EXP_MASK) == FP64_PLUS_ONE_AS_UINT64) { | |
// // if 1 < |x| < 2) | |
sign = arg.u & FP64_SIGN_BIT; | |
// a[i] = RCP14_Coeff[2*i+1]; b[i] = RCP14_Coeff[2*i] | |
i = floor ((x - 1.0) * 64); // exact | |
y = 1.0 + 1.0/128 + i/64.0; // 8 bits (exact) | |
ui64 = *((unsigned long long int *)&x); // x viewed as an unsigned int | |
ui64 = ui64 & FP64_CLEAR_LOW_36_BITS; | |
// remove the 36 least significant bits | |
x = *((double *)&ui64); // x with significand truncated to 17 bits | |
xmy = x - y; // fits in 23 bits; exact | |
a = RCP14_Coeff[2*i+1]; // b[i], up to 18 bits | |
b = RCP14_Coeff[2*i]; // a[i], up to 10 bits | |
da = (double)a; // double precision versions of a[i] and b[i], for exact | |
// computation | |
db = 256 * (double)b; // multiply by 2^8 to make up for the difference | |
// between 18 bits in a and 10 bits in b | |
dpr = (db * xmy); // product b[i] * (x - y), exact in double precision | |
rcp14 = da - dpr; | |
// rcp14 = a[i] - b[i] * (x - y), exact in double precision | |
ui64 = *((unsigned long long *)&rcp14); // rcp14, viewed as a 64-bit integer | |
ui64 = ui64 & FP64_CLEAR_LOW_36_BITS; // truncate to 28 bits = | |
// 1-bit sign + 11-bit exp + 16-bit fraction | |
ui64 = ui64 - FP64_EXPON_DELTA_18; // divide by 2^18 | |
rcp14 = *((double *)&ui64); // rcp14 in double precision format | |
dst->f = rcp14; | |
dst->u = sign | dst->u; // rcp14 | |
} else { | |
// normal not between 1.0 and 2.0 in absolute value, and not a power of 2 | |
sign = arg.u & FP64_SIGN_BIT; | |
bexp = (arg.u & FP64_EXP_MASK) >> 52; | |
signif = FP64_HIDDEN_BIT | (arg.u & FP64_SIGNIF_MASK); | |
if (bexp == 0x7fe) { // denormal reciprocal, and the significand must be | |
//shifted by 2 | |
arg1.u = (arg.u & FP64_SIGN_AND_SIGNIF) | FP64_PLUS_ONE_AS_UINT64; | |
// bring argument between 1.0 and 2.0 | |
RCP14D (mxcsr, &dst1, arg1); | |
signif = FP64_HIDDEN_BIT | (dst1.u & FP64_SIGNIF_MASK); | |
dst->u = (dst1.u & FP64_SIGN_BIT) | (signif >> 2); | |
// rcp14; the new biased exponent is 0x0 | |
} else if (bexp == 0x7fd && signif >= FP64_SIGNIF_SHIFT_THRESHOLD) { | |
// denormal reciprocal, and the significand must be shifted by 1 | |
arg1.u = (arg.u & FP64_SIGN_AND_SIGNIF) | FP64_PLUS_ONE_AS_UINT64; | |
// bring argument between 1.0 and 2.0 | |
RCP14D (mxcsr, &dst1, arg1); | |
signif = FP64_HIDDEN_BIT | (dst1.u & FP64_SIGNIF_MASK); | |
dst->u = (dst1.u & FP64_SIGN_BIT) | (signif >> 1); | |
// rcp14; the new biased exponent is 0x0 | |
} else { | |
bexp = 0x7fe - bexp; | |
arg1.u = (arg.u & FP64_SIGN_AND_SIGNIF) | FP64_PLUS_ONE_AS_UINT64; | |
// bring argument between 1.0 and 2.0 | |
RCP14D (mxcsr, &dst1, arg1); | |
dst->u = (dst1.u & FP64_SIGN_AND_SIGNIF) | ((bexp - 1) << 52); // rcp14 | |
} | |
if (!isnormal (dst->f) && ftz) { | |
dst->u = dst->u & FP64_SIGN_BIT; // zero of the same sign | |
} | |
} | |
} | |
/* | |
**************************************************************************** | |
**************************************************************************** | |
********************** EMULATION ROUTINES FOR RSQRT14 ********************** | |
**************************************************************************** | |
**************************************************************************** | |
*/ | |
// These reference functions have to be compiled in Linux with DAZ and FTZ off | |
// (e.g. with the Intel compiler: icc -c -no-ftz RECIP14.c). They have to be run | |
// with the rounding mode set to round-to-nearest, and with floating-point | |
// exceptions masked. | |
/* EXAMPLE: icc -no-ftz main.c RECIP14.c, where main.c is shown below | |
#include <stdio.h> | |
typedef union { | |
unsigned int u; | |
float f; | |
} type32; | |
typedef union { | |
unsigned long long u; | |
double f; | |
} type64; | |
extern void RSQRT14S (unsigned int mxcsr, type32 *dst, type32 src); | |
extern void RSQRT14D (unsigned int mxcsr, type64 *dst, type64 src); | |
main () { | |
type32 dst32, src32; | |
type64 dst64, src64; | |
unsigned int mxcsr = 0x00000000; | |
printf ("MXCSR = %8.8x\n", mxcsr); | |
src32.f = 2.0; | |
RSQRT14S (mxcsr, &dst32, src32); | |
printf ("RSQRT14S(%f = %8.8x HEX) = (%f = %8.8x HEX)\n", src32.f, src32.u, | |
dst32.f, dst32.u); | |
src64.f = 2.0; | |
RSQRT14D (mxcsr, &dst64, src64); | |
printf ("RSQRT14D(%f = %16.16llx HEX) = (%f = %16.16llx HEX)\n", src64.f, | |
src64.u, dst64.f, dst64.u); | |
return (0); | |
} | |
*/ | |
// Note on the RSQRT14 algorithm, which calculates | |
// rsqrt14(x) = reciprocal approximation of x: | |
// if (x is in [1.0, 2.0)) | |
// i = int_floor((x-1)*32) | |
// y[i] = 1 + 1/64 + i/32 | |
// rsqrt14(x) = truncate_to_17_ms_bits(c[i]-d[i]*(x-y[i])) | |
// else if (x is in [2.0, 4.0)) | |
// i = int_floor((x-2)*16) | |
// y[i] = 2 + 1/32 + i/16 | |
// rsqrt14(x) = truncate_to_17_ms_bits(c[i]-d[i]*(x-y[i])) | |
// c[i] = RSQRT14_Coeff[2*i+1] 19 bits; d[i] = RSQRT14_Coeff[2*i] 10 bits; | |
// coeff*1024 freeTerm*524288 | |
unsigned int RSQRT14_Coeff[128] = { | |
1001, 520261, 707, 367881, 955, 512437, 675, 362349, | |
915, 504953, 647, 357056, 877, 497790, 619, 351992, | |
841, 490922, 595, 347136, 807, 484331, 571, 342475, | |
775, 478001, 549, 337997, 747, 471909, 527, 333693, | |
719, 466046, 509, 329545, 693, 460397, 491, 325551, | |
669, 454947, 473, 321697, 647, 449688, 457, 317977, | |
625, 444606, 441, 314385, 603, 439694, 427, 310910, | |
585, 434939, 413, 307549, 567, 430335, 401, 304295, | |
549, 425875, 389, 301139, 533, 421551, 377, 298079, | |
517, 417355, 365, 295115, 501, 413284, 355, 292237, | |
487, 409329, 345, 289439, 473, 405487, 335, 286722, | |
461, 401748, 325, 284080, 449, 398111, 317, 281508, | |
437, 394571, 309, 279006, 425, 391127, 301, 276569, | |
415, 387770, 293, 274195, 403, 384498, 285, 271882, | |
393, 381307, 279, 269625, 385, 378194, 271, 267425, | |
375, 375155, 265, 265276, 367, 372190, 259, 263178 | |
}; | |
void | |
RSQRT14S (unsigned int mxcsr, type32 *dst, type32 arg) { | |
unsigned int i, c, d; // i, c[i], d[i] - used when 1 <= x < 4 | |
float x, y, xmy, rsqrt14; // x (input), y, x - y, recp14 (result) | |
double dc, dd, dxmy, drsqrt14, dpr; // double versions for exact computations | |
unsigned long long ui64; // used for bit manipulation of double values | |
unsigned int ui32;// used for bit manipulation of float values | |
int expon; // unbiased exponent | |
int n; // scaling factor | |
unsigned int signif; // significand | |
unsigned int daz = mxcsr & FP32_BIT_6; // bit 6 of MXCSR | |
unsigned int sign = arg.u & FP32_SIGN_BIT; // the sign bit, also 0.0 with the sign of arg.f | |
// apply daz if necessary | |
if (daz && ((arg.u & FP32_LOW_31_BITS) < FP32_HIDDEN_BIT)) { | |
arg.u = sign; // zero with sign of arg | |
} | |
// the following makes use of the IEEE 754-2008 encoding for 32-bit and 64-bit | |
// binary floating-point values | |
x = arg.f; | |
if (isnan (x)) { | |
dst->u = arg.u | FP32_QNAN_BIT; // rsqrt14 = quietized NaN | |
} else if (x < 0.0) { | |
dst->u = FP32_DEFAULT_NAN_AS_UINT32; // rsqrt14 = QNaN Indefinite | |
} else if (isinf (x)) { | |
dst->u = 0; // rsqrt14(+inf) = +zero | |
} else if ((arg.u & FP32_LOW_31_BITS) == 0x0) { // zero | |
dst->u = sign | FP32_EXP_MASK; | |
// rsqrt14 = infinity of same sign | |
} else if (arg.u == FP32_PLUS_ONE_AS_UINT32) { // if x is 1.0 | |
dst->u = arg.u; // rsqrt14 = 1.0 | |
} else if ((arg.u & FP32_EXP_MASK) == FP32_PLUS_ONE_AS_UINT32) { | |
// if 1 < |x| < 2) c[i] = RSQRT14_Coeff[4*i+1]; d[i] = RSQRT14_Coeff[4*i] | |
i = floor ((x - 1.0) * 32); // exact; 32 different values | |
y = 1.0 + 1.0/64 + i/32.0; // 8 bits (exact) | |
ui32 = *((unsigned int *)&x); // x viewed as an unsigned int | |
ui32 = ui32 & FP32_CLEAR_LOW_8_BITS; // remove the 8 least significant bits | |
x = *((float *)&ui32); // x with significand truncated to 17 bits | |
xmy = x - y; // fits in 23 bits; exact | |
c = RSQRT14_Coeff[4*i+1]; // c[i], up to 18 bits | |
d = RSQRT14_Coeff[4*i]; // d[i], up to 10 bits | |
dc = (double)c; // double precision versions of c[i] and d[i], | |
// for exact computation | |
dd = 256 * (double)d; // multiply by 2^8 to make up for the difference | |
// between 18 bits in a and 10 bits in b | |
dxmy = (double)xmy; // x - y as a double | |
dpr = (dd * dxmy); // product d[i] * (x - y), exact in double precision | |
drsqrt14 = dc - dpr; // rsqrt14 = c[i] - d[i] * (x - y), exact in | |
// double precision | |
ui64 = *((unsigned long long *)&drsqrt14); // rsqrt14, viewed as | |
// a 64-bit integer | |
ui64 = ui64 & FP64_CLEAR_LOW_36_BITS; // truncate to 28 bits = | |
// 1-bit sign + 11-bit exp + 16-bit fraction | |
ui64 = ui64 - FP64_EXPON_DELTA_19; // divide by 2^19 | |
drsqrt14 = *((double *)&ui64); // rsqrt14 in double precision format | |
rsqrt14 = (float)drsqrt14; // rsqrt14 is single precision; exact conversion | |
dst->f = rsqrt14; | |
} else if ((arg.u & FP32_EXP_MASK) == FP32_PLUS_TWO_AS_UINT32) { | |
// if 2 <= |x| < 4) c[i]=RSQRT14_Coeff[4*i+3]; d[i]=RSQRT14_Coeff[4*i+2] | |
i = floor ((x - 2.0) * 16); // exact; 32 different values | |
y = 2.0 + 1.0/32 + i/16.0; // 8 bits (exact) | |
ui32 = *((unsigned int *)&x); // x viewed as an unsigned int | |
ui32 = ui32 & FP32_CLEAR_LOW_8_BITS; // remove the 8 least significant bits | |
x = *((float *)&ui32); // x with significand truncated to 17 bits | |
xmy = x - y; // fits in 23 bits; exact | |
c = RSQRT14_Coeff[4*i+3]; // c[i], up to 18 bits | |
d = RSQRT14_Coeff[4*i+2]; // d[i], up to 10 bits | |
dc = (double)c; // double precision versions of c[i] and d[i], | |
// for exact computation | |
dd = 128 * (double)d; // multiply by 2^7 to make up for the difference | |
// between 18 bits in a and 11 bits in b | |
dxmy = (double)xmy; // x - y as a double | |
dpr = (dd * dxmy); // product d[i] * (x - y), exact in double precision | |
drsqrt14 = dc - dpr; // rsqrt14 = c[i] - d[i] * (x - y), | |
// exact in double precision | |
ui64 = *((unsigned long long *)&drsqrt14); // rsqrt14, viewed as | |
// a 64-bit integer | |
ui64 = ui64 & FP64_CLEAR_LOW_36_BITS; // truncate to 28 bits | |
// = 1-bit sign + 11-bit exp + 16-bit fraction | |
ui64 = ui64 - FP64_EXPON_DELTA_19; // divide by 2^19 | |
drsqrt14 = *((double *)&ui64); // rsqrt14 in double precision format | |
rsqrt14 = (float)drsqrt14; // rsqrt14 is single precision; exact conversion | |
dst->f = rsqrt14; | |
} else if ((arg.u & FP32_EXP_MASK) == FP32_PLUS_ZERO_AS_UINT32) { | |
// denormal; find n such that 1<=2^(2n)*x<4 | |
expon = -126; | |
signif = arg.u & FP32_SIGNIF_MASK; | |
while ((signif & FP32_HIDDEN_BIT) == 0x0) { // normalize significand and | |
// adjust exponent | |
signif = signif << 1; | |
expon--; | |
} | |
if (expon & 0x01) { // odd exponent; scaled x will be between 2.0 and 4.0 | |
n = (expon - 1) / 2; // odd expon in -149, -127 | |
arg.u = FP32_PLUS_TWO_AS_UINT32 | (signif & FP32_SIGNIF_MASK); | |
// scaled x between 2.0 and 4.0 | |
RSQRT14S (mxcsr, dst, arg); | |
dst->u = dst->u - (n << 23); // rsqrt14 | |
} else { // even exponent, scaled x between 1.0 and 2.0 | |
n = expon / 2; // even expon in -149, -128 | |
arg.u = FP32_PLUS_ONE_AS_UINT32 | (signif & FP32_SIGNIF_MASK); | |
// scaled x between 1.0 and 2.0 | |
RSQRT14S (mxcsr, dst, arg); | |
dst->u = dst->u - (n << 23); // rsqrt14 | |
} | |
} else { // normal not between 1.0 and 4.0; find n such that 1<=2^(2n)*x<4 | |
expon = ((arg.u & FP32_EXP_MASK) >> 23) - 0x7f; | |
signif = (arg.u & FP32_SIGNIF_MASK) | FP32_HIDDEN_BIT; | |
if (expon & 0x01) { // odd exponent, scaled x between 2.0 and 4.0 | |
n = (expon - 1) / 2; // odd expon in -126, 127 | |
arg.u = FP32_PLUS_TWO_AS_UINT32 | (signif & FP32_SIGNIF_MASK); | |
// scaled x between 2.0 and 4.0 | |
RSQRT14S (mxcsr, dst, arg); | |
dst->u = dst->u - (n << 23); // rsqrt14 | |
} else { // even exponent, scaled x between 1.0 and 2.0 | |
n = expon / 2; // even expon in -126, 127 | |
arg.u = FP32_PLUS_ONE_AS_UINT32 | (signif & FP32_SIGNIF_MASK); | |
// scaled x between 1.0 and 2.0 | |
RSQRT14S (mxcsr, dst, arg); | |
dst->u = dst->u - (n << 23); // rsqrt14 | |
} | |
} | |
} | |
void | |
RSQRT14D (unsigned int mxcsr, type64 *dst, type64 arg) { | |
unsigned int i, c, d; // i, c[i], d[i] - used when 1 <= x < 4 | |
double x, y, xmy, rsqrt14; // x (input), y, x - y, recp14 (result) | |
double dc, dd, dpr; // double versions for exact computations | |
unsigned long long ui64; // used for bit manipulation of double values | |
long long int expon; // unbiased exponent | |
long long int n; // scaling factor | |
unsigned long long int signif; // significand | |
unsigned int daz = mxcsr & FP32_BIT_6; // bit 6 of MXCSR | |
unsigned long long sign = arg.u & FP64_SIGN_BIT; // the sign bit | |
// apply daz if necessary | |
if (daz && ((arg.u & FP64_LOW_63_BITS) < FP64_HIDDEN_BIT)) { | |
arg.u = sign; // zero with sign of arg | |
} | |
// the following makes use of the IEEE 754-2008 encoding for 32-bit and 64-bit | |
// binary floating-point values | |
x = arg.f; | |
if (isnan (x)) { | |
dst->u = arg.u | FP64_QNAN_BIT; // rsqrt14 = quietized NaN | |
} else if (x < 0.0) { | |
dst->u = FP64_DEFAULT_NAN_AS_UINT64; // rsqrt14 = QNaN Indefinite | |
} else if (isinf (x)) { | |
dst->u = 0; // rsqrt14(+inf) = +zero | |
} else if ((arg.u & FP64_LOW_63_BITS) == 0x0) { // zero | |
dst->u = sign | FP64_EXP_MASK; | |
// rsqrt14 = infinity of same sign | |
} else if (arg.u == FP64_PLUS_ONE_AS_UINT64) { // if x is 1.0 | |
dst->u = arg.u; // rsqrt14 = 1.0 | |
} else if ((arg.u & FP64_EXP_MASK) == FP64_PLUS_ONE_AS_UINT64) { | |
// if 1 < |x| < 2) | |
// c[i] = RSQRT14_Coeff[4*i+1]; d[i] = RSQRT14_Coeff[4*i] | |
i = floor ((x - 1.0) * 32); // exact; 32 different values | |
y = 1.0 + 1.0/64 + i/32.0; // 8 bits (exact) | |
ui64 = *((unsigned long long int *)&x); // x viewed as an unsigned int | |
ui64 = ui64 & FP64_CLEAR_LOW_37_BITS ; | |
// remove the 37 least significant bits | |
x = *((double *)&ui64); // x with significand truncated to 17 bits | |
xmy = x - y; // fits in 23 bits; exact | |
c = RSQRT14_Coeff[4*i+1]; // c[i], up to 18 bits | |
d = RSQRT14_Coeff[4*i]; // d[i], up to 10 bits | |
dc = (double)c; // double precision versions of c[i] and d[i], | |
// for exact computation | |
dd = 256 * (double)d; // multiply by 2^8 to make up for the difference | |
// between 18 bits in a and 10 bits in b | |
dpr = (dd * xmy); // product d[i] * (x - y), exact in double precision | |
rsqrt14 = dc - dpr; // rsqrt14 = c[i] - d[i] * (x - y), exact in | |
// double precision | |
ui64 = *((unsigned long long int *)&rsqrt14); // rsqrt14, viewed as | |
// a 64-bit integer | |
ui64 = ui64 & FP64_CLEAR_LOW_36_BITS; // truncate to 28 bits = | |
// 1-bit sign + 11-bit exp + 16-bit fraction | |
ui64 = ui64 - FP64_EXPON_DELTA_19; // divide by 2^19 | |
rsqrt14 = *((double *)&ui64); // rsqrt14 in double precision format | |
dst->f = rsqrt14; | |
} else if ((arg.u & FP64_EXP_MASK) == FP64_PLUS_TWO_AS_UINT64) { | |
// if 2 <= |x| < 4) | |
// c[i] = RSQRT14_Coeff[4*i+3]; d[i] = RSQRT14_Coeff[4*i+2] | |
i = floor ((x - 2.0) * 16); // exact; 32 different values | |
y = 2.0 + 1.0/32 + i/16.0; // 8 bits (exact) | |
ui64 = *((unsigned long long int *)&x); // x viewed as an unsigned int | |
ui64 = ui64 & FP64_CLEAR_LOW_37_BITS ; | |
// remove the 8 least significant bits | |
x = *((double *)&ui64); // x with significand truncated to 17 bits | |
xmy = x - y; // fits in 23 bits; exact | |
c = RSQRT14_Coeff[4*i+3]; // c[i], up to 18 bits | |
d = RSQRT14_Coeff[4*i+2]; // d[i], up to 11 bits | |
dc = (double)c; // double precision versions of c[i] and d[i], | |
// for exact computation | |
dd = 128 * (double)d; // multiply by 2^7 to make up for the difference | |
// between 18 bits in a and 11 bits in b | |
dpr = (dd * xmy); // product d[i] * (x - y), exact in double precision | |
rsqrt14 = dc - dpr; // rsqrt14 = c[i] - d[i] * (x - y), | |
// exact in double precision | |
ui64 = *((unsigned long long *)&rsqrt14); // rsqrt14, viewed as | |
// a 64-bit integer | |
ui64 = ui64 & FP64_CLEAR_LOW_36_BITS; // truncate to 28 bits = | |
// 1-bit sign + 11-bit exp + 16-bit fraction | |
ui64 = ui64 - FP64_EXPON_DELTA_19; // divide by 2^19 | |
rsqrt14 = *((double *)&ui64); // rsqrt14 in double precision format | |
dst->f = rsqrt14; | |
} else if ((arg.u & FP64_EXP_MASK) == FP64_PLUS_ZERO_AS_UINT64) { | |
// denormal; find n such that 1<=2^(2n)*x<4 | |
expon = -1022; | |
signif = arg.u & FP64_SIGNIF_MASK; | |
while ((signif & FP64_HIDDEN_BIT) == FP64_PLUS_ZERO_AS_UINT64) { | |
// normalize significand and adjust exponent | |
signif = signif << 1; | |
expon--; | |
} | |
if (expon & 0x01) { // odd exponent; scaled x will be between 2.0 and 4.0 | |
n = (expon - 1) / 2; // odd expon in -1074 -1023 | |
arg.u = FP64_PLUS_TWO_AS_UINT64 | (signif & FP64_SIGNIF_MASK); | |
// scaled x between 2.0 and 4.0 | |
RSQRT14D (mxcsr, dst, arg); | |
dst->u = dst->u - (n << 52); // rsqrt14 | |
} else { // even exponent, scaled x between 1.0 and 2.0 | |
n = expon / 2; // even expon in -149, -128 | |
arg.u = FP64_PLUS_ONE_AS_UINT64 | (signif & FP64_SIGNIF_MASK); | |
// scaled x between 1.0 and 2.0 | |
RSQRT14D (mxcsr, dst, arg); | |
dst->u = dst->u - (n << 52); // rsqrt14 | |
} | |
} else { // normal not between 1.0 and 4.0; find n such that 1<=2^(2n)*x<4 | |
expon = ((arg.u & FP64_EXP_MASK) >> 52) - 0x3ff; | |
signif = (arg.u & FP64_SIGNIF_MASK) | FP64_HIDDEN_BIT; | |
if (expon & 0x01) { // odd exponent, scaled x between 2.0 and 4.0 | |
n = (expon - 1) / 2; // odd expon in -1022, 1023 | |
arg.u = FP64_PLUS_TWO_AS_UINT64 | (signif & FP64_SIGNIF_MASK); | |
// scaled x between 2.0 and 4.0 | |
RSQRT14D (mxcsr, dst, arg); | |
dst->u = dst->u - (n << 52); // rsqrt14 | |
} else { // even exponent, scaled x between 1.0 and 2.0 | |
n = expon / 2; // even expon in -126, 127 | |
arg.u = FP64_PLUS_ONE_AS_UINT64 | (signif & FP64_SIGNIF_MASK); | |
// scaled x between 1.0 and 2.0 | |
RSQRT14D (mxcsr, dst, arg); | |
dst->u = dst->u - (n << 52); // rsqrt14 | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment