Created
March 21, 2012 04:37
-
-
Save rygorous/2144419 to your computer and use it in GitHub Desktop.
half->float using SSE2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; input: 4x F16 in XMM0 (low word of each DWord)
; original idea + implementation by Dean Macri
; WARNING: copied & pasted together from other code; this version is untested!
; (The original version was definitely correct, though.)
bits 32

; Constant pool for the half -> float conversion.
;
; Every constant here is consumed as a 128-bit memory operand of a legacy
; (non-VEX) SSE instruction, which requires a 16-byte aligned address.
; The object format's default .data alignment may be smaller than that
; (e.g. 4 bytes for ELF32), so the alignment is requested explicitly.
; Each entry is a multiple of 16 bytes long, so aligning the first one
; keeps all of them aligned.
section .data align=16

        align   16

; --- generic FP32 masks -------------------------------------------------
FP32_no_sign                times 4 dd 0x7FFFFFFF   ; everything but the sign bit (abs mask)
FP32_sign_bit               times 4 dd 0x80000000   ; sign bit only

; --- sign relocation (after the F16 bits were shifted left by 13) --------
; The F16 sign sits in bit 28 at that point. Adding 0x70000000 carries it
; into bit 31; the AND then clears the leftover bits 30..28.
FP16_FP32_sgn_adj           times 4 dd 0x70000000
FP16_FP32_sgn_adj2          times 4 dd 0x8FFFFFFF

; --- exponent handling ---------------------------------------------------
FP16_FP32_denorm            times 4 dd 0x38000000   ; 2^-15 as an FP32 value (denormal threshold)
FP16_FP32_denorm_adj        times 4 dd 0x00800000   ; one step in the FP32 exponent field (x2)
FP16_FP32_exp_adj           times 4 dd 0x38000000   ; (127 - 15) << 23: exponent re-bias
FP16_exp_shifted            times 4 dd 0x0F800000   ; 5-bit F16 exponent field, shifted to bits 27..23
FP16_exp_adjust_for_NaN     times 4 dd 0x7F800000   ; all-ones FP32 exponent (Inf/NaN)
FP16_exp_adjust_for_Zero    times 4 dd 0x7F800000   ; FP32 exponent field mask

; --- sign-aware 2^-15 table ----------------------------------------------
; 16 entries of 16 bytes each, indexed by (movmskps result) * 16.
; Lane i of entry k holds -2^-15 (0xB8000000) when bit i of k is set,
; otherwise +2^-15 (0x38000000).
FP16_FP32_exp_adj_for_Zero:
        dd 0x38000000, 0x38000000, 0x38000000, 0x38000000   ; 0000
        dd 0xB8000000, 0x38000000, 0x38000000, 0x38000000   ; 0001
        dd 0x38000000, 0xB8000000, 0x38000000, 0x38000000   ; 0010
        dd 0xB8000000, 0xB8000000, 0x38000000, 0x38000000   ; 0011
        dd 0x38000000, 0x38000000, 0xB8000000, 0x38000000   ; 0100
        dd 0xB8000000, 0x38000000, 0xB8000000, 0x38000000   ; 0101
        dd 0x38000000, 0xB8000000, 0xB8000000, 0x38000000   ; 0110
        dd 0xB8000000, 0xB8000000, 0xB8000000, 0x38000000   ; 0111
        dd 0x38000000, 0x38000000, 0x38000000, 0xB8000000   ; 1000
        dd 0xB8000000, 0x38000000, 0x38000000, 0xB8000000   ; 1001
        dd 0x38000000, 0xB8000000, 0x38000000, 0xB8000000   ; 1010
        dd 0xB8000000, 0xB8000000, 0x38000000, 0xB8000000   ; 1011
        dd 0x38000000, 0x38000000, 0xB8000000, 0xB8000000   ; 1100
        dd 0xB8000000, 0x38000000, 0xB8000000, 0xB8000000   ; 1101
        dd 0x38000000, 0xB8000000, 0xB8000000, 0xB8000000   ; 1110
        dd 0xB8000000, 0xB8000000, 0xB8000000, 0xB8000000   ; 1111
section .text

;-----------------------------------------------------------------------
; f16tof32 - convert four half-precision floats to single precision,
;            using SSE2 integer and SSE float instructions only.
;
; In:    xmm0 = 4x F16, one in the low word of each dword.
;               The high word of each dword is assumed to be zero: the
;               sign fix-up below lets bits 16..18 leak into the result.
; Out:   xmm0 = 4x F32
; Clobb: eax, edx, xmm1, xmm2, flags
;
; All memory operands are 128-bit constants that must be 16-byte aligned.
;
; NOTE(review): the zero path relies on (x - x) yielding +0, which does
; not hold when MXCSR selects round-toward-negative - confirm that
; callers run with the default rounding mode.
;-----------------------------------------------------------------------
f16tof32:
        ; Shift the mantissa to the correct place (bit 23 in F32 from bit 10 in F16)
        pslld   xmm0, 13

        ; Get the sign bit set appropriately: the add carries the sign from
        ; bit 28 up into bit 31, the mask clears the leftover bits 30..28.
        paddd   xmm0, [FP16_FP32_sgn_adj]
        pand    xmm0, [FP16_FP32_sgn_adj2]

        ; Save copy (still holds the un-biased F16 exponent), adjust exponent
        movdqa  xmm1, xmm0
        paddd   xmm0, [FP16_FP32_exp_adj]

        ; Check for NaNs, inf: an all-ones F16 exponent forces an
        ; all-ones F32 exponent.
        pand    xmm1, [FP16_exp_shifted]
        pcmpeqd xmm1, [FP16_exp_shifted]
        pand    xmm1, [FP16_exp_adjust_for_NaN]
        por     xmm0, xmm1

        lea     edx, [FP16_FP32_exp_adj_for_Zero]

        ; Check for zeros/denorms. This is a pain. We need to
        ; figure out which FP16 values had a zero value for the biased
        ; exponent. THEN, we have to subtract away the new exponent,
        ; so that if we had a denorm originally, we'll get rid of the
        ; implicit one we created in the FP32 format.
        ;
        movmskps eax, xmm0                      ; eax = 4-bit mask of lane signs
        movdqa  xmm1, xmm0
        shl     eax, 4                          ; table entries are 16 bytes each
        pand    xmm1, [FP16_exp_adjust_for_Zero]
        pcmpeqd xmm1, [FP16_FP32_exp_adj]       ; lanes whose F16 exponent was 0
        pand    xmm1, [edx + eax]               ; xmm1 = +/-2^-15 in those lanes, else 0

        ; Subtract off the implicit 1 if we had a denorm, make the value
        ; zero if it should be zero. Unfortunately, negative zero becomes
        ; positive, so we have to put the sign back.
        ;
        subps   xmm0, xmm1

        ; Find the values < 2^(-15) (Denorms) and bump their exponent by one
        movaps  xmm2, [FP32_no_sign]
        andps   xmm2, xmm0                      ; xmm2 = |xmm0|
        cmpleps xmm2, [FP16_FP32_denorm]
        andps   xmm2, [FP16_FP32_denorm_adj]
        paddd   xmm0, xmm2

        ; Get the zeros back: a true zero now reads exactly 0x00800000
        ; because of the exponent bump above; clear those lanes.
        movdqa  xmm2, xmm0
        pcmpeqd xmm0, [FP16_FP32_denorm_adj]
        andnps  xmm0, xmm2

        ; Put the sign bits back (xmm1 still carries the sign of every
        ; zero/denorm lane)
        pand    xmm1, [FP32_sign_bit]
        por     xmm0, xmm1

        ; All done!
        ret
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment