diff options
Diffstat (limited to 'externals/dynarmic/tests/rsqrt_test_fn.s')
-rw-r--r-- | externals/dynarmic/tests/rsqrt_test_fn.s | 303 |
1 files changed, 303 insertions, 0 deletions
diff --git a/externals/dynarmic/tests/rsqrt_test_fn.s b/externals/dynarmic/tests/rsqrt_test_fn.s new file mode 100644 index 0000000000..fa8dadee01 --- /dev/null +++ b/externals/dynarmic/tests/rsqrt_test_fn.s @@ -0,0 +1,303 @@ +.global _rsqrt_inaccurate +.global rsqrt_inaccurate +.global _rsqrt_full +.global rsqrt_full +.global _rsqrt_full_gpr +.global rsqrt_full_gpr +.global _rsqrt_full_nb +.global rsqrt_full_nb +.global _rsqrt_full_nb2 +.global rsqrt_full_nb2 +.global _rsqrt_full_nb_gpr +.global rsqrt_full_nb_gpr +.global _rsqrt_newton +.global rsqrt_newton +.global _rsqrt_hack +.global rsqrt_hack +.global _rsqrt_fallback + +.text +.intel_syntax noprefix + +.align 16 +min_pos_denorm: +.long 0x00800000,0,0,0 +penultimate_bit: +.long 0x00008000,0,0,0 +ultimate_bit: +.long 0x00004000,0,0,0 +top_mask: +.long 0xFFFF8000,0,0,0 +one: +.long 0x3f800000,0,0,0 +half: +.long 0x3f000000,0,0,0 +one_point_five: +.long 0x3fc00000,0,0,0 +magic1: +.long 0x60000000,0,0,0 +magic2: +.long 0x3c000000,0,0,0 +magic3: +.long 0x000047ff,0,0,0 + +_rsqrt_inaccurate: +rsqrt_inaccurate: + movd xmm0, edi + + rsqrtss xmm0, xmm0 + + movd eax, xmm0 + ret + +_rsqrt_full: +rsqrt_full: + movd xmm0, edi + + pand xmm0, [rip + top_mask] + por xmm0, [rip + penultimate_bit] + + vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm] + ptest xmm1, xmm1 + jnz rsqrt_full_bad + + sqrtss xmm0, xmm0 + + movd xmm1, [rip + one] + divss xmm1, xmm0 + + paddd xmm1, [rip + ultimate_bit] + pand xmm1, [rip + top_mask] + + movd eax, xmm1 + ret + +_rsqrt_full_gpr: +rsqrt_full_gpr: + movd eax, xmm0 # Emulate regalloc mov + + mov eax, edi + and eax, 0xFFFF8000 + or eax, 0x00008000 + + movd xmm0, eax + vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm] + ptest xmm1, xmm1 + jnz rsqrt_full_bad + + sqrtss xmm0, xmm0 + + movd xmm1, [rip + one] + divss xmm1, xmm0 + movd eax, xmm1 + + add eax, 0x00004000 + and eax, 0xffff8000 + + movd xmm0, eax # Emulate regalloc mov + ret + +_rsqrt_full_nb2: +rsqrt_full_nb2: + movd xmm0, edi + + pand xmm0, [rip + top_mask] + por xmm0, [rip + penultimate_bit] + + ucomiss xmm0, [rip + min_pos_denorm] + jna rsqrt_full_bad_new1 + + sqrtss xmm0, xmm0 + + movd xmm1, [rip + one] + divss xmm1, xmm0 + + paddd xmm1, [rip + ultimate_bit] + pand xmm1, [rip + top_mask] + + movd eax, xmm1 + ret + +_rsqrt_full_nb: +rsqrt_full_nb: + movd xmm0, edi + + pand xmm0, [rip + top_mask] + por xmm0, [rip + penultimate_bit] + + vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm] + ptest xmm1, xmm1 + jnz rsqrt_full_bad_new1 + + sqrtss xmm0, xmm0 + + movd xmm1, [rip + one] + divss xmm1, xmm0 + + paddd xmm1, [rip + ultimate_bit] + pand xmm1, [rip + top_mask] + + movd eax, xmm1 + ret + +rsqrt_full_bad_new1: + cmp edi, 0x00800000 + jb rsqrt_full_bad_new_fallback1 + + movd xmm0, edi + rsqrtss xmm1, xmm0 + + ucomiss xmm1, xmm1 + jp rsqrt_full_bad_new1_nan + + movd eax, xmm1 + ret + +rsqrt_full_bad_new_fallback1: + call _rsqrt_fallback + ret + +rsqrt_full_bad_new1_nan: + ucomiss xmm0, xmm0 + jp rsqrt_full_bad_new1_nan_ret + + mov eax, 0x7FC00000 + ret + +rsqrt_full_bad_new1_nan_ret: + ret + +_rsqrt_full_nb_gpr: +rsqrt_full_nb_gpr: + movd eax, xmm0 # Emulate regalloc mov + + mov eax, edi + and eax, 0xFFFF8000 + or eax, 0x00008000 + + movd xmm0, eax + vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm] + ptest xmm1, xmm1 + jnz rsqrt_full_bad_new2 + + sqrtss xmm0, xmm0 + + movd xmm1, [rip + one] + divss xmm1, xmm0 + movd eax, xmm1 + + add eax, 0x00004000 + and eax, 0xffff8000 + + movd xmm0, eax # Emulate regalloc mov + ret + +rsqrt_full_bad_new2: + cmp edi, 0x00800000 + jb rsqrt_full_bad_new_fallback2 + + movd xmm0, edi + rsqrtss xmm1, xmm0 + + test edi, edi + js rsqrt_full_bad_new2_nan + + movd eax, xmm1 + ret + +rsqrt_full_bad_new_fallback2: + call _rsqrt_fallback + ret + +rsqrt_full_bad_new2_nan: + mov eax, 0x7FC00000 + ret + +rsqrt_full_bad: + xorps xmm1, xmm1 + movd xmm0, edi + ucomiss xmm0, xmm1 + jp rsqrt_full_nan + je rsqrt_full_zero + jc rsqrt_full_neg + + cmp edi, 0x7F800000 + je rsqrt_full_inf + + # TODO: Full Denormal Implementation + call _rsqrt_fallback + ret + +rsqrt_full_neg: + mov eax, 0x7FC00000 + ret + +rsqrt_full_inf: + xor eax, eax + ret + +rsqrt_full_nan: + mov eax, edi + or eax, 0x00400000 + ret + +rsqrt_full_zero: + mov eax, edi + or eax, 0x7F800000 + ret + +_rsqrt_newton: +rsqrt_newton: + movd xmm0, edi + + pand xmm0, [rip + top_mask] + por xmm0, [rip + penultimate_bit] + + vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm] + ptest xmm1, xmm1 + jnz rsqrt_full_bad + + rsqrtps xmm1, xmm0 + mulss xmm0, [rip + half] + vmulss xmm2, xmm1, xmm1 + mulss xmm2, xmm0 + movaps xmm0, [rip + one_point_five] + subss xmm0, xmm2 + mulss xmm0, xmm1 + + paddd xmm0, [rip + ultimate_bit] + pand xmm0, [rip + top_mask] + + movd eax, xmm0 + ret + +_rsqrt_hack: +rsqrt_hack: + movd xmm9, edi + + vpand xmm0, xmm9, [rip + top_mask] + por xmm0, [rip + penultimate_bit] + + # detect NaNs, negatives, zeros, denormals and infinities + vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm] + ptest xmm1, xmm1 + jnz rsqrt_full_bad + + # calculate x64 estimate + rsqrtps xmm0, xmm0 + + # calculate correction factor + vpslld xmm1, xmm9, 8 + vpsrad xmm2, xmm1, 31 + paddd xmm1, [rip + magic1] + pcmpgtd xmm1, [rip + magic2] + pxor xmm1, xmm2 + movaps xmm2, [rip + magic3] + psubd xmm2, xmm1 + + # correct x64 estimate + paddd xmm0, xmm2 + pand xmm0, [rip + top_mask] + + movd eax, xmm0 + ret |