about | summary | refs | log | tree | commit | diff
path: root/externals/dynarmic/tests/rsqrt_test_fn.s
diff options
context:
space:
mode:
Diffstat (limited to 'externals/dynarmic/tests/rsqrt_test_fn.s')
-rw-r--r-- externals/dynarmic/tests/rsqrt_test_fn.s | 303
1 files changed, 303 insertions, 0 deletions
diff --git a/externals/dynarmic/tests/rsqrt_test_fn.s b/externals/dynarmic/tests/rsqrt_test_fn.s
new file mode 100644
index 0000000000..fa8dadee01
--- /dev/null
+++ b/externals/dynarmic/tests/rsqrt_test_fn.s
@@ -0,0 +1,303 @@
+# Exported entry points. Each routine is exported twice, once with and once
+# without a leading underscore, and both labels refer to the same code.
+# Every routine in this file reads its input from edi (raw 32-bit pattern)
+# and leaves its result in eax (raw 32-bit pattern).
+# NOTE(review): edi as first argument suggests the System V AMD64 calling
+# convention - confirm against the C harness that calls these.
+.global _rsqrt_inaccurate
+.global rsqrt_inaccurate
+.global _rsqrt_full
+.global rsqrt_full
+.global _rsqrt_full_gpr
+.global rsqrt_full_gpr
+.global _rsqrt_full_nb
+.global rsqrt_full_nb
+.global _rsqrt_full_nb2
+.global rsqrt_full_nb2
+.global _rsqrt_full_nb_gpr
+.global rsqrt_full_nb_gpr
+.global _rsqrt_newton
+.global rsqrt_newton
+.global _rsqrt_hack
+.global rsqrt_hack
+# _rsqrt_fallback is not defined in this file; it is called by the slow paths.
+.global _rsqrt_fallback
+
+# Everything below (constants and code) is emitted into the text section and
+# is written in Intel operand order without register prefixes.
+.text
+.intel_syntax noprefix
+
+# Constant table. Each entry is 16 bytes: the value in the low 32-bit lane
+# and zeros in the three upper lanes.
+#
+# Several entries are used as 128-bit memory operands of legacy-SSE
+# instructions (pand, por, paddd, pcmpgtd, movaps), which fault on operands
+# that are not 16-byte aligned. Because every entry is exactly 16 bytes,
+# aligning the start of the table aligns all of them.
+#
+# .p2align 4 is used instead of .align 16: .align takes a byte count on ELF
+# x86 targets but a power-of-two exponent on Mach-O targets, whereas
+# .p2align 4 means 16 bytes everywhere.
+.p2align 4
+# 0x00800000 is the smallest positive *normal* float (the label name is
+# historical); inputs must compare greater than this to take the fast path.
+min_pos_denorm:
+.long 0x00800000,0,0,0
+# Bit 15: forced on in the input before it is used.
+penultimate_bit:
+.long 0x00008000,0,0,0
+# Bit 14: added to the result pattern before masking, i.e. half of the
+# 0x8000 step that top_mask rounds to.
+ultimate_bit:
+.long 0x00004000,0,0,0
+# Keeps bits 31..15 and clears bits 14..0.
+top_mask:
+.long 0xFFFF8000,0,0,0
+# 1.0f
+one:
+.long 0x3f800000,0,0,0
+# 0.5f
+half:
+.long 0x3f000000,0,0,0
+# 1.5f
+one_point_five:
+.long 0x3fc00000,0,0,0
+# magic1..magic3 are only used by rsqrt_hack to build its correction term.
+# NOTE(review): how these values were derived is not shown here.
+magic1:
+.long 0x60000000,0,0,0
+magic2:
+.long 0x3c000000,0,0,0
+magic3:
+.long 0x000047ff,0,0,0
+
+# rsqrt_inaccurate: raw hardware reciprocal-square-root approximation.
+# In:  edi = raw bits of the input float
+# Out: eax = raw bits of rsqrtss(input), with no special-case handling and
+#      no rounding of the result pattern
+# Clobbers: xmm0
+_rsqrt_inaccurate:
+rsqrt_inaccurate:
+ movd xmm0, edi
+
+ rsqrtss xmm0, xmm0
+
+ movd eax, xmm0
+ ret
+
+# rsqrt_full: 1/sqrt(x) via sqrtss + divss, with both the input and the
+# result reduced to the top 17 bits of the pattern (mask 0xFFFF8000).
+# In:  edi = raw bits of the input float
+# Out: eax = raw bits of the result
+# Clobbers: xmm0, xmm1, flags
+# Inputs that fail the range check are handed to rsqrt_full_bad, which
+# re-reads the original value from edi.
+# NOTE(review): the truncate/round scheme looks like it is meant to mimic a
+# reduced-precision estimate instruction of the emulated CPU - confirm.
+_rsqrt_full:
+rsqrt_full:
+ movd xmm0, edi
+
+ # Clear bits 14..0 and force bit 15 on, so the low 16 bits become 0x8000.
+ pand xmm0, [rip + top_mask]
+ por xmm0, [rip + penultimate_bit]
+
+ # Lane 0 of xmm1 becomes all-ones when the adjusted input is NOT greater
+ # than 0x00800000 or the comparison is unordered (NaN). The upper lanes are
+ # copied from xmm0 and are zero here, so ptest only sees lane 0.
+ vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
+ ptest xmm1, xmm1
+ jnz rsqrt_full_bad
+
+ sqrtss xmm0, xmm0
+
+ # xmm1 = 1.0f / sqrt(x)
+ movd xmm1, [rip + one]
+ divss xmm1, xmm0
+
+ # Integer add of 0x4000 followed by the 0xFFFF8000 mask rounds the result
+ # pattern to the nearest multiple of 0x8000.
+ paddd xmm1, [rip + ultimate_bit]
+ pand xmm1, [rip + top_mask]
+
+ movd eax, xmm1
+ ret
+
+# rsqrt_full_gpr: same computation as rsqrt_full, but the input masking and
+# the final rounding are done with integer instructions on eax instead of
+# SSE integer instructions on an xmm register.
+# In:  edi = raw bits of the input float
+# Out: eax = raw bits of the result
+# Clobbers: xmm0, xmm1, flags
+# The first and last movd are marked as emulating register-allocator moves:
+# the first one's result is overwritten by the next instruction, and the
+# last one copies eax into xmm0 without changing eax.
+_rsqrt_full_gpr:
+rsqrt_full_gpr:
+ movd eax, xmm0 # Emulate regalloc mov
+
+ # Clear bits 14..0 and force bit 15 on.
+ mov eax, edi
+ and eax, 0xFFFF8000
+ or eax, 0x00008000
+
+ # Take the slow path unless the adjusted input is greater than 0x00800000
+ # (the predicate is also true for unordered/NaN comparisons).
+ movd xmm0, eax
+ vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
+ ptest xmm1, xmm1
+ jnz rsqrt_full_bad
+
+ sqrtss xmm0, xmm0
+
+ # xmm1 = 1.0f / sqrt(x)
+ movd xmm1, [rip + one]
+ divss xmm1, xmm0
+ movd eax, xmm1
+
+ # Round the result pattern to the nearest multiple of 0x8000.
+ add eax, 0x00004000
+ and eax, 0xffff8000
+
+ movd xmm0, eax # Emulate regalloc mov
+ ret
+
+# rsqrt_full_nb2: like rsqrt_full_nb, but the range check is a ucomiss plus a
+# flag branch instead of vcmpngt_uqss + ptest.
+# In:  edi = raw bits of the input float
+# Out: eax = raw bits of the result
+# Clobbers: xmm0, xmm1, flags
+# Slow path: rsqrt_full_bad_new1 (re-reads the original value from edi).
+_rsqrt_full_nb2:
+rsqrt_full_nb2:
+ movd xmm0, edi
+
+ # Clear bits 14..0 and force bit 15 on.
+ pand xmm0, [rip + top_mask]
+ por xmm0, [rip + penultimate_bit]
+
+ # jna is taken when CF or ZF is set; ucomiss sets them for "less", "equal"
+ # and "unordered", so only inputs greater than 0x00800000 fall through.
+ ucomiss xmm0, [rip + min_pos_denorm]
+ jna rsqrt_full_bad_new1
+
+ sqrtss xmm0, xmm0
+
+ # xmm1 = 1.0f / sqrt(x)
+ movd xmm1, [rip + one]
+ divss xmm1, xmm0
+
+ # Round the result pattern to the nearest multiple of 0x8000.
+ paddd xmm1, [rip + ultimate_bit]
+ pand xmm1, [rip + top_mask]
+
+ movd eax, xmm1
+ ret
+
+# rsqrt_full_nb: same fast path as rsqrt_full, but out-of-range inputs are
+# sent to rsqrt_full_bad_new1 instead of rsqrt_full_bad.
+# In:  edi = raw bits of the input float
+# Out: eax = raw bits of the result
+# Clobbers: xmm0, xmm1, flags
+_rsqrt_full_nb:
+rsqrt_full_nb:
+ movd xmm0, edi
+
+ # Clear bits 14..0 and force bit 15 on.
+ pand xmm0, [rip + top_mask]
+ por xmm0, [rip + penultimate_bit]
+
+ # Slow path when the adjusted input is not greater than 0x00800000 or the
+ # comparison is unordered (NaN).
+ vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
+ ptest xmm1, xmm1
+ jnz rsqrt_full_bad_new1
+
+ sqrtss xmm0, xmm0
+
+ # xmm1 = 1.0f / sqrt(x)
+ movd xmm1, [rip + one]
+ divss xmm1, xmm0
+
+ # Round the result pattern to the nearest multiple of 0x8000.
+ paddd xmm1, [rip + ultimate_bit]
+ pand xmm1, [rip + top_mask]
+
+ movd eax, xmm1
+ ret
+
+# rsqrt_full_bad_new1: slow path shared by rsqrt_full_nb and rsqrt_full_nb2.
+# In:  edi = raw bits of the original (unmasked) input float
+# Out: eax = raw bits of the result
+# Clobbers: xmm0, xmm1, flags
+#
+#   edi below 0x00800000 (unsigned)  -> tail call to _rsqrt_fallback
+#   rsqrtss result is not a NaN      -> that result
+#   rsqrtss result is a NaN:
+#       input was not a NaN          -> 0x7FC00000
+#       input was a NaN              -> the NaN produced by rsqrtss
+rsqrt_full_bad_new1:
+ # Unsigned compare: only +0 and positive denormals are below 0x00800000;
+ # anything with the sign bit set compares above it.
+ cmp edi, 0x00800000
+ jb rsqrt_full_bad_new_fallback1
+
+ movd xmm0, edi
+ rsqrtss xmm1, xmm0
+
+ # PF is set by ucomiss only when the comparison is unordered, so comparing
+ # a register with itself is a NaN test.
+ ucomiss xmm1, xmm1
+ jp rsqrt_full_bad_new1_nan
+
+ movd eax, xmm1
+ ret
+
+rsqrt_full_bad_new_fallback1:
+ # Tail call. edi is still the untouched argument, and the fallback returns
+ # straight to our caller. A jmp (unlike call) keeps the stack pointer
+ # exactly as it was on entry, so the callee sees the stack alignment the
+ # calling convention promises.
+ jmp _rsqrt_fallback
+
+rsqrt_full_bad_new1_nan:
+ ucomiss xmm0, xmm0
+ jp rsqrt_full_bad_new1_nan_ret
+
+ # The NaN was generated by rsqrtss from a non-NaN input.
+ mov eax, 0x7FC00000
+ ret
+
+rsqrt_full_bad_new1_nan_ret:
+ # The input itself was a NaN: return the NaN rsqrtss produced from it.
+ # (eax has not been written anywhere on this path before this point.)
+ movd eax, xmm1
+ ret
+
+# rsqrt_full_nb_gpr: same fast path as rsqrt_full_gpr (masking and rounding
+# done in eax), but out-of-range inputs go to rsqrt_full_bad_new2.
+# In:  edi = raw bits of the input float
+# Out: eax = raw bits of the result
+# Clobbers: xmm0, xmm1, flags
+# The first and last movd are marked as emulating register-allocator moves
+# and do not affect the value returned in eax.
+_rsqrt_full_nb_gpr:
+rsqrt_full_nb_gpr:
+ movd eax, xmm0 # Emulate regalloc mov
+
+ # Clear bits 14..0 and force bit 15 on.
+ mov eax, edi
+ and eax, 0xFFFF8000
+ or eax, 0x00008000
+
+ # Slow path when the adjusted input is not greater than 0x00800000 or the
+ # comparison is unordered (NaN).
+ movd xmm0, eax
+ vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
+ ptest xmm1, xmm1
+ jnz rsqrt_full_bad_new2
+
+ sqrtss xmm0, xmm0
+
+ # xmm1 = 1.0f / sqrt(x)
+ movd xmm1, [rip + one]
+ divss xmm1, xmm0
+ movd eax, xmm1
+
+ # Round the result pattern to the nearest multiple of 0x8000.
+ add eax, 0x00004000
+ and eax, 0xffff8000
+
+ movd xmm0, eax # Emulate regalloc mov
+ ret
+
+# rsqrt_full_bad_new2: slow path for rsqrt_full_nb_gpr.
+# In:  edi = raw bits of the original (unmasked) input float
+# Out: eax = raw bits of the result
+# Clobbers: xmm0, xmm1, flags
+#
+#   edi below 0x00800000 (unsigned)  -> tail call to _rsqrt_fallback
+#   sign bit of edi set              -> 0x7FC00000
+#   otherwise                        -> rsqrtss(input)
+#
+# NOTE(review): the sign-bit test also catches -0.0 and NaNs with the sign
+# bit set, for which rsqrt_full_bad returns different values (-inf and the
+# quieted input respectively). Confirm whether that difference is intended.
+rsqrt_full_bad_new2:
+ # Unsigned compare: only +0 and positive denormals are below 0x00800000.
+ cmp edi, 0x00800000
+ jb rsqrt_full_bad_new_fallback2
+
+ movd xmm0, edi
+ rsqrtss xmm1, xmm0
+
+ test edi, edi
+ js rsqrt_full_bad_new2_nan
+
+ movd eax, xmm1
+ ret
+
+rsqrt_full_bad_new_fallback2:
+ # Tail call. edi is still the untouched argument, and the fallback returns
+ # straight to our caller. A jmp (unlike call) keeps the stack pointer
+ # exactly as it was on entry, so the callee sees the stack alignment the
+ # calling convention promises.
+ jmp _rsqrt_fallback
+
+rsqrt_full_bad_new2_nan:
+ mov eax, 0x7FC00000
+ ret
+
+# rsqrt_full_bad: slow path for rsqrt_full, rsqrt_full_gpr, rsqrt_newton and
+# rsqrt_hack. Classifies the original input and returns a fixed result for
+# each special class.
+# In:  edi = raw bits of the original (unmasked) input float
+# Out: eax = raw bits of the result
+# Clobbers: xmm0, xmm1, flags
+#
+#   NaN            -> input with bit 22 (0x00400000) set
+#   +0 / -0        -> input with the exponent bits set (0x7F800000 or'ed in)
+#   less than zero -> 0x7FC00000
+#   0x7F800000     -> 0
+#   anything else  -> tail call to _rsqrt_fallback
+rsqrt_full_bad:
+ xorps xmm1, xmm1
+ movd xmm0, edi
+ # Compare the input against 0.0: PF = unordered, ZF = equal, CF = below.
+ # The order of the branches matters because an unordered compare sets all
+ # three flags.
+ ucomiss xmm0, xmm1
+ jp rsqrt_full_nan
+ je rsqrt_full_zero
+ jc rsqrt_full_neg
+
+ cmp edi, 0x7F800000
+ je rsqrt_full_inf
+
+ # TODO: Full Denormal Implementation
+ # Tail call. edi is still the untouched argument, and the fallback returns
+ # straight to our caller. A jmp (unlike call) keeps the stack pointer
+ # exactly as it was on entry, so the callee sees the stack alignment the
+ # calling convention promises.
+ jmp _rsqrt_fallback
+
+rsqrt_full_neg:
+ mov eax, 0x7FC00000
+ ret
+
+rsqrt_full_inf:
+ xor eax, eax
+ ret
+
+rsqrt_full_nan:
+ mov eax, edi
+ or eax, 0x00400000
+ ret
+
+rsqrt_full_zero:
+ # Keeps the sign bit of the zero and sets all exponent bits.
+ mov eax, edi
+ or eax, 0x7F800000
+ ret
+
+# rsqrt_newton: hardware rsqrt approximation refined by one iteration of
+# y1 = y0 * (1.5 - 0.5 * x * y0 * y0), then rounded like rsqrt_full.
+# In:  edi = raw bits of the input float
+# Out: eax = raw bits of the result
+# Clobbers: xmm0, xmm1, xmm2, flags
+# Slow path: rsqrt_full_bad (re-reads the original value from edi).
+_rsqrt_newton:
+rsqrt_newton:
+ movd xmm0, edi
+
+ # Clear bits 14..0 and force bit 15 on.
+ pand xmm0, [rip + top_mask]
+ por xmm0, [rip + penultimate_bit]
+
+ # Slow path when the adjusted input is not greater than 0x00800000 or the
+ # comparison is unordered (NaN).
+ vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
+ ptest xmm1, xmm1
+ jnz rsqrt_full_bad
+
+ # xmm1 = y0 (packed form; only lane 0 is used for the result)
+ rsqrtps xmm1, xmm0
+ # xmm0 = 0.5 * x
+ mulss xmm0, [rip + half]
+ # xmm2 = y0 * y0, then xmm2 = 0.5 * x * y0 * y0
+ vmulss xmm2, xmm1, xmm1
+ mulss xmm2, xmm0
+ # xmm0 = 1.5 - xmm2, then xmm0 = y0 * (1.5 - 0.5 * x * y0 * y0)
+ movaps xmm0, [rip + one_point_five]
+ subss xmm0, xmm2
+ mulss xmm0, xmm1
+
+ # Round the result pattern to the nearest multiple of 0x8000.
+ paddd xmm0, [rip + ultimate_bit]
+ pand xmm0, [rip + top_mask]
+
+ movd eax, xmm0
+ ret
+
+# rsqrt_hack: hardware rsqrt approximation whose bit pattern is adjusted by
+# an integer correction term derived from the input bits, then masked with
+# 0xFFFF8000. No multiply, divide or sqrt is used.
+# In:  edi = raw bits of the input float
+# Out: eax = raw bits of the result
+# Clobbers: xmm0, xmm1, xmm2, xmm9, flags
+# Slow path: rsqrt_full_bad (re-reads the original value from edi).
+# NOTE(review): the derivation of magic1/magic2/magic3 is not shown in this
+# file; the comments below only describe what the instructions do.
+_rsqrt_hack:
+rsqrt_hack:
+ # xmm9 keeps the unmodified input for the correction term below.
+ movd xmm9, edi
+
+ # xmm0 = input with bits 14..0 cleared and bit 15 forced on.
+ vpand xmm0, xmm9, [rip + top_mask]
+ por xmm0, [rip + penultimate_bit]
+
+ # detect NaNs, negatives, zeros, denormals and infinities
+ vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
+ ptest xmm1, xmm1
+ jnz rsqrt_full_bad
+
+ # calculate x64 estimate
+ rsqrtps xmm0, xmm0
+
+ # calculate correction factor
+ # xmm1 = input << 8: the lowest exponent bit moves to bit 31, the mantissa
+ # to bits 30..8.
+ vpslld xmm1, xmm9, 8
+ # xmm2 = all-ones if that lowest exponent bit is set, else zero.
+ vpsrad xmm2, xmm1, 31
+ # xmm1 = all-ones if (xmm1 + magic1) > magic2 as signed integers, else
+ # zero; the result is then inverted where xmm2 is all-ones.
+ paddd xmm1, [rip + magic1]
+ pcmpgtd xmm1, [rip + magic2]
+ pxor xmm1, xmm2
+ # xmm2 = magic3 - xmm1, i.e. 0x47ff when the mask is zero and 0x4800 when
+ # it is all-ones (-1).
+ movaps xmm2, [rip + magic3]
+ psubd xmm2, xmm1
+
+ # correct x64 estimate
+ paddd xmm0, xmm2
+ pand xmm0, [rip + top_mask]
+
+ movd eax, xmm0
+ ret